pdf 处理

2025-04-20 01:34:55 +08:00 · 2023-07-16 23:06:51 +08:00 · 2023-07-16 23:06:51 +08:00 · c2e2d356d6
commit c2e2d356d6
parent 0400aaeebe
3 changed files with 34 additions and 0 deletions
--- a/pdf处理/main.py
+++ b/pdf处理/main.py
--- a/pdf处理/program/init.py
+++ b/pdf处理/program/init.py
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@ -0,0 +1,34 @@
+import pdfplumber
+
+
+class Discern(object):
+
+    def __init__(self):
+        self.xlsx_table = {}
+
+    def pdf_text(self, path_pdf):
+        with pdfplumber.open(path_pdf) as pdf:
+            # 遍历每个页面
+            for page in pdf.pages:
+                # 提取页面文本
+                text = page.extract_text()
+                lines = text.split("\n")
+                print(lines)
+                for line in lines:
+                    # print(line)
+                    if '报告编号:' in line:
+                        self.xlsx_table['报告编号'] = line.split("报告编号")[1].strip()
+                    if '样品名称:' in line:
+                        self.xlsx_table['样品名称'] = line.split("样品名称")[1].strip()
+                    if '公司' in line and '中检华通' not in line:
+                        print(line)
+                        self.xlsx_table['公司名称'] = line.strip()
+            print(self.xlsx_table)
+
+    def run(self):
+        self.pdf_text('../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验（中文）.pdf')
+
+
+if __name__ == '__main__':
+    discern = Discern()
+    discern.run()