pdf处理

2025-04-20 10:25:01 +08:00 · 2023-07-18 02:09:24 +08:00 · 2023-07-18 02:09:24 +08:00 · 5b0e238840
commit 5b0e238840
parent c1afbed128
1 changed file with 27 additions and 12 deletions
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@@ -1,32 +1,47 @@
 import pdfplumber
 from PIL import Image
 class Discern(object):
    def __init__(self):
-        self.xlsx_table = {}
+        self.xlsx_keys = {}
-    def pdf_text(self, path_pdf):
+    def pdf_text(self, pdf_path):
-        with pdfplumber.open(path_pdf) as pdf:
+        with pdfplumber.open(pdf_path) as pdf:
            # 遍历每个页面
            for page in pdf.pages:
                # 提取页面文本
                text = page.extract_text()
                lines = text.split("\n")
-                print(lines)
+                line_str = ''
                for line in lines:
-                    # print(line)
+                    line_str += line
                    if '报告编号:' in line:
-                        self.xlsx_table['报告编号'] = line.split("报告编号")[1].strip()
+                        self.xlsx_keys['报告编号'] = line.split("报告编号")[1].strip()
                    if '样品名称:' in line:
-                        self.xlsx_table['样品名称'] = line.split("样品名称")[1].strip()
+                        self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip()
-                    if '公司' in line and '中检华通' not in line:
+                    if '公司' in line and '中检华通' not in line and '制造商' not in line:
-                        print(line)
+                        self.xlsx_keys['公司名称'] = line.strip()
-                        self.xlsx_table['公司名称'] = line.strip()
+                    if '最终报告' in line:
-            print(self.xlsx_table)
+                        self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
    def pdf_images(self, pdf_path):
        i = 0
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                images = page.images
                for img in images:
                    # 获取图片的二进制流
                    i += 1
                    image_file = f"../img/image_{i}.png"
                    with open(image_file, "wb") as f:
                        f.write(img['stream'].get_data())
    def run(self):
-        self.pdf_text('../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验（中文）.pdf')
+        self.pdf_text('../docs/2.pdf')
        self.pdf_images('../docs/2.pdf')
        print(self.xlsx_keys)
 if __name__ == '__main__':