pdf处理

2025-04-16 16:07:15 +08:00 · 2023-07-18 02:09:24 +08:00 · 2023-07-18 02:09:24 +08:00 · 5b0e238840
commit 5b0e238840
parent c1afbed128
1 changed files with 27 additions and 12 deletions
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@ -1,32 +1,47 @@
 import pdfplumber
+from PIL import Image


 class Discern(object):

    def __init__(self):
-        self.xlsx_table = {}
+        self.xlsx_keys = {}

-    def pdf_text(self, path_pdf):
-        with pdfplumber.open(path_pdf) as pdf:
+    def pdf_text(self, pdf_path):
+        with pdfplumber.open(pdf_path) as pdf:
            # 遍历每个页面
            for page in pdf.pages:
                # 提取页面文本
                text = page.extract_text()
                lines = text.split("\n")
-                print(lines)
+                line_str = ''
                for line in lines:
-                    # print(line)
+                    line_str += line
                    if '报告编号:' in line:
-                        self.xlsx_table['报告编号'] = line.split("报告编号")[1].strip()
+                        self.xlsx_keys['报告编号'] = line.split("报告编号")[1].strip()
                    if '样品名称:' in line:
-                        self.xlsx_table['样品名称'] = line.split("样品名称")[1].strip()
-                    if '公司' in line and '中检华通' not in line:
-                        print(line)
-                        self.xlsx_table['公司名称'] = line.strip()
-            print(self.xlsx_table)
+                        self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip()
+                    if '公司' in line and '中检华通' not in line and '制造商' not in line:
+                        self.xlsx_keys['公司名称'] = line.strip()
+                    if '最终报告' in line:
+                        self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
+
+    def pdf_images(self, pdf_path):
+        i = 0
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                images = page.images
+                for img in images:
+                    # 获取图片的二进制流
+                    i += 1
+                    image_file = f"../img/image_{i}.png"
+                    with open(image_file, "wb") as f:
+                        f.write(img['stream'].get_data())

    def run(self):
-        self.pdf_text('../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验（中文）.pdf')
+        self.pdf_text('../docs/2.pdf')
+        self.pdf_images('../docs/2.pdf')
+        print(self.xlsx_keys)


 if __name__ == '__main__':