From 5b0e2388407e4a2f0d544ef627cd3c82f4cb8592 Mon Sep 17 00:00:00 2001
From: luzhisheng <aiyingfeng110@qq.com>
Date: Tue, 18 Jul 2023 02:09:24 +0800
Subject: [PATCH] =?UTF-8?q?pdf=E5=A4=84=E7=90=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pdf处理/program/discern.py | 39 ++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/pdf处理/program/discern.py b/pdf处理/program/discern.py
index 9a42dc7..9b5214d 100644
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@@ -1,32 +1,47 @@
 import pdfplumber
+from PIL import Image
 
 
 class Discern(object):
 
     def __init__(self):
-        self.xlsx_table = {}
+        self.xlsx_keys = {}  # extracted field name -> value; filled in by pdf_text()
 
-    def pdf_text(self, path_pdf):
-        with pdfplumber.open(path_pdf) as pdf:
+    def pdf_text(self, pdf_path):  # scan every page's text and fill self.xlsx_keys
+        with pdfplumber.open(pdf_path) as pdf:
             # 遍历每个页面
             for page in pdf.pages:
                 # 提取页面文本
                 text = page.extract_text()
                 lines = text.split("\n")
-                print(lines)
+                line_str = ''  # this page's lines seen so far, joined without separators
                 for line in lines:
-                    # print(line)
+                    line_str += line
                     if '报告编号:' in line:
-                        self.xlsx_table['报告编号'] = line.split("报告编号")[1].strip()
+                        self.xlsx_keys['报告编号'] = line.split("报告编号:")[1].strip()  # split after the colon so it is not kept in the value
                     if '样品名称:' in line:
-                        self.xlsx_table['样品名称'] = line.split("样品名称")[1].strip()
-                    if '公司' in line and '中检华通' not in line:
-                        print(line)
-                        self.xlsx_table['公司名称'] = line.strip()
-            print(self.xlsx_table)
+                        self.xlsx_keys['样品名称'] = line.split("样品名称:")[1].strip()  # same: drop the "key:" marker, colon included
+                    if '公司' in line and '中检华通' not in line and '制造商' not in line:
+                        self.xlsx_keys['公司名称'] = line.strip()
+                    if '最终报告' in line:
+                        self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')  # page text up to this line, marker removed
+
+    def pdf_images(self, pdf_path):  # write every entry of each page's `images` list to a file
+        i = 0  # running image number across all pages; used in the output file name
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages:
+                images = page.images
+                for img in images:
+                    # Get the image's binary stream
+                    i += 1
+                    image_file = f"../img/image_{i}.png"  # NOTE(review): relative path; open() fails if ../img/ does not exist
+                    with open(image_file, "wb") as f:
+                        f.write(img['stream'].get_data())  # NOTE(review): bytes written as-is; may not be PNG data despite the name - confirm
 
     def run(self):
-        self.pdf_text('../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验（中文）.pdf')
+        self.pdf_text('../docs/2.pdf')  # extract the text fields into self.xlsx_keys
+        self.pdf_images('../docs/2.pdf')  # write the same PDF's images under ../img/
+        print(self.xlsx_keys)  # show the extracted fields
 
 
 if __name__ == '__main__':