pdf处理

2025-04-12 11:37:09 +08:00 · 2023-07-26 01:16:23 +08:00 · 2023-07-26 01:16:23 +08:00 · d96dde1f9c
commit d96dde1f9c
parent df6cf1faa5
3 changed files with 67 additions and 23 deletions
--- a/pdf处理/base.py
+++ b/pdf处理/base.py
@@ -0,0 +1,10 @@
+from datetime import datetime
+
+
+class Base(object):
+
+    def __init__(self):
+        pass
+
+    def log(self, s):
+        print('【%s】 %s' % (datetime.now(), s), flush=True)
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@@ -2,14 +2,16 @@ import pandas as pd
 import pdfplumber
 import PyPDF2
 from datetime import datetime
+from base import Base
 from image_text_ocr import ImageTextOcr
 import os
 import cv2


-class Discern(object):
+class Discern(Base):

    def __init__(self):
+        super(Discern, self).__init__()
        self.image_text_ocr = ImageTextOcr()
        self.xlsx_keys = {}
        self.xlsx_keys_list = []
@@ -19,7 +21,7 @@ class Discern(object):
        # 将字典列表转换为DataFrame
        pf = pd.DataFrame(list(export))
        current_time = datetime.now()
-        formatted_time = current_time.strftime('%Y-%m-%d %H:%M:%S')
+        formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
        file_path = pd.ExcelWriter(f'../docs/{formatted_time}.xlsx')
        # 替换空单元格
        pf.fillna(' ', inplace=True)
@@ -28,14 +30,38 @@ class Discern(object):
        # 保存表格
        file_path.close()

+    def is_valid_time(self, input_str):
+        try:
+            valid_time = datetime.strptime(input_str, "%Y-%m-%d")  # 根据实际时间格式调整
+            return valid_time
+        except ValueError:
+            return False
+
+    def pdf_all_text(self, pdf_path):
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages[1:]:
+                # 提取页面文本
+                text = page.extract_text()
+                lines = text.split()
+                valid_time_list = []
+                for line in lines:
+                    if 'SST' in line:
+                        self.xlsx_keys['方案编号'] = line
+
+                    valid_time = self.is_valid_time(line)
+                    if valid_time:
+                        valid_time_list.append(valid_time)
+                if valid_time_list:
+                    self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
+
    def pdf_text(self, pdf_path):
        with pdfplumber.open(pdf_path) as pdf:
-            # 遍历每个页面
            page = pdf.pages[0]
            # 提取页面文本
            text = page.extract_text()
            lines = text.split("\n")
            line_str = ''
+            company = ''
            for line in lines:
                line_str += line
                if '报告编号:' in line:
@@ -46,12 +72,17 @@ class Discern(object):
                    self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip().replace(': ', '')
                if 'Article Name:' in line:
                    self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '')
-                if '公司' in line and '中检华通' not in line and '制造商' not in line:
-                    self.xlsx_keys['公司名称'] = line.strip()
+                if '公司' in line:
+                    company += line.strip()
                if '最终报告' in line:
                    self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
                self.xlsx_keys['标志'] = ''

+            company_list = company.split(" ")
+            for company_str in company_list:
+                if '中检华通' not in company_str and '制造商' not in company_str:
+                    self.xlsx_keys['公司名称'] += company_str
+
    def pdf_images(self, pdf_path):
        self.num = 0
        pdf_reader = PyPDF2.PdfReader(pdf_path)
@@ -77,7 +108,7 @@ class Discern(object):
            try:
                text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
            except cv2.error as c:
-                print(c)
+                self.log(c)
                pass

            if text_dict.get('标志'):
@@ -115,10 +146,13 @@ class Discern(object):
                if entry.is_file():
                    file_path = entry.path
                    file_name = entry.name
+                    self.log(file_name)
                    self.xlsx_keys['文件名'] = file_name
                    self.pdf_text(file_path)
                    self.pdf_images(file_path)
                    self.get_images_text()
+                    if not self.xlsx_keys['方案编号'] and not self.xlsx_keys['签发日期']:
+                        self.pdf_all_text(file_path)
                self.xlsx_keys_list.append(self.xlsx_keys)
        self.export_excel(self.xlsx_keys_list)

--- a/pdf处理/program/image_text_ocr.py
+++ b/pdf处理/program/image_text_ocr.py
@@ -1,20 +1,22 @@
 from datetime import datetime
+from base import Base
 from PIL import Image
 import pytesseract
 import platform


-class ImageTextOcr(object):
+class ImageTextOcr(Base):

    def __init__(self):
+        super(ImageTextOcr, self).__init__()
        current_os = platform.system()
        if current_os == 'Windows':
            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
-            print("当前操作系统是 Windows")
+            self.log("当前操作系统是 Windows")
        elif current_os == 'Linux':
-            print("当前操作系统是 Ubuntu")
+            self.log("当前操作系统是 Ubuntu")
        else:
-            print(f"当前操作系统是 {current_os}")
+            self.log(f"当前操作系统是 {current_os}")

    def is_valid_time(self, input_str):
        try:
@@ -28,22 +30,20 @@ class ImageTextOcr(object):
        # 加载图像
        image = Image.open(path)
        result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
-        lines = result.split('\n')
+        lines = result.split()
        for line in lines:
-            data_list = line.split(' ')
-            for data in data_list:
-                if 'S$T' in data or 'SST' in data:
-                    text_dict['方案编号'] = data.replace('S$T', 'SST')
+            if 'S$T' in line or 'SST' in line:
+                text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')

-                if 'CNAS' in data:
-                    text_dict['标志'] = 'cnas中文,'
+            if 'CNAS' in line:
+                text_dict['标志'] = 'cnas中文,'

-                if '200015344424' in data:
-                    text_dict['标志'] = '国cma,'
+            if '200015344424' in line:
+                text_dict['标志'] = '国cma,'

-                valid_time = self.is_valid_time(data)
-                if valid_time:
-                    valid_time_list.append(valid_time)
+            valid_time = self.is_valid_time(line)
+            if valid_time:
+                valid_time_list.append(valid_time)

        if valid_time_list:
            text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
@@ -61,5 +61,5 @@ if __name__ == '__main__':
        '签发日期': '',
        '标志': ''
    }
-    res = image_text_ocr.run(text_dict, '../target_img/image_3.png')
+    res = image_text_ocr.run(text_dict, '../target_img/image_5.png')
    print(res)