pdf 处理

2025-04-19 18:24:51 +08:00 · 2023-07-25 12:20:16 +08:00 · 2023-07-25 12:20:16 +08:00 · dc82b5c03d
commit dc82b5c03d
parent c6ddc2c570
2 changed files with 28 additions and 10 deletions
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@@ -75,6 +75,9 @@ class Discern(object):
                cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
                text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
            except cv2.error as c:
+                cma_flag = ''
+                cnas_flag = ''
+                text_list = ''
                pass
            if cma_flag:
                self.xlsx_keys['标志'] = '国cma'
--- a/pdf处理/program/image_text_ocr.py
+++ b/pdf处理/program/image_text_ocr.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from PIL import Image
 import pytesseract
 import platform
@@ -15,27 +16,41 @@ class ImageTextOcr(object):
        else:
            print(f"当前操作系统是 {current_os}")

+    def is_valid_time(self, input_str):
+        try:
+            valid_time = datetime.strptime(input_str, "%Y-%m-%d")  # 根据实际时间格式调整
+            return valid_time
+        except ValueError:
+            return False
+
    def image_text_ocr(self, path):
        text_list = []
+        valid_time_list = []
        # 加载图像
        image = Image.open(path)
-        result = pytesseract.image_to_string(image, lang='chi_sim', output_type=pytesseract.Output.STRING)
+        result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
        lines = result.split('\n')
        for line in lines:
            data_list = line.split(' ')
            for data in data_list:
-                if '试验方案编号' in data:
-                    text_list.append(data_list[-1])
-                if '报告完成日期' in data:
-                    text_list.append(data_list[-1])
-        return text_list
+                if 'S$T' in data or 'SST' in data:
+                    text_list.append(data.replace('S$T', 'SST'))
+                valid_time = self.is_valid_time(data)
+                if valid_time:
+                    valid_time_list.append(valid_time)
+        if valid_time_list:
+            text_list.append(max(valid_time_list).strftime("%Y-%m-%d"))
+        if len(text_list) == 2:
+            return text_list
+        else:
+            return []

    def run(self, path):
-        text_list = self.image_text_ocr(path)
-        return text_list
+        res_list = self.image_text_ocr(path)
+        return res_list


 if __name__ == '__main__':
    image_text_ocr = ImageTextOcr()
-    text_list = image_text_ocr.run('../target_img/image_7.jpg')
-    print(text_list)
+    res = image_text_ocr.run('../target_img/image_3.png')
+    print(res)