pdf 处理

2025-04-22 04:10:25 +08:00 · 2023-07-28 14:45:01 +08:00 · 2023-07-28 14:45:01 +08:00 · 9210226e28
commit 9210226e28
parent c7d2099d3b
3 changed files with 29 additions and 10 deletions
--- a/pdf处理/main.py
+++ b/pdf处理/main.py
@ -0,0 +1,4 @@
+from program.discern import Discern
+
+discern = Discern()
+discern.run('./file_test')
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@ -1,10 +1,11 @@
 import pandas as pd
+from PIL import Image
 import pdfplumber
 import re
 import PyPDF2
 from datetime import datetime
 from base import Base
-from image_text_ocr import ImageTextOcr
+from program.image_text_ocr import ImageTextOcr
 import os
 import cv2

@ -23,7 +24,7 @@ class Discern(Base):
        pf = pd.DataFrame(list(export))
        current_time = datetime.now()
        formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
-        file_path = pd.ExcelWriter(f'../docs/无源{formatted_time}.xlsx')
+        file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx')
        # 替换空单元格
        pf.fillna(' ', inplace=True)
        # 输出
@ -87,15 +88,28 @@ class Discern(Base):
                if '公司' in line:
                    company += line.strip()
                if '最终报告' in line or 'Final Report' in line:
-                    self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '')\
+                    self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '') \
                        .replace('中国认可国际互认检测', '')

            company_list = company.split()
            for company_str in company_list:
                self.xlsx_keys['公司名称'] += company_str
-                self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', '').\
+                self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
                    replace('中检华通威国际检验（苏州）有限公司', '')

+    def processing_image(self, img_file, standard=205):
+        """ 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """
+        img = Image.open(img_file)
+        img = img.convert('L')
+        pixels = img.load()
+        for x in range(img.width):
+            for y in range(img.height):
+                if pixels[x, y] > standard:
+                    pixels[x, y] = 255
+                else:
+                    pixels[x, y] = 0
+        img.save(img_file)
+
    def pdf_images(self, pdf_path):
        self.num = 0
        pdf_reader = PyPDF2.PdfReader(pdf_path)
@ -105,11 +119,12 @@ class Discern(Base):

            for obj in xObject:
                if xObject[obj]['/Subtype'] == '/Image':
-                    size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
                    self.num += 1
-                    image_file = f"../target_img/image_{self.num}.png"
+                    image_file = f"./target_img/image_{self.num}.png"
                    with open(image_file, "wb") as f:
                        f.write(xObject[obj].get_data())
+                    if page_num != 0:
+                        self.processing_image(image_file)

    def get_images_text(self):
        for i in range(1, self.num + 1):
@ -119,7 +134,7 @@ class Discern(Base):
                '标志': ''
            }
            try:
-                text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
+                text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png')
            except cv2.error as c:
                self.log(c)
                pass
@ -155,7 +170,7 @@ class Discern(Base):
                    '签发日期': '',
                    '公司名称': ''
                }
-                self.remove_file('../target_img')
+                self.remove_file('./target_img')
                if entry.is_file():
                    file_path = entry.path
                    file_name = entry.name
@ -177,4 +192,4 @@ class Discern(Base):

 if __name__ == '__main__':
    discern = Discern()
-    discern.run('../file')
+    discern.run('./file_test')
--- a/pdf处理/program/image_text_ocr.py
+++ b/pdf处理/program/image_text_ocr.py
@ -61,5 +61,5 @@ if __name__ == '__main__':
        '签发日期': '',
        '标志': ''
    }
-    res = image_text_ocr.run(text_dict, '../target_img/image_5.png')
+    res = image_text_ocr.run(text_dict, '../target_img/image_2.jpg')
    print(res)