提取pdf文件中图片和文字

2025-04-20 03:59:57 +08:00 · 2023-08-01 14:24:53 +08:00 · 2023-08-01 14:24:53 +08:00 · b13155d7f2
commit b13155d7f2
parent 749cf70dcd
4 changed files with 176 additions and 182 deletions
--- a/pdf处理/main.py
+++ b/pdf处理/main.py
@ -1,4 +1,4 @@
-from program.discern import Discern
+from program.testing_agency_report import TestingAgencyReport

-discern = Discern()
-discern.run('./file_test')
+testing_agency_report = TestingAgencyReport()
+testing_agency_report.run('./file', './target_img', './docs')
--- a/pdf处理/program/image_text_ocr.py
+++ b/pdf处理/program/image_text_ocr.py
@ -1,65 +0,0 @@
-from datetime import datetime
-from base import Base
-from PIL import Image
-import pytesseract
-import platform
-
-
-class ImageTextOcr(Base):
-
-    def __init__(self):
-        super(ImageTextOcr, self).__init__()
-        current_os = platform.system()
-        if current_os == 'Windows':
-            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
-            self.log("当前操作系统是 Windows")
-        elif current_os == 'Linux':
-            self.log("当前操作系统是 Ubuntu")
-        else:
-            self.log(f"当前操作系统是 {current_os}")
-
-    def is_valid_time(self, input_str):
-        try:
-            valid_time = datetime.strptime(input_str, "%Y-%m-%d")  # 根据实际时间格式调整
-            return valid_time
-        except ValueError:
-            return False
-
-    def image_text_ocr(self, text_dict, path):
-        valid_time_list = []
-        # 加载图像
-        image = Image.open(path)
-        result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
-        lines = result.split()
-        for line in lines:
-            if 'S$T' in line or 'SST' in line:
-                text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
-
-            if 'CNAS' in line:
-                text_dict['标志'] = 'cnas中文,'
-
-            if '200015344424' in line:
-                text_dict['标志'] = '国cma,'
-
-            valid_time = self.is_valid_time(line)
-            if valid_time:
-                valid_time_list.append(valid_time)
-
-        if valid_time_list:
-            text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
-        return text_dict
-
-    def run(self, text_dict, path):
-        res_list = self.image_text_ocr(text_dict, path)
-        return res_list
-
-
-if __name__ == '__main__':
-    image_text_ocr = ImageTextOcr()
-    text_dict = {
-        '方案编号': '',
-        '签发日期': '',
-        '标志': ''
-    }
-    res = image_text_ocr.run(text_dict, '../target_img/output_image-003.png')
-    print(res)
--- a/pdf处理/program/pdf_base.py
+++ b/pdf处理/program/pdf_base.py
@ -0,0 +1,101 @@
+from datetime import datetime
+from base import Base
+import subprocess
+from PIL import Image
+import pandas as pd
+import pytesseract
+import platform
+import os
+
+
+class PDFBase(Base):
+    """
+    Shared helpers for PDF report processing.
+
+    Bundles image extraction (external ``pdfimages`` command), OCR
+    (pytesseract), cleanup of the image folder, date parsing and Excel
+    export so that report-specific subclasses only implement the parsing.
+    """
+
+    def __init__(self):
+        """
+        Initialise the base class and configure pytesseract for the host OS.
+        """
+        # Name this class (not ``Base``) in super(): super(Base, self) would
+        # start the lookup *after* Base and silently skip Base.__init__.
+        super(PDFBase, self).__init__()
+        current_os = platform.system()
+        if current_os == 'Windows':
+            # Hard-coded install location of the Tesseract binary on Windows.
+            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
+            self.log("当前操作系统是 Windows")
+        elif current_os == 'Linux':
+            # NOTE(review): the message says "Ubuntu" for every Linux system.
+            self.log("当前操作系统是 Ubuntu")
+        else:
+            self.log(f"当前操作系统是 {current_os}")
+
+    def download_img(self, input_pdf, output_image):
+        """
+        Extract every image embedded in a PDF as PNG files.
+
+        Runs ``pdfimages -png``; the files are written into ``output_image``
+        using the file-name prefix ``image``. Failures are logged, not raised.
+        :param input_pdf: path of the PDF file to read
+        :param output_image: directory that receives the extracted images
+        :return: None
+        """
+        # Pass an argument list (no shell) so quotes, spaces or shell
+        # metacharacters in either path cannot break or inject into the command.
+        command = ['pdfimages', '-png', str(input_pdf), f'{output_image}/image']
+        try:
+            completed = subprocess.run(command, capture_output=True, text=True)
+        except Exception as e:
+            # Covers e.g. a missing ``pdfimages`` executable.
+            self.log(f"出现异常：{e}")
+            return
+        if completed.returncode != 0:
+            # Surface the tool's own error output instead of ignoring it.
+            self.log(f"出现异常：{completed.stderr}")
+
+    @staticmethod
+    def read_img_ocr(img_path):
+        """
+        Run OCR on an image and return the recognised text as tokens.
+        :param img_path: path of the image file
+        :return: list of whitespace-separated strings found in the image
+        """
+        # Close the file handle as soon as OCR is done: remove_img deletes
+        # these files later, and an open handle can block deletion on Windows.
+        with Image.open(img_path) as image:
+            result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
+        return result.split()
+
+    def remove_img(self, img_path):
+        """
+        Delete every regular file directly inside a folder.
+
+        Sub-directories are left untouched; a file that cannot be removed is
+        logged and skipped.
+        :param img_path: directory to clean
+        :return: None
+        """
+        with os.scandir(img_path) as entries:
+            for entry in entries:
+                if not entry.is_file():
+                    continue
+                try:
+                    os.remove(entry.path)
+                except OSError as e:
+                    self.log(f"错误信息：{e}")
+
+    @staticmethod
+    def is_valid_time(input_str):
+        """
+        Parse a ``YYYY-MM-DD`` date string.
+        :param input_str: candidate text
+        :return: the parsed ``datetime``, or ``False`` when the text does not
+                 match the format
+        """
+        try:
+            return datetime.strptime(input_str, "%Y-%m-%d")
+        except ValueError:
+            return False
+
+    @staticmethod
+    def export_excel(export, excel_path):
+        """
+        Write a list of dict records to a timestamped Excel workbook.
+
+        Missing cells are replaced by a single space and rows are sorted by
+        the ``样品名称`` column when that column exists.
+        :param export: iterable of dicts, one per output row
+        :param excel_path: directory in which the workbook is created
+        :return: None
+        """
+        pf = pd.DataFrame(list(export))
+        # Replace empty cells.
+        pf = pf.fillna(' ')
+        # An empty record list has no columns; skip sorting instead of raising.
+        if '样品名称' in pf.columns:
+            pf = pf.sort_values(by='样品名称')
+        formatted_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
+        # Prepare the data first and use the writer as a context manager so the
+        # workbook is always closed, even if writing fails.
+        with pd.ExcelWriter(f'{excel_path}/无源{formatted_time}.xlsx') as writer:
+            pf.to_excel(writer, index=False)
+
+
+if __name__ == '__main__':
+    # Manual smoke test: OCR one previously extracted image and print its tokens.
+    pdf_base = PDFBase()
+    # Optional first step (disabled): extract the images of a sample PDF.
+    # pdf_base.download_img('../file_test/1.pdf', '../target_img/')
+    res = pdf_base.read_img_ocr('../target_img/image-017.png')
+    print(res)
--- a/pdf处理/program/testing_agency_report.py
+++ b/pdf处理/program/testing_agency_report.py
@ -1,67 +1,56 @@
-import pandas as pd
-from PIL import Image
+from program.pdf_base import PDFBase
 import pdfplumber
-import re
-import PyPDF2
-from datetime import datetime
-from base import Base
-from program.image_text_ocr import ImageTextOcr
-import os
 import cv2
+import os
+import re


-class Discern(Base):
+class TestingAgencyReport(PDFBase):

    def __init__(self):
-        super(Discern, self).__init__()
-        self.image_text_ocr = ImageTextOcr()
+        super(TestingAgencyReport, self).__init__()
        self.xlsx_keys = {}
        self.xlsx_keys_list = []
        self.num = 0

-    def export_excel(self, export):
-        # 将字典列表转换为DataFrame
-        pf = pd.DataFrame(list(export))
-        current_time = datetime.now()
-        formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
-        file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx')
-        # 替换空单元格
-        pf.fillna(' ', inplace=True)
-        # 输出
-        pf = pf.sort_values(by='样品名称')
-        pf.to_excel(file_path, index=False)
-        # 保存表格
-        file_path.close()
+    def pdf_images(self, pdf_path, img_path):
+        self.download_img(pdf_path, img_path)
+        with os.scandir(img_path) as entries:
+            for entry in entries:
+                if entry.is_file():
+                    text_dict = {
+                        '方案编号': '',
+                        '签发日期': '',
+                        '标志': ''
+                    }

-    def is_valid_time(self, input_str):
-        try:
-            valid_time = datetime.strptime(input_str, "%Y-%m-%d")  # 根据实际时间格式调整
-            return valid_time
-        except ValueError:
-            return False
+                    try:
+                        lines = self.read_img_ocr(entry.path)
+                        valid_time_list = []
+                        for line in lines:
+                            if 'S$T' in line or 'SST' in line:
+                                text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')

-    def pdf_all_text(self, pdf_path):
-        with pdfplumber.open(pdf_path) as pdf:
-            for page in pdf.pages[1:]:
-                # 提取页面文本
-                text = page.extract_text()
-                lines = text.split()
-                valid_time_list = []
-                for line in lines:
-                    if 'SST' in line and not self.xlsx_keys['方案编号']:
-                        self.xlsx_keys['方案编号'] = line
+                            if 'CNAS' in line:
+                                text_dict['标志'] = 'cnas中文,'

-                    if '签发日期' in line and not self.xlsx_keys['签发日期']:
-                        self.xlsx_keys['签发日期'] = line.replace('签发日期', '')
+                            if '200015344424' in line:
+                                text_dict['标志'] = '国cma,'

-                    if 'GLP' in line:
-                        self.xlsx_keys['标志'] += 'GLP,'
+                            valid_time = self.is_valid_time(line)
+                            if valid_time:
+                                valid_time_list.append(valid_time)
+                        if valid_time_list:
+                            text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
+                    except cv2.error as c:
+                        self.log(c)

-                    valid_time = self.is_valid_time(line)
-                    if valid_time:
-                        valid_time_list.append(valid_time)
-                if valid_time_list:
-                    self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
+                    if text_dict.get('标志'):
+                        self.xlsx_keys['标志'] += text_dict.get('标志')
+                    if text_dict.get('方案编号'):
+                        self.xlsx_keys['方案编号'] = text_dict.get('方案编号')
+                    if text_dict.get('签发日期'):
+                        self.xlsx_keys['签发日期'] = text_dict.get('签发日期')

    def pdf_text(self, pdf_path):
        with pdfplumber.open(pdf_path) as pdf:
@ -97,67 +86,32 @@ class Discern(Base):
                self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
                    replace('中检华通威国际检验（苏州）有限公司', '')

-    def processing_image(self, img_file, standard=205):
-        """ 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """
-        img = Image.open(img_file)
-        img = img.convert('L')
-        pixels = img.load()
-        for x in range(img.width):
-            for y in range(img.height):
-                if pixels[x, y] > standard:
-                    pixels[x, y] = 255
-                else:
-                    pixels[x, y] = 0
-        img.save(img_file)
+    def pdf_all_text(self, pdf_path):
+        with pdfplumber.open(pdf_path) as pdf:
+            for page in pdf.pages[1:]:
+                # 提取页面文本
+                text = page.extract_text()
+                lines = text.split()
+                valid_time_list = []
+                for line in lines:

-    def pdf_images(self, pdf_path):
-        self.num = 0
-        pdf_reader = PyPDF2.PdfReader(pdf_path)
-        for page_num in range(len(pdf_reader.pages)):
-            page = pdf_reader.pages[page_num]
-            xObject = page['/Resources']['/XObject'].get_object()
+                    if 'SST' in line and not self.xlsx_keys['方案编号']:
+                        self.xlsx_keys['方案编号'] = line

-            for obj in xObject:
-                if xObject[obj]['/Subtype'] == '/Image':
-                    self.num += 1
-                    image_file = f"./target_img/image_{self.num}.png"
-                    with open(image_file, "wb") as f:
-                        f.write(xObject[obj].get_data())
-                    if page_num != 0:
-                        self.processing_image(image_file)
+                    if '签发日期' in line and not self.xlsx_keys['签发日期']:
+                        self.xlsx_keys['签发日期'] = line.replace('签发日期', '')

-    def get_images_text(self):
-        for i in range(1, self.num + 1):
-            text_dict = {
-                '方案编号': '',
-                '签发日期': '',
-                '标志': ''
-            }
-            try:
-                text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png')
-            except cv2.error as c:
-                self.log(c)
-                pass
+                    if 'GLP' in line:
+                        self.xlsx_keys['标志'] += 'GLP,'

-            if text_dict.get('标志'):
-                self.xlsx_keys['标志'] += text_dict.get('标志')
-            if text_dict.get('方案编号'):
-                self.xlsx_keys['方案编号'] = text_dict.get('方案编号')
-            if text_dict.get('签发日期'):
-                self.xlsx_keys['签发日期'] = text_dict.get('签发日期')
+                    valid_time = self.is_valid_time(line)
+                    if valid_time:
+                        valid_time_list.append(valid_time)
+                if valid_time_list:
+                    self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")

-    def remove_file(self, folder_path):
-        with os.scandir(folder_path) as entries:
-            for entry in entries:
-                if entry.is_file():
-                    file_path = entry.path
-                    try:
-                        os.remove(file_path)
-                    except Exception as e:
-                        pass
-
-    def run(self, folder_path):
-        with os.scandir(folder_path) as entries:
+    def discern(self, pdf_path, img_path, excel_path):
+        with os.scandir(pdf_path) as entries:
            for entry in entries:
                self.xlsx_keys = {
                    '登记日期': '',
@ -170,26 +124,30 @@ class Discern(Base):
                    '签发日期': '',
                    '公司名称': ''
                }
-                self.remove_file('./target_img')
+                self.remove_img(img_path)
                if entry.is_file():
-                    file_path = entry.path
+                    pdf_path = entry.path
                    file_name = entry.name
                    self.log(file_name)
                    self.xlsx_keys['文件名'] = file_name
-                    self.pdf_text(file_path)
-                    self.pdf_images(file_path)
-                    self.get_images_text()
-                    self.pdf_all_text(file_path)
+                    self.pdf_text(pdf_path)
+                    self.pdf_images(pdf_path, img_path)
+
+                    self.pdf_all_text(pdf_path)
                    if not self.xlsx_keys['方案编号']:
                        matches = re.findall(r'SST\d+BB', file_name)
                        if matches:
                            self.xlsx_keys['方案编号'] = matches[0]
                        else:
                            self.log("未找到匹配的模式方案编号")
+
                self.xlsx_keys_list.append(self.xlsx_keys)
-        self.export_excel(self.xlsx_keys_list)
+        self.export_excel(self.xlsx_keys_list, excel_path)
+
+    def run(self, pdf_path, img_path, excel_path):
+        """
+        Public entry point; delegates the whole job to ``discern``.
+        :param pdf_path: directory that is scanned for PDF files
+        :param img_path: working directory for the extracted images
+        :param excel_path: directory that receives the exported workbook
+        :return: None
+        """
+        self.discern(pdf_path=pdf_path, img_path=img_path, excel_path=excel_path)


 if __name__ == '__main__':
-    discern = Discern()
-    discern.run('./file_test')
+    testing_agency_report = TestingAgencyReport()
+    testing_agency_report.run('../file_test', '../target_img', '../docs')