提取pdf文件中图片和文字

2025-04-23 01:14:24 +08:00 · 2023-08-02 13:00:24 +08:00 · 2023-08-02 13:00:24 +08:00 · 2dc7cb4a6d
commit 2dc7cb4a6d
parent bc6a9102a5
3 changed files with 112 additions and 12 deletions
--- a/pdf处理/program/extract_from_pages.py
+++ b/pdf处理/program/extract_from_pages.py
@ -0,0 +1,88 @@
+import os
+import fitz
+
+dimlimit = 0  # 100  # minimum number of pixels required on each image side
+relsize = 0  # 0.05  # image: size ratio must be larger than this value (5%)
+abssize = 0  # 2048  # absolute image size limit 2 KB: ignore the image if smaller
+
+
+def recoverpix(doc, item):
+    '''
+    恢复像素
+    :param doc:
+    :param item:
+    :return:
+    '''
+    xref = item[0]  # PDF 图像的 xref
+    smask = item[1]  # 其 /SMask 的 xref
+
+    # 特殊情况：存在 /SMask 或 /Mask
+    if smask > 0:
+        pix0 = fitz.Pixmap(doc.extract_image(xref)["image"])
+        if pix0.alpha:  # 捕获异常情况
+            pix0 = fitz.Pixmap(pix0, 0)  # 删除 alpha 通道
+        mask = fitz.Pixmap(doc.extract_image(smask)["image"])
+
+        try:
+            pix = fitz.Pixmap(pix0, mask)
+        except:  # 如果有问题，回退到原始基本图像
+            pix = fitz.Pixmap(doc.extract_image(xref)["image"])
+
+        if pix0.n > 3:
+            ext = "pam"
+        else:
+            ext = "png"
+
+        return {  # 创建预期的字典
+            "ext": ext,
+            "colorspace": pix.colorspace.n,
+            "image": pix.tobytes(ext),
+        }
+
+    # 特殊情况：存在 /ColorSpace 定义
+    # 为确保安全，我们将这些情况转换为 RGB PNG 图像
+    if "/ColorSpace" in doc.xref_object(xref, compressed=True):
+        pix = fitz.Pixmap(doc, xref)
+        pix = fitz.Pixmap(fitz.csRGB, pix)
+        return {  # 创建预期的字典
+            "ext": "png",
+            "colorspace": 3,
+            "image": pix.tobytes("png"),
+        }
+    return doc.extract_image(xref)
+
+
+def read_pdf(pdf_path, output_folder):
+    doc = fitz.open(pdf_path)
+    page_count = doc.page_count
+    xreflist = []
+    imglist = []
+    for pno in range(page_count):
+        il = doc.get_page_images(pno)
+        imglist.extend([x[0] for x in il])
+        for img in il:
+            xref = img[0]
+            if xref in xreflist:
+                continue
+            width = img[2]
+            height = img[3]
+            if min(width, height) <= dimlimit:
+                continue
+            image = recoverpix(doc, img)
+            n = image["colorspace"]
+            imgdata = image["image"]
+
+            if len(imgdata) <= abssize:
+                continue
+            if len(imgdata) / (width * height * n) <= relsize:
+                continue
+
+            imgfile = os.path.join(output_folder, "img%05i.%s" % (xref, image["ext"]))
+            fout = open(imgfile, "wb")
+            fout.write(imgdata)
+            fout.close()
+            xreflist.append(xref)
+
+
+if __name__ == '__main__':
+    # Manual run: extract the images of ../1.pdf into ../target_img.
+    read_pdf('../1.pdf', '../target_img')
--- a/pdf处理/program/pdf_base.py
+++ b/pdf处理/program/pdf_base.py
@ -1,6 +1,6 @@
 from datetime import datetime
+from extract_from_pages import read_pdf
 from base import Base
-import subprocess
 from PIL import Image
 import pandas as pd
 import pytesseract
@ -29,15 +29,14 @@ class PDFBase(Base):
        :return:
        """
        try:
-            subprocess.run(f"pdfimages -png '{input_pdf}' {output_image}/image", shell=True,
-                           capture_output=True, text=True)
+            read_pdf(input_pdf, output_image)
        except Exception as e:
            self.log(f"出现异常：{e}")

    @staticmethod
-    def read_img_ocr(img_path, standard=205):
+    def read_img_ocr_binarization(img_path, standard=205):
        """
-        读取图片中文字内容
+        二值化读取图片中文字内容
        :param img_path:
        :return:
        """
@ -56,6 +55,19 @@ class PDFBase(Base):
        lines = result.split()
        return lines

+    @staticmethod
+    def read_img_ocr(img_path):
+        """
+        读取图片中文字内容
+        :param img_path:
+        :return:
+        """
+        img = Image.open(img_path)
+        # 图像识别
+        result = pytesseract.image_to_string(img, config=r'--oem 3 --psm 6 -l chi_sim+eng')
+        lines = result.split()
+        return lines
+
    def remove_img(self, img_path):
        """
        删除当前文件夹下所有的图片
--- a/pdf处理/program/testing_agency_report.py
+++ b/pdf处理/program/testing_agency_report.py
@ -25,23 +25,23 @@ class TestingAgencyReport(PDFBase):
                    }

                    try:
-                        lines = self.read_img_ocr(entry.path)
+                        lines = self.read_img_ocr_binarization(entry.path)
                        valid_time_list = []
                        for line in lines:
                            if 'S$T' in line or 'SST' in line:
                                text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
-
                            if 'CNAS' in line:
                                text_dict['标志'] = 'cnas中文,'
-
-                            if '200015344424' in line:
-                                text_dict['标志'] = '国cma,'
-
                            valid_time = self.is_valid_time(line)
                            if valid_time:
                                valid_time_list.append(valid_time)
                        if valid_time_list:
                            text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
+
+                        lines = self.read_img_ocr(entry.path)
+                        for line in lines:
+                            if '200015344424' in line:
+                                text_dict['标志'] = '国cma,'
                    except cv2.error as c:
                        self.log(c)

@ -150,4 +150,4 @@ class TestingAgencyReport(PDFBase):

 if __name__ == '__main__':
    testing_agency_report = TestingAgencyReport()
-    testing_agency_report.run('../file_test', '../target_img', '../docs')
+    testing_agency_report.run('../file', '../target_img', '../docs')