diff --git a/pdf处理/program/extract_from_pages.py b/pdf处理/program/extract_from_pages.py
new file mode 100644
index 0000000..a47263f
--- /dev/null
+++ b/pdf处理/program/extract_from_pages.py
@@ -0,0 +1,100 @@
+import os
+import fitz
+
+dimlimit = 0  # 100  # each image side must be greater than this
+relsize = 0  # 0.05  # image : image size ratio must be larger than this (5%)
+abssize = 0  # 2048  # absolute image size limit 2 KB: ignore if smaller
+
+
+def recoverpix(doc, item):
+    """
+    Return the image data for one entry of a page's image list.
+
+    Images with an /SMask are recombined with their mask, images with a
+    /ColorSpace definition are converted to RGB PNG, and every other image
+    is returned as ``doc.extract_image`` delivers it.
+
+    :param doc: the opened PDF document
+    :param item: image list entry; ``item[0]`` is the image xref and
+        ``item[1]`` the xref of its /SMask (a value above 0 means a mask exists)
+    :return: dict from which the caller reads "ext", "colorspace" and "image"
+    """
+    xref = item[0]  # xref of the PDF image
+    smask = item[1]  # xref of its /SMask
+
+    # Special case: an /SMask or /Mask exists.
+    if smask > 0:
+        pix0 = fitz.Pixmap(doc.extract_image(xref)["image"])
+        if pix0.alpha:  # catch irregular situation
+            pix0 = fitz.Pixmap(pix0, 0)  # remove the alpha channel
+        mask = fitz.Pixmap(doc.extract_image(smask)["image"])
+
+        try:
+            pix = fitz.Pixmap(pix0, mask)
+        except Exception:  # fall back to the original base image on problems
+            pix = fitz.Pixmap(doc.extract_image(xref)["image"])
+
+        ext = "pam" if pix0.n > 3 else "png"
+
+        return {  # create the dictionary expected by the caller
+            "ext": ext,
+            "colorspace": pix.colorspace.n,
+            "image": pix.tobytes(ext),
+        }
+
+    # Special case: a /ColorSpace definition exists.
+    # To be safe, these cases are converted to RGB PNG images.
+    if "/ColorSpace" in doc.xref_object(xref, compressed=True):
+        pix = fitz.Pixmap(doc, xref)
+        pix = fitz.Pixmap(fitz.csRGB, pix)
+        return {  # create the dictionary expected by the caller
+            "ext": "png",
+            "colorspace": 3,
+            "image": pix.tobytes("png"),
+        }
+    return doc.extract_image(xref)
+
+
+def read_pdf(pdf_path, output_folder):
+    """
+    Extract the images embedded in a PDF and write them into a folder.
+
+    Every image xref is examined once. Images that do not pass the
+    ``dimlimit``, ``abssize`` and ``relsize`` filters are skipped; the others
+    are written as ``imgNNNNN.EXT`` with the xref zero-padded to 5 digits.
+
+    :param pdf_path: path of the PDF file to read
+    :param output_folder: folder that receives the image files
+    :return: None
+    """
+    seen_xrefs = set()  # xrefs already examined
+    with fitz.open(pdf_path) as doc:  # close the document even on errors
+        for pno in range(doc.page_count):
+            for img in doc.get_page_images(pno):
+                xref = img[0]
+                if xref in seen_xrefs:
+                    continue
+                # Mark the xref before filtering so that an image used on
+                # several pages is examined only once.
+                seen_xrefs.add(xref)
+
+                width = img[2]
+                height = img[3]
+                if min(width, height) <= dimlimit:
+                    continue
+                image = recoverpix(doc, img)
+                n = image["colorspace"]
+                imgdata = image["image"]
+
+                if len(imgdata) <= abssize:
+                    continue
+                if len(imgdata) / (width * height * n) <= relsize:
+                    continue
+
+                imgfile = os.path.join(output_folder, f"img{xref:05d}.{image['ext']}")
+                with open(imgfile, "wb") as fout:
+                    fout.write(imgdata)
+
+
+if __name__ == '__main__':
+    read_pdf('../1.pdf', '../target_img')
diff --git a/pdf处理/program/pdf_base.py b/pdf处理/program/pdf_base.py
index 251c59d..0934d26 100644
--- a/pdf处理/program/pdf_base.py
+++ b/pdf处理/program/pdf_base.py
@@ -1,6 +1,6 @@
 from datetime import datetime
+from extract_from_pages import read_pdf
 from base import Base
-import subprocess
 from PIL import Image
 import pandas as pd
 import pytesseract
@@ -29,15 +29,14 @@ class PDFBase(Base):
         :return:
         """
         try:
-            subprocess.run(f"pdfimages -png '{input_pdf}' {output_image}/image", shell=True,
-                           capture_output=True, text=True)
+            read_pdf(input_pdf, output_image)
         except Exception as e:
             self.log(f"出现异常:{e}")
 
     @staticmethod
-    def read_img_ocr(img_path, standard=205):
+    def read_img_ocr_binarization(img_path, standard=205):
         """
-        读取图片中文字内容
+        Read the text content of an image after binarization
         :param img_path:
         :return:
         """
@@ -56,6 +55,19 @@ class PDFBase(Base):
         lines = result.split()
         return lines
 
+    @staticmethod
+    def read_img_ocr(img_path):
+        """
+        Read the text content of an image with Tesseract OCR.
+
+        :param img_path: path of the image file to recognize
+        :return: list of the whitespace-separated tokens that were recognized
+        """
+        # Release the image file as soon as OCR is done.
+        with Image.open(img_path) as img:
+            result = pytesseract.image_to_string(img, config=r'--oem 3 --psm 6 -l chi_sim+eng')
+        return result.split()
+
     def remove_img(self, img_path):
         """
         删除当前文件夹下所有的图片
diff --git a/pdf处理/program/testing_agency_report.py b/pdf处理/program/testing_agency_report.py
index 5dc441c..7cf92d7 100644
--- a/pdf处理/program/testing_agency_report.py
+++ b/pdf处理/program/testing_agency_report.py
@@ -25,23 +25,23 @@ class TestingAgencyReport(PDFBase):
         }
 
         try:
-            lines = self.read_img_ocr(entry.path)
+            lines = self.read_img_ocr_binarization(entry.path)
             valid_time_list = []
             for line in lines:
                 if 'S$T' in line or 'SST' in line:
                     text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
-
                 if 'CNAS' in line:
                     text_dict['标志'] = 'cnas中文,'
-
-                if '200015344424' in line:
-                    text_dict['标志'] = '国cma,'
-
                 valid_time = self.is_valid_time(line)
                 if valid_time:
                     valid_time_list.append(valid_time)
             if valid_time_list:
                 text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
+
+            lines = self.read_img_ocr(entry.path)
+            for line in lines:
+                if '200015344424' in line:
+                    text_dict['标志'] = '国cma,'
         except cv2.error as c:
             self.log(c)
 
@@ -150,4 +150,4 @@ class TestingAgencyReport(PDFBase):
 
 if __name__ == '__main__':
     testing_agency_report = TestingAgencyReport()
-    testing_agency_report.run('../file_test', '../target_img', '../docs')
+    testing_agency_report.run('../file', '../target_img', '../docs')