提取pdf文件中图片和文字

This commit is contained in:
aiyingfeng 2023-08-02 13:00:24 +08:00
parent bc6a9102a5
commit 2dc7cb4a6d
3 changed files with 112 additions and 12 deletions

View File

@ -0,0 +1,88 @@
import os
import fitz
# Thresholds consulted by read_pdf when deciding whether to keep an image.
# The number quoted in each comment is the alternative value that was left
# commented out next to the active setting.
dimlimit: int = 0  # (alt. 100) the shorter image side must exceed this many pixels
relsize: float = 0  # (alt. 0.05) data-size : pixel-size ratio must exceed this (5%)
abssize: int = 0  # (alt. 2048) images whose data is not larger than this (2 KB) are ignored
def recoverpix(doc, item):
    """
    Build the image data for one PDF image entry.

    Three cases are distinguished:

    1. The entry names a soft mask (``item[1] > 0``): the base image and the
       mask are loaded as pixmaps and combined; if combining fails, the base
       image alone is used.
    2. The image object's definition contains ``/ColorSpace``: the image is
       converted to RGB and returned as PNG.
    3. Otherwise the result of ``doc.extract_image(xref)`` is returned as is.

    :param doc: open PyMuPDF document owning the image
        (presumably a ``fitz.Document`` - confirm against the caller).
    :param item: image entry; ``item[0]`` is the image xref and ``item[1]``
        the xref of its /SMask (values <= 0 mean "no mask").
    :return: dict describing the image; cases 1 and 2 return exactly the keys
        ``"ext"``, ``"colorspace"`` and ``"image"``, case 3 returns whatever
        ``doc.extract_image`` produces.
    """
    xref = item[0]  # xref of the PDF image
    smask = item[1]  # xref of its /SMask

    # Special case: an /SMask or /Mask exists.
    if smask > 0:
        # Extract the base image once; the bytes are reused by the fallback.
        base_bytes = doc.extract_image(xref)["image"]
        pix0 = fitz.Pixmap(base_bytes)
        if pix0.alpha:  # irregular situation: base image already carries alpha
            pix0 = fitz.Pixmap(pix0, 0)  # drop the alpha channel
        mask = fitz.Pixmap(doc.extract_image(smask)["image"])

        try:
            pix = fitz.Pixmap(pix0, mask)
        except Exception:
            # Best effort: if base image and mask cannot be combined, fall
            # back to the original base image.  Only ordinary errors are
            # caught so KeyboardInterrupt / SystemExit still propagate.
            pix = fitz.Pixmap(base_bytes)

        # More than 3 components per pixel in the base image -> PAM, else PNG.
        ext = "pam" if pix0.n > 3 else "png"

        return {  # dictionary in the shape the caller expects
            "ext": ext,
            "colorspace": pix.colorspace.n,
            "image": pix.tobytes(ext),
        }

    # Special case: a /ColorSpace definition exists.
    # To be on the safe side these images are converted to RGB PNG.
    if "/ColorSpace" in doc.xref_object(xref, compressed=True):
        pix = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(fitz.csRGB, pix)
        return {  # dictionary in the shape the caller expects
            "ext": "png",
            "colorspace": 3,
            "image": pix.tobytes("png"),
        }

    return doc.extract_image(xref)
def read_pdf(pdf_path, output_folder):
    """
    Extract the images referenced by the pages of a PDF into a folder.

    Every page is scanned; each image xref is processed once even when it is
    referenced from several pages.  An image is skipped when its shorter side
    is not larger than ``dimlimit``, when its data is not larger than
    ``abssize`` bytes, or when the ratio of data size to
    ``width * height * colorspace`` is not larger than ``relsize``.
    Kept images are written as ``img<xref>.<ext>`` (xref zero-padded to five
    digits) inside ``output_folder``.

    :param pdf_path: path of the PDF file to read.
    :param output_folder: directory receiving the image files; it is not
        created here, so it must already exist.
    :return: None
    """
    # A set gives O(1) membership tests for the "already handled" check.
    seen_xrefs = set()

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for pno in range(doc.page_count):
            for img in doc.get_page_images(pno):
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                # Mark immediately: the filters below give the same verdict
                # for the same xref, so re-evaluating it on a later page
                # would only repeat the work.
                seen_xrefs.add(xref)

                width = img[2]
                height = img[3]
                if min(width, height) <= dimlimit:
                    continue

                image = recoverpix(doc, img)
                n = image["colorspace"]
                imgdata = image["image"]

                if len(imgdata) <= abssize:
                    continue
                # Same test as "len / (width * height * n) <= relsize", but
                # written as a product so that a zero component count cannot
                # raise ZeroDivisionError.
                if len(imgdata) <= relsize * width * height * n:
                    continue

                imgfile = os.path.join(
                    output_folder, "img%05i.%s" % (xref, image["ext"])
                )
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
# When executed as a script, extract the images of the sample PDF that sits
# one directory above the working directory.
if __name__ == "__main__":
    read_pdf(pdf_path="../1.pdf", output_folder="../target_img")

View File

@ -1,6 +1,6 @@
from datetime import datetime
from extract_from_pages import read_pdf
from base import Base
import subprocess
from PIL import Image
import pandas as pd
import pytesseract
@ -29,15 +29,14 @@ class PDFBase(Base):
:return:
"""
try:
subprocess.run(f"pdfimages -png '{input_pdf}' {output_image}/image", shell=True,
capture_output=True, text=True)
read_pdf(input_pdf, output_image)
except Exception as e:
self.log(f"出现异常:{e}")
@staticmethod
def read_img_ocr(img_path, standard=205):
def read_img_ocr_binarization(img_path, standard=205):
"""
读取图片中文字内容
二值化读取图片中文字内容
:param img_path:
:return:
"""
@ -56,6 +55,19 @@ class PDFBase(Base):
lines = result.split()
return lines
@staticmethod
def read_img_ocr(img_path):
    """
    Run OCR on an image file and return the recognised text as tokens.

    The image is passed to Tesseract unmodified, using the simplified
    Chinese and English language data.

    :param img_path: path of the image file to read.
    :return: list of whitespace-separated tokens of the recognised text
        (``str.split()`` is applied, so tokens are words, not whole lines).
    """
    # Open through a context manager so the underlying file handle is
    # released as soon as recognition has finished.
    with Image.open(img_path) as img:
        # Image recognition
        result = pytesseract.image_to_string(
            img, config=r'--oem 3 --psm 6 -l chi_sim+eng'
        )
    lines = result.split()
    return lines
def remove_img(self, img_path):
"""
删除当前文件夹下所有的图片

View File

@ -25,23 +25,23 @@ class TestingAgencyReport(PDFBase):
}
try:
lines = self.read_img_ocr(entry.path)
lines = self.read_img_ocr_binarization(entry.path)
valid_time_list = []
for line in lines:
if 'S$T' in line or 'SST' in line:
text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
if 'CNAS' in line:
text_dict['标志'] = 'cnas中文,'
if '200015344424' in line:
text_dict['标志'] = '国cma,'
valid_time = self.is_valid_time(line)
if valid_time:
valid_time_list.append(valid_time)
if valid_time_list:
text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
lines = self.read_img_ocr(entry.path)
for line in lines:
if '200015344424' in line:
text_dict['标志'] = '国cma,'
except cv2.error as c:
self.log(c)
@ -150,4 +150,4 @@ class TestingAgencyReport(PDFBase):
if __name__ == '__main__':
testing_agency_report = TestingAgencyReport()
testing_agency_report.run('../file_test', '../target_img', '../docs')
testing_agency_report.run('../file', '../target_img', '../docs')