mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-23 01:14:24 +08:00
提取pdf文件中图片和文字
This commit is contained in:
parent
bc6a9102a5
commit
2dc7cb4a6d
88
pdf处理/program/extract_from_pages.py
Normal file
88
pdf处理/program/extract_from_pages.py
Normal file
@ -0,0 +1,88 @@
|
||||
import os
|
||||
import fitz
|
||||
|
||||
# Filter thresholds applied to every candidate image. All are 0 here, with the
# value noted in the original trailing comment shown for reference.

# Minimum number of pixels per image edge; images whose shorter edge is at or
# below this limit are skipped (commented value: 100).
dimlimit = 0

# Minimum ratio between the stored image size and width * height * components;
# images at or below this ratio are skipped (commented value: 0.05, i.e. 5%).
relsize = 0

# Minimum absolute image size in bytes; images at or below this size are
# skipped (commented value: 2048, i.e. 2 KB).
abssize = 0
|
||||
|
||||
|
||||
def recoverpix(doc, item):
    """
    Build the image dictionary for one PDF image entry.

    :param doc: the opened PDF document; it must provide ``extract_image``
        and ``xref_object``.
    :param item: an image entry whose first element is the image xref and
        whose second element is the xref of its /SMask (a value > 0 means a
        mask exists).
    :return: a dict. On the mask and /ColorSpace paths it contains the keys
        ``"ext"``, ``"colorspace"`` and ``"image"``; otherwise it is the
        unmodified result of ``doc.extract_image(xref)``.
    """
    xref = item[0]  # xref of the PDF image
    smask = item[1]  # xref of its /SMask

    # Special case: an /SMask or /Mask exists.
    if smask > 0:
        # Extract the base image once; the bytes are reused for the fallback.
        base_bytes = doc.extract_image(xref)["image"]
        pix0 = fitz.Pixmap(base_bytes)
        if pix0.alpha:  # irregular situation: base image already has alpha
            pix0 = fitz.Pixmap(pix0, 0)  # drop the alpha channel
        mask = fitz.Pixmap(doc.extract_image(smask)["image"])

        try:
            pix = fitz.Pixmap(pix0, mask)
        except Exception:
            # Best-effort fallback to the original base image if combining
            # fails. Only ``Exception`` is caught so that KeyboardInterrupt
            # and SystemExit still propagate.
            pix = fitz.Pixmap(base_bytes)

        # More than 3 components cannot be written as PNG here; use PAM.
        ext = "pam" if pix0.n > 3 else "png"

        return {  # dictionary in the shape the caller expects
            "ext": ext,
            "colorspace": pix.colorspace.n,
            "image": pix.tobytes(ext),
        }

    # Special case: a /ColorSpace definition exists.
    # To be safe, these images are converted to RGB PNG.
    if "/ColorSpace" in doc.xref_object(xref, compressed=True):
        pix = fitz.Pixmap(doc, xref)
        pix = fitz.Pixmap(fitz.csRGB, pix)
        return {  # dictionary in the shape the caller expects
            "ext": "png",
            "colorspace": 3,
            "image": pix.tobytes("png"),
        }

    # Plain image: hand back the document's own extraction result.
    return doc.extract_image(xref)
|
||||
|
||||
|
||||
def read_pdf(pdf_path, output_folder):
    """
    Extract the images referenced by the pages of a PDF into a folder.

    Every image xref is handled at most once. An image is skipped when its
    shorter edge is not larger than ``dimlimit``, when its data is not larger
    than ``abssize`` bytes, or when the ratio of its data size to
    ``width * height * components`` is not larger than ``relsize``.
    Saved files are named ``img<xref, 5 digits>.<ext>``.

    :param pdf_path: path of the PDF file to read.
    :param output_folder: folder the image files are written to; it is
        created if it does not exist yet.
    :return: None
    """
    # Create the target folder up front so the first write cannot fail on a
    # missing directory. An empty string means the current directory.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    doc = fitz.open(pdf_path)
    try:
        # A set gives O(1) membership tests. An xref is recorded as soon as
        # it is first seen, so an image rejected by a filter is not
        # re-extracted on every later page that references it.
        seen_xrefs = set()
        for pno in range(doc.page_count):
            for img in doc.get_page_images(pno):
                xref = img[0]
                if xref in seen_xrefs:
                    continue
                seen_xrefs.add(xref)

                width = img[2]
                height = img[3]
                if min(width, height) <= dimlimit:
                    continue

                image = recoverpix(doc, img)
                n = image["colorspace"]
                imgdata = image["image"]

                if len(imgdata) <= abssize:
                    continue

                # NOTE(review): a component count of 0 would make the ratio
                # undefined (ZeroDivisionError); such images bypass the
                # relative-size filter and are kept - confirm this is wanted.
                raw_size = width * height * n
                if raw_size and len(imgdata) / raw_size <= relsize:
                    continue

                imgfile = os.path.join(output_folder, "img%05i.%s" % (xref, image["ext"]))
                with open(imgfile, "wb") as fout:
                    fout.write(imgdata)
    finally:
        # Always release the document, even if extraction fails midway.
        doc.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: extract the images of the sample PDF in the parent
    # directory into the sibling target folder.
    read_pdf(pdf_path='../1.pdf', output_folder='../target_img')
|
@ -1,6 +1,6 @@
|
||||
from datetime import datetime
|
||||
from extract_from_pages import read_pdf
|
||||
from base import Base
|
||||
import subprocess
|
||||
from PIL import Image
|
||||
import pandas as pd
|
||||
import pytesseract
|
||||
@ -29,15 +29,14 @@ class PDFBase(Base):
|
||||
:return:
|
||||
"""
|
||||
try:
|
||||
subprocess.run(f"pdfimages -png '{input_pdf}' {output_image}/image", shell=True,
|
||||
capture_output=True, text=True)
|
||||
read_pdf(input_pdf, output_image)
|
||||
except Exception as e:
|
||||
self.log(f"出现异常:{e}")
|
||||
|
||||
@staticmethod
|
||||
def read_img_ocr(img_path, standard=205):
|
||||
def read_img_ocr_binarization(img_path, standard=205):
|
||||
"""
|
||||
读取图片中文字内容
|
||||
二值化读取图片中文字内容
|
||||
:param img_path:
|
||||
:return:
|
||||
"""
|
||||
@ -56,6 +55,19 @@ class PDFBase(Base):
|
||||
lines = result.split()
|
||||
return lines
|
||||
|
||||
@staticmethod
|
||||
def read_img_ocr(img_path):
    """
    Read the text content of an image with OCR.

    The opened image is passed directly to ``pytesseract.image_to_string``
    with the ``chi_sim+eng`` language setting.

    :param img_path: path of the image file to recognise.
    :return: the recognised text split on whitespace, as a list of strings.
    """
    # Use a context manager so the image file handle is released as soon as
    # recognition has finished.
    with Image.open(img_path) as img:
        # Run the recognition.
        result = pytesseract.image_to_string(img, config=r'--oem 3 --psm 6 -l chi_sim+eng')
    return result.split()
|
||||
|
||||
def remove_img(self, img_path):
|
||||
"""
|
||||
删除当前文件夹下所有的图片
|
||||
|
@ -25,23 +25,23 @@ class TestingAgencyReport(PDFBase):
|
||||
}
|
||||
|
||||
try:
|
||||
lines = self.read_img_ocr(entry.path)
|
||||
lines = self.read_img_ocr_binarization(entry.path)
|
||||
valid_time_list = []
|
||||
for line in lines:
|
||||
if 'S$T' in line or 'SST' in line:
|
||||
text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
|
||||
|
||||
if 'CNAS' in line:
|
||||
text_dict['标志'] = 'cnas中文,'
|
||||
|
||||
if '200015344424' in line:
|
||||
text_dict['标志'] = '国cma,'
|
||||
|
||||
valid_time = self.is_valid_time(line)
|
||||
if valid_time:
|
||||
valid_time_list.append(valid_time)
|
||||
if valid_time_list:
|
||||
text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||||
|
||||
lines = self.read_img_ocr(entry.path)
|
||||
for line in lines:
|
||||
if '200015344424' in line:
|
||||
text_dict['标志'] = '国cma,'
|
||||
except cv2.error as c:
|
||||
self.log(c)
|
||||
|
||||
@ -150,4 +150,4 @@ class TestingAgencyReport(PDFBase):
|
||||
|
||||
if __name__ == '__main__':
|
||||
testing_agency_report = TestingAgencyReport()
|
||||
testing_agency_report.run('../file_test', '../target_img', '../docs')
|
||||
testing_agency_report.run('../file', '../target_img', '../docs')
|
||||
|
Loading…
x
Reference in New Issue
Block a user