js_reverse/pdf处理/program/extract_from_pages.py
2023-08-02 13:00:24 +08:00

89 lines
2.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import fitz
dimlimit = 0 # 100 # 每个图像边缘的最小像素数限制
relsize = 0 # 0.05 # 图像图像尺寸比必须大于此值5%
abssize = 0 # 2048 # 图像绝对大小限制 2 KB如果小于此值则忽略
def recoverpix(doc, item):
'''
恢复像素
:param doc:
:param item:
:return:
'''
xref = item[0] # PDF 图像的 xref
smask = item[1] # 其 /SMask 的 xref
# 特殊情况:存在 /SMask 或 /Mask
if smask > 0:
pix0 = fitz.Pixmap(doc.extract_image(xref)["image"])
if pix0.alpha: # 捕获异常情况
pix0 = fitz.Pixmap(pix0, 0) # 删除 alpha 通道
mask = fitz.Pixmap(doc.extract_image(smask)["image"])
try:
pix = fitz.Pixmap(pix0, mask)
except: # 如果有问题,回退到原始基本图像
pix = fitz.Pixmap(doc.extract_image(xref)["image"])
if pix0.n > 3:
ext = "pam"
else:
ext = "png"
return { # 创建预期的字典
"ext": ext,
"colorspace": pix.colorspace.n,
"image": pix.tobytes(ext),
}
# 特殊情况:存在 /ColorSpace 定义
# 为确保安全,我们将这些情况转换为 RGB PNG 图像
if "/ColorSpace" in doc.xref_object(xref, compressed=True):
pix = fitz.Pixmap(doc, xref)
pix = fitz.Pixmap(fitz.csRGB, pix)
return { # 创建预期的字典
"ext": "png",
"colorspace": 3,
"image": pix.tobytes("png"),
}
return doc.extract_image(xref)
def read_pdf(pdf_path, output_folder):
doc = fitz.open(pdf_path)
page_count = doc.page_count
xreflist = []
imglist = []
for pno in range(page_count):
il = doc.get_page_images(pno)
imglist.extend([x[0] for x in il])
for img in il:
xref = img[0]
if xref in xreflist:
continue
width = img[2]
height = img[3]
if min(width, height) <= dimlimit:
continue
image = recoverpix(doc, img)
n = image["colorspace"]
imgdata = image["image"]
if len(imgdata) <= abssize:
continue
if len(imgdata) / (width * height * n) <= relsize:
continue
imgfile = os.path.join(output_folder, "img%05i.%s" % (xref, image["ext"]))
fout = open(imgfile, "wb")
fout.write(imgdata)
fout.close()
xreflist.append(xref)
if __name__ == '__main__':
read_pdf('../1.pdf', '../target_img')