mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 10:25:01 +08:00
89 lines
2.6 KiB
Python
89 lines
2.6 KiB
Python
import os
|
||
import fitz
|
||
|
||
dimlimit = 0 # 100 # 每个图像边缘的最小像素数限制
|
||
relsize = 0 # 0.05 # 图像:图像尺寸比必须大于此值(5%)
|
||
abssize = 0 # 2048 # 图像绝对大小限制 2 KB:如果小于此值,则忽略
|
||
|
||
|
||
def recoverpix(doc, item):
|
||
'''
|
||
恢复像素
|
||
:param doc:
|
||
:param item:
|
||
:return:
|
||
'''
|
||
xref = item[0] # PDF 图像的 xref
|
||
smask = item[1] # 其 /SMask 的 xref
|
||
|
||
# 特殊情况:存在 /SMask 或 /Mask
|
||
if smask > 0:
|
||
pix0 = fitz.Pixmap(doc.extract_image(xref)["image"])
|
||
if pix0.alpha: # 捕获异常情况
|
||
pix0 = fitz.Pixmap(pix0, 0) # 删除 alpha 通道
|
||
mask = fitz.Pixmap(doc.extract_image(smask)["image"])
|
||
|
||
try:
|
||
pix = fitz.Pixmap(pix0, mask)
|
||
except: # 如果有问题,回退到原始基本图像
|
||
pix = fitz.Pixmap(doc.extract_image(xref)["image"])
|
||
|
||
if pix0.n > 3:
|
||
ext = "pam"
|
||
else:
|
||
ext = "png"
|
||
|
||
return { # 创建预期的字典
|
||
"ext": ext,
|
||
"colorspace": pix.colorspace.n,
|
||
"image": pix.tobytes(ext),
|
||
}
|
||
|
||
# 特殊情况:存在 /ColorSpace 定义
|
||
# 为确保安全,我们将这些情况转换为 RGB PNG 图像
|
||
if "/ColorSpace" in doc.xref_object(xref, compressed=True):
|
||
pix = fitz.Pixmap(doc, xref)
|
||
pix = fitz.Pixmap(fitz.csRGB, pix)
|
||
return { # 创建预期的字典
|
||
"ext": "png",
|
||
"colorspace": 3,
|
||
"image": pix.tobytes("png"),
|
||
}
|
||
return doc.extract_image(xref)
|
||
|
||
|
||
def read_pdf(pdf_path, output_folder):
|
||
doc = fitz.open(pdf_path)
|
||
page_count = doc.page_count
|
||
xreflist = []
|
||
imglist = []
|
||
for pno in range(page_count):
|
||
il = doc.get_page_images(pno)
|
||
imglist.extend([x[0] for x in il])
|
||
for img in il:
|
||
xref = img[0]
|
||
if xref in xreflist:
|
||
continue
|
||
width = img[2]
|
||
height = img[3]
|
||
if min(width, height) <= dimlimit:
|
||
continue
|
||
image = recoverpix(doc, img)
|
||
n = image["colorspace"]
|
||
imgdata = image["image"]
|
||
|
||
if len(imgdata) <= abssize:
|
||
continue
|
||
if len(imgdata) / (width * height * n) <= relsize:
|
||
continue
|
||
|
||
imgfile = os.path.join(output_folder, "img%05i.%s" % (xref, image["ext"]))
|
||
fout = open(imgfile, "wb")
|
||
fout.write(imgdata)
|
||
fout.close()
|
||
xreflist.append(xref)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
read_pdf('../1.pdf', '../target_img')
|