mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-21 00:25:09 +08:00
pdf 处理
This commit is contained in:
parent
b8f68e3a45
commit
c0cdf2f8b5
@ -1,6 +1,7 @@
|
|||||||
import image_compare
|
import image_compare
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
import PyPDF2
|
||||||
from image_text_ocr import ImageTextOcr
|
from image_text_ocr import ImageTextOcr
|
||||||
import os
|
import os
|
||||||
import cv2
|
import cv2
|
||||||
@ -51,15 +52,18 @@ class Discern(object):
|
|||||||
|
|
||||||
def pdf_images(self, pdf_path):
|
def pdf_images(self, pdf_path):
|
||||||
self.num = 0
|
self.num = 0
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
||||||
for page in pdf.pages:
|
for page_num in range(len(pdf_reader.pages)):
|
||||||
images = page.images
|
page = pdf_reader.pages[page_num]
|
||||||
for img in images:
|
xObject = page['/Resources']['/XObject'].get_object()
|
||||||
# 获取图片的二进制流
|
|
||||||
|
for obj in xObject:
|
||||||
|
if xObject[obj]['/Subtype'] == '/Image':
|
||||||
|
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
|
||||||
self.num += 1
|
self.num += 1
|
||||||
image_file = f"../target_img/image_{self.num}.png"
|
image_file = f"../target_img/image_{self.num}.png"
|
||||||
with open(image_file, "wb") as f:
|
with open(image_file, "wb") as f:
|
||||||
f.write(img['stream'].get_data())
|
f.write(xObject[obj].get_data())
|
||||||
|
|
||||||
def get_images_text(self):
|
def get_images_text(self):
|
||||||
for i in range(1, self.num + 1):
|
for i in range(1, self.num + 1):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user