From c0cdf2f8b5c9aacf548e0c01498ffd71a8da0109 Mon Sep 17 00:00:00 2001 From: aiyingfeng Date: Mon, 24 Jul 2023 18:43:27 +0800 Subject: [PATCH] =?UTF-8?q?pdf=20=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pdf处理/program/discern.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pdf处理/program/discern.py b/pdf处理/program/discern.py index afaf469..c715e05 100644 --- a/pdf处理/program/discern.py +++ b/pdf处理/program/discern.py @@ -1,6 +1,7 @@ import image_compare import pandas as pd import pdfplumber +import PyPDF2 from image_text_ocr import ImageTextOcr import os import cv2 @@ -51,15 +52,18 @@ class Discern(object): def pdf_images(self, pdf_path): self.num = 0 - with pdfplumber.open(pdf_path) as pdf: - for page in pdf.pages: - images = page.images - for img in images: - # 获取图片的二进制流 + pdf_reader = PyPDF2.PdfReader(pdf_path) + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + xObject = page['/Resources']['/XObject'].get_object() + + for obj in xObject: + if xObject[obj]['/Subtype'] == '/Image': + size = (xObject[obj]['/Width'], xObject[obj]['/Height']) self.num += 1 image_file = f"../target_img/image_{self.num}.png" with open(image_file, "wb") as f: - f.write(img['stream'].get_data()) + f.write(xObject[obj].get_data()) def get_images_text(self): for i in range(1, self.num + 1):