"""Extract report fields and embedded images from a PDF with pdfplumber."""
import os

import pdfplumber
from PIL import Image


class Discern(object):
    """Pull selected text fields and embedded images out of a PDF report.

    Text fields found by :meth:`pdf_text` accumulate in ``self.xlsx_keys``;
    :meth:`pdf_images` dumps every embedded image stream to disk.
    """

    # (label searched for in a line, key stored in ``xlsx_keys``).
    # The value is whatever follows the label on the same line.
    _LABELLED_FIELDS = (
        ('报告编号:', '报告编号'),
        ('样品名称:', '样品名称'),
    )

    def __init__(self):
        # Field name -> value extracted from the PDF text.
        self.xlsx_keys = {}

    def pdf_text(self, pdf_path):
        """Scan every page of *pdf_path* and record the recognised fields.

        Args:
            pdf_path: Path of the PDF file to read.
        """
        with pdfplumber.open(pdf_path) as pdf:
            # Walk through every page.
            for page in pdf.pages:
                # Fall back to an empty string so a page that yields no
                # text does not break the split below.
                text = page.extract_text() or ''
                self._parse_lines(text.split("\n"))

    def _parse_lines(self, lines):
        """Update ``self.xlsx_keys`` from the text lines of a single page."""
        seen = []
        for line in lines:
            seen.append(line)

            for label, key in self._LABELLED_FIELDS:
                if label in line:
                    # Split on the exact label that was matched (colon
                    # included) so the colon does not leak into the value.
                    self.xlsx_keys[key] = line.partition(label)[2].strip()

            if '公司' in line and '中检华通' not in line and '制造商' not in line:
                self.xlsx_keys['公司名称'] = line.strip()

            if '最终报告' in line:
                # Everything on this page up to and including the marker
                # line, concatenated, with the marker text itself removed.
                self.xlsx_keys['检测项目'] = ''.join(seen).replace('最终报告', '')

    def pdf_images(self, pdf_path, out_dir='../img'):
        """Write every embedded image of *pdf_path* into *out_dir*.

        Files are named ``image_<n>.png`` with ``n`` counting from 1 across
        all pages.

        Args:
            pdf_path: Path of the PDF file to read.
            out_dir: Directory receiving the image files; created if missing.
        """
        os.makedirs(out_dir, exist_ok=True)
        count = 0
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                for img in page.images:
                    count += 1
                    image_file = f"{out_dir}/image_{count}.png"
                    # NOTE(review): the stream data is written as-is under a
                    # .png name; whether it is actually PNG-encoded depends
                    # on how the image is stored in the PDF -- confirm.
                    with open(image_file, "wb") as f:
                        f.write(img['stream'].get_data())

    def run(self, pdf_path='../docs/2.pdf'):
        """Extract text fields and images from *pdf_path*, then print the fields.

        Args:
            pdf_path: Path of the PDF file to process.
        """
        self.pdf_text(pdf_path)
        self.pdf_images(pdf_path)
        print(self.xlsx_keys)


if __name__ == '__main__':
    # NOTE(review): the body of this guard was not visible in the reviewed
    # text; running the extractor is assumed -- confirm.
    Discern().run()