pdf处理

This commit is contained in:
luzhisheng 2023-07-18 02:09:24 +08:00
parent c1afbed128
commit 5b0e238840

View File

@ -1,32 +1,47 @@
import os

import pdfplumber
from PIL import Image
class Discern(object):
    """Pull report fields and embedded images out of a PDF report.

    Text fields recognised by :meth:`pdf_text` are collected in
    ``self.xlsx_keys``; :meth:`pdf_images` dumps every embedded image
    stream to disk.
    """

    def __init__(self):
        # Maps a field name (e.g. '报告编号') to the value found in the PDF.
        self.xlsx_keys = {}

    def _parse_lines(self, lines):
        """Scan the text lines of one page and record recognised fields.

        Args:
            lines: iterable of strings, one per text line of the page.

        Later matches overwrite earlier ones for the same field.
        """
        seen = []
        for line in lines:
            seen.append(line)
            for label in ('报告编号', '样品名称'):
                marker = label + ':'
                if marker in line:
                    # Cut after the label *including* its colon; splitting
                    # on the bare label left a leading ':' in the value.
                    self.xlsx_keys[label] = line.partition(marker)[2].strip()
            # A line mentioning '公司' is taken as the company name unless it
            # also mentions '中检华通' or '制造商'.
            if '公司' in line and '中检华通' not in line and '制造商' not in line:
                self.xlsx_keys['公司名称'] = line.strip()
            if '最终报告' in line:
                # Everything on the page up to and including this line,
                # concatenated without separators, minus the marker itself.
                self.xlsx_keys['检测项目'] = ''.join(seen).replace('最终报告', '')

    def pdf_text(self, pdf_path):
        """Extract the text of every page of ``pdf_path`` and parse its fields.

        Args:
            pdf_path: path of the PDF file to read.

        Results are stored in ``self.xlsx_keys``.
        """
        with pdfplumber.open(pdf_path) as pdf:
            # Walk every page of the document.
            for page in pdf.pages:
                # A page without a text layer may yield None or '' (depends
                # on the pdfplumber version); treat both as "no lines".
                text = page.extract_text() or ''
                self._parse_lines(text.split("\n"))

    def pdf_images(self, pdf_path, output_dir='../img'):
        """Write every embedded image stream of ``pdf_path`` to ``output_dir``.

        Files are named ``image_1.png``, ``image_2.png``, ... in the order
        the images are encountered across pages.

        Args:
            pdf_path: path of the PDF file to read.
            output_dir: directory receiving the image files; created if it
                does not exist yet.
        """
        # Without this, open(..., "wb") fails when the directory is missing.
        os.makedirs(output_dir, exist_ok=True)
        count = 0
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                for img in page.images:
                    count += 1
                    image_file = os.path.join(output_dir, f"image_{count}.png")
                    # NOTE(review): the bytes written are the stream data as
                    # returned by get_data(); presumably not always PNG-encoded
                    # despite the extension - confirm against real reports.
                    with open(image_file, "wb") as f:
                        f.write(img['stream'].get_data())

    def run(self, pdf_path='../docs/2.pdf'):
        """Parse text fields and export images for ``pdf_path``, then print the fields."""
        self.pdf_text(pdf_path)
        self.pdf_images(pdf_path)
        print(self.xlsx_keys)
if __name__ == '__main__': if __name__ == '__main__':