mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 10:25:01 +08:00
pdf处理
This commit is contained in:
parent
c1afbed128
commit
5b0e238840
@ -1,32 +1,47 @@
|
|||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
class Discern(object):
|
class Discern(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.xlsx_table = {}
|
self.xlsx_keys = {}
|
||||||
|
|
||||||
def pdf_text(self, path_pdf):
|
def pdf_text(self, pdf_path):
|
||||||
with pdfplumber.open(path_pdf) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
# 遍历每个页面
|
# 遍历每个页面
|
||||||
for page in pdf.pages:
|
for page in pdf.pages:
|
||||||
# 提取页面文本
|
# 提取页面文本
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
print(lines)
|
line_str = ''
|
||||||
for line in lines:
|
for line in lines:
|
||||||
# print(line)
|
line_str += line
|
||||||
if '报告编号:' in line:
|
if '报告编号:' in line:
|
||||||
self.xlsx_table['报告编号'] = line.split("报告编号")[1].strip()
|
self.xlsx_keys['报告编号'] = line.split("报告编号")[1].strip()
|
||||||
if '样品名称:' in line:
|
if '样品名称:' in line:
|
||||||
self.xlsx_table['样品名称'] = line.split("样品名称")[1].strip()
|
self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip()
|
||||||
if '公司' in line and '中检华通' not in line:
|
if '公司' in line and '中检华通' not in line and '制造商' not in line:
|
||||||
print(line)
|
self.xlsx_keys['公司名称'] = line.strip()
|
||||||
self.xlsx_table['公司名称'] = line.strip()
|
if '最终报告' in line:
|
||||||
print(self.xlsx_table)
|
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
|
||||||
|
|
||||||
|
def pdf_images(self, pdf_path):
|
||||||
|
i = 0
|
||||||
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
|
for page in pdf.pages:
|
||||||
|
images = page.images
|
||||||
|
for img in images:
|
||||||
|
# 获取图片的二进制流
|
||||||
|
i += 1
|
||||||
|
image_file = f"../img/image_{i}.png"
|
||||||
|
with open(image_file, "wb") as f:
|
||||||
|
f.write(img['stream'].get_data())
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.pdf_text('../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验(中文).pdf')
|
self.pdf_text('../docs/2.pdf')
|
||||||
|
self.pdf_images('../docs/2.pdf')
|
||||||
|
print(self.xlsx_keys)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user