js_reverse/pdf处理/program/testing_agency_report.py
2023-08-05 10:41:28 +08:00

159 lines
6.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from program.pdf_base import PDFBase
import pdfplumber
import cv2
import os
import re
class TestingAgencyReport(PDFBase):
    """Collect summary fields from testing-agency report PDFs into spreadsheet rows.

    For every PDF in a folder, one record (a dict keyed by the column names in
    ``_RECORD_FIELDS``) is filled from three sources: the text of the first
    page, OCR of the images produced for the PDF, and the text of the
    remaining pages. The records are accumulated in ``xlsx_keys_list`` and
    handed to ``export_excel``.

    The helpers ``download_img``, ``remove_img``, ``read_img_ocr``,
    ``read_img_ocr_binarization``, ``is_valid_time``, ``log`` and
    ``export_excel`` are not defined in this class; presumably they come from
    ``PDFBase`` (verify there).
    """

    # Column names of one exported record, in insertion order.
    _RECORD_FIELDS = (
        '登记日期',
        '方案编号',
        '样品编号',
        '报告编号',
        '样品名称',
        '检测项目',
        '标志',
        '签发日期',
        '公司名称',
        '文件名',
    )

    # ASCII and full-width colon. Written with an escape so the two cannot be
    # confused with each other when reading or copying the source.
    _COLONS = (':', '\uff1a')

    # The issuing agency's own name (full-width and ASCII parentheses); it is
    # removed from the company field so that only other companies remain.
    _OWN_AGENCY_NAMES = (
        '中检华通威国际检验\uff08苏州\uff09有限公司',
        '中检华通威国际检验(苏州)有限公司',
    )

    # Number printed on the page -> flag recorded for it. Order matters: when
    # several numbers match, the last match wins (see pdf_images).
    _CMA_FLAGS = (
        ('200015344424', '国cma,'),
        ('191020340175', '省cma,'),
    )

    def __init__(self):
        super().__init__()
        # Record currently being filled; discern() replaces it per PDF.
        self.xlsx_keys = {}
        # All finished records, in processing order.
        self.xlsx_keys_list = []
        self.num = 0

    @classmethod
    def _strip_label(cls, text, label):
        """Return *text* without ``label`` + colon (either width, optional trailing space)."""
        for colon in cls._COLONS:
            text = text.replace(label + colon + ' ', '').replace(label + colon, '')
        return text

    def _add_flag(self, flag):
        """Append *flag* to the '标志' field unless it is already recorded there."""
        current = self.xlsx_keys.get('标志', '')
        if flag not in current:
            self.xlsx_keys['标志'] = current + flag

    def pdf_images(self, pdf_path, img_path):
        """Fill plan number, issue date and flags from OCR of the PDF's images.

        Calls ``download_img(pdf_path, img_path)`` and then processes every
        regular file found in ``img_path``.

        :param pdf_path: path of the PDF being processed.
        :param img_path: directory that holds the images for this PDF.
        """
        self.download_img(pdf_path, img_path)
        with os.scandir(img_path) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                plan_no = ''
                issue_date = ''
                flag = ''
                try:
                    dates = []
                    for line in self.read_img_ocr_binarization(entry.path):
                        if 'S$T' in line or 'SST' in line:
                            # OCR sometimes reads the middle "S" as "$".
                            cleaned = line.replace('S$T', 'SST')
                            cleaned = self._strip_label(cleaned, '试验方案编号')
                            plan_no = cleaned.replace('$', '').strip()
                        if 'CNAS' in line:
                            flag = 'cnas中文,'
                        # is_valid_time is used as: falsy for "no date",
                        # otherwise a comparable value with strftime().
                        parsed = self.is_valid_time(line)
                        if parsed:
                            dates.append(parsed)
                    if dates:
                        issue_date = max(dates).strftime("%Y-%m-%d")
                    for line in self.read_img_ocr(entry.path):
                        for number, cma_flag in self._CMA_FLAGS:
                            if number in line:
                                # NOTE(review): this replaces a CNAS flag found
                                # above for the same image; kept as in the
                                # original - confirm whether both are wanted.
                                flag = cma_flag
                except cv2.error as exc:
                    # Best effort: an unreadable image is logged and whatever
                    # was extracted before the failure is still used.
                    self.log(exc)
                if flag:
                    self._add_flag(flag)
                if plan_no:
                    self.xlsx_keys['方案编号'] = plan_no
                if issue_date:
                    self.xlsx_keys['签发日期'] = issue_date

    def pdf_text(self, pdf_path):
        """Fill report number, sample name, test item and company from page one.

        A PDF without pages, or whose first page yields no text, leaves the
        record unchanged apart from the company field being normalised.

        :param pdf_path: path of the PDF being processed.
        """
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                self.log(f'no pages in {pdf_path}')
                return
            # extract_text() may yield None/'' for pages without a text layer.
            text = pdf.pages[0].extract_text() or ''

        seen_lines = []
        company_parts = []
        for line in text.split("\n"):
            seen_lines.append(line)
            if 'CSTBB' in line:
                for token in line.split():
                    if 'CSTBB' in token:
                        self.xlsx_keys['报告编号'] = self._strip_label(token.strip(), '报告编号')
            if '样品名称' in line:
                self.xlsx_keys['样品名称'] = self._strip_label(line.strip(), '样品名称')
            if 'Article Name:' in line:
                # The English label, when present, overrides the Chinese one.
                self.xlsx_keys['样品名称'] = line.split('Article Name:', 1)[1].strip()
            if '公司' in line:
                company_parts.append(line.strip())
            if '最终报告' in line or 'Final Report' in line:
                # The test item is everything read so far (lines joined
                # without separator) minus the fixed header phrases.
                self.xlsx_keys['检测项目'] = (
                    ''.join(seen_lines)
                    .replace('最终报告', '')
                    .replace('Final Report', '')
                    .replace('中国认可国际互认检测', '')
                )

        # Drop all whitespace from the collected company lines, then remove
        # the issuing agency's own name.
        company = self.xlsx_keys.get('公司名称', '') + ''.join(''.join(company_parts).split())
        for own_name in self._OWN_AGENCY_NAMES:
            company = company.replace(own_name, '')
        self.xlsx_keys['公司名称'] = company

    def pdf_all_text(self, pdf_path):
        """Fill plan number, issue date and the GLP flag from pages after the first.

        Plan number and the labelled issue date are only taken when the record
        does not have them yet. Independently of that, for each page on which
        ``is_valid_time`` recognises at least one token, the issue date is
        overwritten with the latest date of that page.

        :param pdf_path: path of the PDF being processed.
        """
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages[1:]:
                # extract_text() may yield None/'' for pages without a text layer.
                text = page.extract_text() or ''
                dates = []
                for token in text.split():
                    if 'SST' in token and not self.xlsx_keys.get('方案编号'):
                        self.xlsx_keys['方案编号'] = token
                    if '签发日期' in token and not self.xlsx_keys.get('签发日期'):
                        # Keep only the value: drop the label and the colon
                        # (either width) that separates it from the date.
                        value = token.replace('签发日期', '')
                        self.xlsx_keys['签发日期'] = value.strip(''.join(self._COLONS))
                    if 'GLP' in token:
                        # Recorded once, however often "GLP" occurs.
                        self._add_flag('GLP,')
                    parsed = self.is_valid_time(token)
                    if parsed:
                        dates.append(parsed)
                if dates:
                    self.xlsx_keys['签发日期'] = max(dates).strftime("%Y-%m-%d")

    def discern(self, pdf_path, img_path, excel_path):
        """Process every file in ``pdf_path`` and export the collected records.

        For each directory entry a fresh record is started and
        ``remove_img(img_path)`` is called; regular files are then run through
        ``pdf_text``, ``pdf_images`` and ``pdf_all_text``. If no plan number
        was found, one matching ``SST<digits>BB`` is taken from the file name.
        After the loop, all records gathered on this instance are passed to
        ``export_excel`` together with ``excel_path``.

        :param pdf_path: directory containing the PDFs.
        :param img_path: working directory for the per-PDF images.
        :param excel_path: destination passed through to ``export_excel``.
        """
        with os.scandir(pdf_path) as entries:
            for entry in entries:
                self.xlsx_keys = dict.fromkeys(self._RECORD_FIELDS, '')
                self.remove_img(img_path)
                if not entry.is_file():
                    continue
                file_name = entry.name
                self.log(file_name)
                self.xlsx_keys['文件名'] = file_name
                self.pdf_text(entry.path)
                self.pdf_images(entry.path, img_path)
                self.pdf_all_text(entry.path)
                if not self.xlsx_keys['方案编号']:
                    # Fall back to the plan number embedded in the file name.
                    match = re.search(r'SST\d+BB', file_name)
                    if match:
                        self.xlsx_keys['方案编号'] = match.group(0)
                    else:
                        self.log("未找到匹配的模式方案编号")
                self.xlsx_keys_list.append(self.xlsx_keys)
        self.export_excel(self.xlsx_keys_list, excel_path)

    def run(self, pdf_path, img_path, excel_path):
        """Entry point: delegate to :meth:`discern` with the same arguments."""
        self.discern(pdf_path, img_path, excel_path)
if __name__ == '__main__':
    # Script entry point: process the sample folders, given as paths relative
    # to the current working directory.
    testing_agency_report = TestingAgencyReport()
    testing_agency_report.run(
        pdf_path='../file_test',
        img_path='../target_img',
        excel_path='../docs',
    )