mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-19 02:39:42 +08:00
159 lines
6.8 KiB
Python
159 lines
6.8 KiB
Python
from program.pdf_base import PDFBase
|
||
import pdfplumber
|
||
import cv2
|
||
import os
|
||
import re
|
||
|
||
|
||
class TestingAgencyReport(PDFBase):
|
||
|
||
def __init__(self):
|
||
super(TestingAgencyReport, self).__init__()
|
||
self.xlsx_keys = {}
|
||
self.xlsx_keys_list = []
|
||
self.num = 0
|
||
|
||
def pdf_images(self, pdf_path, img_path):
|
||
self.download_img(pdf_path, img_path)
|
||
with os.scandir(img_path) as entries:
|
||
for entry in entries:
|
||
if entry.is_file():
|
||
text_dict = {
|
||
'方案编号': '',
|
||
'签发日期': '',
|
||
'标志': ''
|
||
}
|
||
|
||
try:
|
||
lines = self.read_img_ocr_binarization(entry.path)
|
||
valid_time_list = []
|
||
for line in lines:
|
||
if 'S$T' in line or 'SST' in line:
|
||
text_dict['方案编号'] = line.replace('S$T', 'SST')\
|
||
.replace('试验方案编号:', '').replace('$', '')
|
||
if 'CNAS' in line:
|
||
text_dict['标志'] = 'cnas中文,'
|
||
valid_time = self.is_valid_time(line)
|
||
if valid_time:
|
||
valid_time_list.append(valid_time)
|
||
if valid_time_list:
|
||
text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||
|
||
lines = self.read_img_ocr(entry.path)
|
||
for line in lines:
|
||
if '200015344424' in line:
|
||
text_dict['标志'] = '国cma,'
|
||
if '191020340175' in line:
|
||
text_dict['标志'] = '省cma,'
|
||
except cv2.error as c:
|
||
self.log(c)
|
||
|
||
if text_dict.get('标志'):
|
||
self.xlsx_keys['标志'] += text_dict.get('标志')
|
||
if text_dict.get('方案编号'):
|
||
self.xlsx_keys['方案编号'] = text_dict.get('方案编号')
|
||
if text_dict.get('签发日期'):
|
||
self.xlsx_keys['签发日期'] = text_dict.get('签发日期')
|
||
|
||
def pdf_text(self, pdf_path):
|
||
with pdfplumber.open(pdf_path) as pdf:
|
||
page = pdf.pages[0]
|
||
# 提取页面文本
|
||
text = page.extract_text()
|
||
lines = text.split("\n")
|
||
line_str = ''
|
||
company = ''
|
||
for line in lines:
|
||
line_str += line
|
||
if 'CSTBB' in line:
|
||
for li in line.split():
|
||
if 'CSTBB' in li:
|
||
self.xlsx_keys['报告编号'] = li.strip().replace('报告编号:', '').replace(')', '')\
|
||
.replace('报告编号:', '')
|
||
if '样品名称' in line:
|
||
try:
|
||
self.xlsx_keys['样品名称'] = line.strip().replace('样品名称: ', '').replace('样品名称:', '')
|
||
except Exception as e:
|
||
print(e)
|
||
self.xlsx_keys['样品名称'] = ''
|
||
if 'Article Name:' in line:
|
||
self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '')
|
||
if '公司' in line:
|
||
company += line.strip()
|
||
if '最终报告' in line or 'Final Report' in line:
|
||
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '') \
|
||
.replace('中国认可国际互认检测', '')
|
||
|
||
company_list = company.split()
|
||
for company_str in company_list:
|
||
self.xlsx_keys['公司名称'] += company_str
|
||
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
|
||
replace('中检华通威国际检验(苏州)有限公司', '')
|
||
|
||
def pdf_all_text(self, pdf_path):
|
||
with pdfplumber.open(pdf_path) as pdf:
|
||
for page in pdf.pages[1:]:
|
||
# 提取页面文本
|
||
text = page.extract_text()
|
||
lines = text.split()
|
||
valid_time_list = []
|
||
for line in lines:
|
||
|
||
if 'SST' in line and not self.xlsx_keys['方案编号']:
|
||
self.xlsx_keys['方案编号'] = line
|
||
|
||
if '签发日期' in line and not self.xlsx_keys['签发日期']:
|
||
self.xlsx_keys['签发日期'] = line.replace('签发日期', '').replace(':', '')
|
||
|
||
if 'GLP' in line:
|
||
self.xlsx_keys['标志'] += 'GLP,'
|
||
|
||
valid_time = self.is_valid_time(line)
|
||
if valid_time:
|
||
valid_time_list.append(valid_time)
|
||
if valid_time_list:
|
||
self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||
|
||
def discern(self, pdf_path, img_path, excel_path):
|
||
with os.scandir(pdf_path) as entries:
|
||
for entry in entries:
|
||
self.xlsx_keys = {
|
||
'登记日期': '',
|
||
'方案编号': '',
|
||
'样品编号': '',
|
||
'报告编号': '',
|
||
'样品名称': '',
|
||
'检测项目': '',
|
||
'标志': '',
|
||
'签发日期': '',
|
||
'公司名称': '',
|
||
'文件名': ''
|
||
}
|
||
self.remove_img(img_path)
|
||
if entry.is_file():
|
||
pdf_path = entry.path
|
||
file_name = entry.name
|
||
self.log(file_name)
|
||
self.xlsx_keys['文件名'] = file_name
|
||
self.pdf_text(pdf_path)
|
||
self.pdf_images(pdf_path, img_path)
|
||
|
||
self.pdf_all_text(pdf_path)
|
||
if not self.xlsx_keys['方案编号']:
|
||
matches = re.findall(r'SST\d+BB', file_name)
|
||
if matches:
|
||
self.xlsx_keys['方案编号'] = matches[0]
|
||
else:
|
||
self.log("未找到匹配的模式方案编号")
|
||
|
||
self.xlsx_keys_list.append(self.xlsx_keys)
|
||
self.export_excel(self.xlsx_keys_list, excel_path)
|
||
|
||
def run(self, pdf_path, img_path, excel_path):
|
||
self.discern(pdf_path, img_path, excel_path)
|
||
|
||
|
||
if __name__ == '__main__':
|
||
testing_agency_report = TestingAgencyReport()
|
||
testing_agency_report.run('../file_test', '../target_img', '../docs')
|