"""Collect summary fields from testing-agency PDF reports.

Each regular file found in the input directory is read three ways (text of
the first page, OCR of the images produced by ``PDFBase.download_img``, text
of the remaining pages) and the resulting record is handed to
``PDFBase.export_excel``.
"""
import os
import re

import cv2
import pdfplumber

from program.pdf_base import PDFBase


class TestingAgencyReport(PDFBase):
    """Build one record (``self.xlsx_keys``) per report file.

    The helpers ``download_img``, ``remove_img``, ``read_img_ocr``,
    ``read_img_ocr_binarization``, ``is_valid_time``, ``log`` and
    ``export_excel`` are not defined here; they are expected to come from
    ``PDFBase``.
    """

    # Keys of one output record, in the order the record is created.
    _RECORD_FIELDS = (
        '登记日期', '方案编号', '样品编号', '报告编号', '样品名称',
        '检测项目', '标志', '签发日期', '公司名称', '文件名',
    )

    # Company name removed from the collected company text, in half-width
    # and full-width parenthesis spellings.
    # NOTE(review): presumably the testing laboratory's own name -- confirm.
    _OWN_COMPANY_NAMES = (
        '中检华通威国际检验(苏州)有限公司',
        '中检华通威国际检验(苏州)有限公司',
    )

    def __init__(self):
        super().__init__()
        self.xlsx_keys = {}       # record of the file currently processed
        self.xlsx_keys_list = []  # records of all processed files
        self.num = 0

    @staticmethod
    def _remove_all(text, *fragments):
        """Return *text* with every fragment removed, in the given order."""
        for fragment in fragments:
            text = text.replace(fragment, '')
        return text

    def _add_flag(self, flag):
        """Append *flag* to the record's '标志' field unless already present.

        Flags carry their own trailing comma; the same flag detected on
        several pages or images is recorded only once.
        """
        if flag not in self.xlsx_keys['标志']:
            self.xlsx_keys['标志'] += flag

    def pdf_images(self, pdf_path, img_path):
        """OCR the images of a report and update the current record.

        Args:
            pdf_path: Path of the PDF passed to ``download_img``.
            img_path: Directory that ``download_img`` is called with and
                that is then scanned for image files.

        ``self.xlsx_keys`` must already hold the record keys (``discern``
        creates them). A ``cv2.error`` raised while reading an image is
        logged; whatever was recognised before the error is still applied.
        """
        self.download_img(pdf_path, img_path)
        with os.scandir(img_path) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue

                scheme_no = ''
                issue_date = ''
                flag = ''
                try:
                    dates = []
                    for line in self.read_img_ocr_binarization(entry.path):
                        if 'S$T' in line or 'SST' in line:
                            # 'S$T' is treated as a misread 'SST'.
                            scheme_no = self._remove_all(
                                line.replace('S$T', 'SST'),
                                '试验方案编号:', '$',
                            )
                        if 'CNAS' in line:
                            flag = 'cnas中文,'
                        parsed = self.is_valid_time(line)
                        if parsed:
                            dates.append(parsed)
                    if dates:
                        # The latest date found on the image is used.
                        issue_date = max(dates).strftime("%Y-%m-%d")

                    # NOTE(review): a CMA match replaces a CNAS match found
                    # on the same image (original behaviour, kept as-is) --
                    # confirm whether both flags should be recorded.
                    for line in self.read_img_ocr(entry.path):
                        if '200015344424' in line:
                            flag = '国cma,'
                        if '191020340175' in line:
                            flag = '省cma,'
                except cv2.error as err:
                    self.log(err)

                if flag:
                    self._add_flag(flag)
                if scheme_no:
                    self.xlsx_keys['方案编号'] = scheme_no
                if issue_date:
                    self.xlsx_keys['签发日期'] = issue_date

    def pdf_text(self, pdf_path):
        """Read the first page of the PDF and fill the text-based fields.

        Fills '报告编号', '样品名称', '检测项目' and '公司名称' of
        ``self.xlsx_keys`` (which must already hold the record keys).

        Args:
            pdf_path: Path of the PDF file to open with pdfplumber.
        """
        with pdfplumber.open(pdf_path) as pdf:
            if not pdf.pages:
                self.log('PDF has no pages: %s' % pdf_path)
                return
            # Extract the page text; a page without a text layer may yield
            # None, which is treated as empty text.
            text = pdf.pages[0].extract_text() or ''

        seen_lines = []
        company_parts = []
        for line in text.split("\n"):
            seen_lines.append(line)

            if 'CSTBB' in line:
                for token in line.split():
                    if 'CSTBB' in token:
                        # NOTE(review): the original stripped the same label
                        # twice; the full-width colon variant is assumed to
                        # have been intended -- confirm.
                        self.xlsx_keys['报告编号'] = self._remove_all(
                            token.strip(), '报告编号:', ')', '报告编号:',
                        )

            if '样品名称' in line:
                self.xlsx_keys['样品名称'] = self._remove_all(
                    line.strip(), '样品名称: ', '样品名称:',
                )

            if 'Article Name:' in line:
                # Keep everything after the label; only the label itself and
                # the surrounding whitespace are dropped.
                self.xlsx_keys['样品名称'] = (
                    line.split('Article Name:', 1)[1].strip()
                )

            if '公司' in line:
                company_parts.append(line.strip())

            if '最终报告' in line or 'Final Report' in line:
                # All text up to and including this line, minus the markers.
                self.xlsx_keys['检测项目'] = self._remove_all(
                    ''.join(seen_lines),
                    '最终报告', 'Final Report', '中国认可国际互认检测',
                )

        # Drop all whitespace from the collected company lines.
        self.xlsx_keys['公司名称'] += ''.join(''.join(company_parts).split())
        self.xlsx_keys['公司名称'] = self._remove_all(
            self.xlsx_keys['公司名称'], *self._OWN_COMPANY_NAMES
        )

    def pdf_all_text(self, pdf_path):
        """Scan every page after the first for scheme number, date and GLP.

        The page text is split on whitespace and each token is inspected.
        For every page on which ``is_valid_time`` accepts at least one token,
        '签发日期' is overwritten with the latest date of that page.

        Args:
            pdf_path: Path of the PDF file to open with pdfplumber.
        """
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages[1:]:
                # Extract the page text; None (no text layer) becomes ''.
                text = page.extract_text() or ''
                dates = []
                for token in text.split():
                    if 'SST' in token and not self.xlsx_keys['方案编号']:
                        self.xlsx_keys['方案编号'] = token
                    if '签发日期' in token and not self.xlsx_keys['签发日期']:
                        self.xlsx_keys['签发日期'] = self._remove_all(
                            token, '签发日期', ':',
                        )
                    if 'GLP' in token:
                        self._add_flag('GLP,')
                    parsed = self.is_valid_time(token)
                    if parsed:
                        dates.append(parsed)
                if dates:
                    self.xlsx_keys['签发日期'] = max(dates).strftime("%Y-%m-%d")

    def discern(self, pdf_path, img_path, excel_path):
        """Process every file in a directory and export the records.

        Args:
            pdf_path: Directory that is scanned for report files.
            img_path: Working directory for page images; ``remove_img`` is
                called on it before each directory entry is handled.
            excel_path: Destination passed to ``export_excel``.
        """
        with os.scandir(pdf_path) as entries:
            for entry in entries:
                self.xlsx_keys = dict.fromkeys(self._RECORD_FIELDS, '')
                self.remove_img(img_path)
                if not entry.is_file():
                    continue

                report_path = entry.path
                file_name = entry.name
                self.log(file_name)
                self.xlsx_keys['文件名'] = file_name

                self.pdf_text(report_path)
                self.pdf_images(report_path, img_path)
                self.pdf_all_text(report_path)

                if not self.xlsx_keys['方案编号']:
                    # Fall back to a scheme number embedded in the file name.
                    match = re.search(r'SST\d+BB', file_name)
                    if match:
                        self.xlsx_keys['方案编号'] = match.group(0)
                    else:
                        self.log("未找到匹配的模式方案编号")

                self.xlsx_keys_list.append(self.xlsx_keys)

        self.export_excel(self.xlsx_keys_list, excel_path)

    def run(self, pdf_path, img_path, excel_path):
        """Entry point; delegates to :meth:`discern` with the same arguments."""
        self.discern(pdf_path, img_path, excel_path)


if __name__ == '__main__':
    testing_agency_report = TestingAgencyReport()
    testing_agency_report.run('../file_test', '../target_img', '../docs')