diff --git a/pdf处理/base.py b/pdf处理/base.py new file mode 100644 index 0000000..8775bc1 --- /dev/null +++ b/pdf处理/base.py @@ -0,0 +1,10 @@ +from datetime import datetime + + +class Base(object): + + def __init__(self): + pass + + def log(self, s): + print('【%s】 %s' % (datetime.now(), s), flush=True) diff --git a/pdf处理/program/discern.py b/pdf处理/program/discern.py index 378f30f..dc47368 100644 --- a/pdf处理/program/discern.py +++ b/pdf处理/program/discern.py @@ -2,14 +2,16 @@ import pandas as pd import pdfplumber import PyPDF2 from datetime import datetime +from base import Base from image_text_ocr import ImageTextOcr import os import cv2 -class Discern(object): +class Discern(Base): def __init__(self): + super(Discern, self).__init__() self.image_text_ocr = ImageTextOcr() self.xlsx_keys = {} self.xlsx_keys_list = [] @@ -19,7 +21,7 @@ class Discern(object): # 将字典列表转换为DataFrame pf = pd.DataFrame(list(export)) current_time = datetime.now() - formatted_time = current_time.strftime('%Y-%m-%d %H:%M:%S') + formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S') file_path = pd.ExcelWriter(f'../docs/{formatted_time}.xlsx') # 替换空单元格 pf.fillna(' ', inplace=True) @@ -28,14 +30,38 @@ class Discern(object): # 保存表格 file_path.close() + def is_valid_time(self, input_str): + try: + valid_time = datetime.strptime(input_str, "%Y-%m-%d") # 根据实际时间格式调整 + return valid_time + except ValueError: + return False + + def pdf_all_text(self, pdf_path): + with pdfplumber.open(pdf_path) as pdf: + for page in pdf.pages[1:]: + # 提取页面文本 + text = page.extract_text() + lines = text.split() + valid_time_list = [] + for line in lines: + if 'SST' in line: + self.xlsx_keys['方案编号'] = line + + valid_time = self.is_valid_time(line) + if valid_time: + valid_time_list.append(valid_time) + if valid_time_list: + self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d") + def pdf_text(self, pdf_path): with pdfplumber.open(pdf_path) as pdf: - # 遍历每个页面 page = pdf.pages[0] # 提取页面文本 text = page.extract_text() lines = text.split("\n") line_str = '' + company = '' for line in lines: line_str += line if '报告编号:' in line: @@ -46,12 +72,17 @@ class Discern(object): self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip().replace(': ', '') if 'Article Name:' in line: self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '') - if '公司' in line and '中检华通' not in line and '制造商' not in line: - self.xlsx_keys['公司名称'] = line.strip() + if '公司' in line: + company += line.strip() if '最终报告' in line: self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '') self.xlsx_keys['标志'] = '' + company_list = company.split(" ") + for company_str in company_list: + if '中检华通' not in company_str and '制造商' not in company_str: + self.xlsx_keys['公司名称'] += company_str + def pdf_images(self, pdf_path): self.num = 0 pdf_reader = PyPDF2.PdfReader(pdf_path) @@ -77,7 +108,7 @@ class Discern(object): try: text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png') except cv2.error as c: - print(c) + self.log(c) pass if text_dict.get('标志'): @@ -115,10 +146,13 @@ class Discern(object): if entry.is_file(): file_path = entry.path file_name = entry.name + self.log(file_name) self.xlsx_keys['文件名'] = file_name self.pdf_text(file_path) self.pdf_images(file_path) self.get_images_text() + if not self.xlsx_keys['方案编号'] and not self.xlsx_keys['签发日期']: + self.pdf_all_text(file_path) self.xlsx_keys_list.append(self.xlsx_keys) self.export_excel(self.xlsx_keys_list) diff --git a/pdf处理/program/image_text_ocr.py b/pdf处理/program/image_text_ocr.py index 8fc14ed..1bef1c9 100644 --- a/pdf处理/program/image_text_ocr.py +++ b/pdf处理/program/image_text_ocr.py @@ -1,20 +1,22 @@ from datetime import datetime +from base import Base from PIL import Image import pytesseract import platform -class ImageTextOcr(object): +class ImageTextOcr(Base): def __init__(self): + super(ImageTextOcr, self).__init__() current_os = platform.system() if current_os == 'Windows': pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe' - print("当前操作系统是 Windows") + self.log("当前操作系统是 Windows") elif current_os == 'Linux': - print("当前操作系统是 Ubuntu") + self.log("当前操作系统是 Ubuntu") else: - print(f"当前操作系统是 {current_os}") + self.log(f"当前操作系统是 {current_os}") def is_valid_time(self, input_str): try: @@ -28,22 +30,20 @@ class ImageTextOcr(object): # 加载图像 image = Image.open(path) result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng') - lines = result.split('\n') + lines = result.split() for line in lines: - data_list = line.split(' ') - for data in data_list: - if 'S$T' in data or 'SST' in data: - text_dict['方案编号'] = data.replace('S$T', 'SST') + if 'S$T' in line or 'SST' in line: + text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '') - if 'CNAS' in data: - text_dict['标志'] = 'cnas中文,' + if 'CNAS' in line: + text_dict['标志'] = 'cnas中文,' - if '200015344424' in data: - text_dict['标志'] = '国cma,' + if '200015344424' in line: + text_dict['标志'] = '国cma,' - valid_time = self.is_valid_time(data) - if valid_time: - valid_time_list.append(valid_time) + valid_time = self.is_valid_time(line) + if valid_time: + valid_time_list.append(valid_time) if valid_time_list: text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d") @@ -61,5 +61,5 @@ if __name__ == '__main__': '签发日期': '', '标志': '' } - res = image_text_ocr.run(text_dict, '../target_img/image_3.png') + res = image_text_ocr.run(text_dict, '../target_img/image_5.png') print(res)