diff --git a/pdf处理/program/discern.py b/pdf处理/program/discern.py
index 32fe3cc..324e8bb 100644
--- a/pdf处理/program/discern.py
+++ b/pdf处理/program/discern.py
@@ -75,6 +75,9 @@ class Discern(object):
                 cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
                 text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
             except cv2.error as c:
+                cma_flag = ''
+                cnas_flag = ''
+                text_list = ''
                 pass
             if cma_flag:
                 self.xlsx_keys['标志'] = '国cma'
diff --git a/pdf处理/program/image_text_ocr.py b/pdf处理/program/image_text_ocr.py
index 28fb4e8..81f447e 100644
--- a/pdf处理/program/image_text_ocr.py
+++ b/pdf处理/program/image_text_ocr.py
@@ -1,3 +1,4 @@
+from datetime import datetime
 from PIL import Image
 import pytesseract
 import platform
@@ -15,27 +16,41 @@ class ImageTextOcr(object):
         else:
             print(f"当前操作系统是 {current_os}")
 
+    def is_valid_time(self, input_str):
+        try:
+            valid_time = datetime.strptime(input_str, "%Y-%m-%d") # 根据实际时间格式调整
+            return valid_time
+        except ValueError:
+            return False
+
     def image_text_ocr(self, path):
         text_list = []
+        valid_time_list = []
         # 加载图像
         image = Image.open(path)
-        result = pytesseract.image_to_string(image, lang='chi_sim', output_type=pytesseract.Output.STRING)
+        result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
         lines = result.split('\n')
         for line in lines:
             data_list = line.split(' ')
             for data in data_list:
-                if '试验方案编号' in data:
-                    text_list.append(data_list[-1])
-                if '报告完成日期' in data:
-                    text_list.append(data_list[-1])
-        return text_list
+                if 'S$T' in data or 'SST' in data:
+                    text_list.append(data.replace('S$T', 'SST'))
+                valid_time = self.is_valid_time(data)
+                if valid_time:
+                    valid_time_list.append(valid_time)
+        if valid_time_list:
+            text_list.append(max(valid_time_list).strftime("%Y-%m-%d"))
+        if len(text_list) == 2:
+            return text_list
+        else:
+            return []
 
     def run(self, path):
-        text_list = self.image_text_ocr(path)
-        return text_list
+        res_list = self.image_text_ocr(path)
+        return res_list
 
 
 if __name__ == '__main__':
     image_text_ocr = ImageTextOcr()
-    text_list = image_text_ocr.run('../target_img/image_7.jpg')
-    print(text_list)
+    res = image_text_ocr.run('../target_img/image_3.png')
+    print(res)