diff --git a/pdf处理/program/discern.py b/pdf处理/program/discern.py index 9d9c718..30de6ed 100644 --- a/pdf处理/program/discern.py +++ b/pdf处理/program/discern.py @@ -1,6 +1,7 @@ import image_compare import pandas as pd import pdfplumber +from image_text_ocr import ImageTextOcr import os import cv2 @@ -8,13 +9,13 @@ import cv2 class Discern(object): def __init__(self): + self.image_text_ocr = ImageTextOcr() self.xlsx_keys = {} - self.xlsx_keys_list = [] self.num = 0 def export_excel(self, export): # 将字典列表转换为DataFrame - pf = pd.DataFrame(list(export)) + pf = pd.DataFrame(list([export])) file_path = pd.ExcelWriter('../docs/结果.xlsx') # 替换空单元格 pf.fillna(' ', inplace=True) @@ -59,17 +60,21 @@ class Discern(object): with open(image_file, "wb") as f: f.write(img['stream'].get_data()) - def get_sign(self): + def get_images_text(self): for i in range(1, self.num + 1): try: cma_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cma.jpg') cnas_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cnas.jpg') + text_list = self.image_text_ocr.run(f'../target_img/image_{i}.jpg') except cv2.error as c: pass if cma_flag: self.xlsx_keys['标志'] = '国cma' if cnas_flag: self.xlsx_keys['标志'] += ',cnas中文' + if text_list: + self.xlsx_keys['方案编号'] = text_list[0] + self.xlsx_keys['签发日期'] = text_list[1] def remove_file(self, folder_path): with os.scandir(folder_path) as entries: @@ -102,9 +107,8 @@ class Discern(object): self.xlsx_keys['文件名'] = file_name self.pdf_text(file_path) self.pdf_images(file_path) - self.get_sign() - self.xlsx_keys_list.append(self.xlsx_keys) - self.export_excel(self.xlsx_keys_list) + self.get_images_text() + self.export_excel(self.xlsx_keys) if __name__ == '__main__': diff --git a/pdf处理/program/image_ocr.py b/pdf处理/program/image_ocr.py deleted file mode 100644 index 6e6e5d9..0000000 --- a/pdf处理/program/image_ocr.py +++ /dev/null @@ -1,9 +0,0 @@ -from PIL import Image -import pytesseract - -# 加载图像 -image = Image.open('../target_img/image_7.jpg') -# 
import platform

from PIL import Image
import pytesseract


class ImageTextOcr(object):
    """Read labelled field values out of an image using Tesseract OCR.

    The OCR output is scanned line by line. Whenever a line contains one of
    the labels in ``KEYWORDS``, the last whitespace-separated token of that
    line is collected as the value for that label.
    """

    # Labels searched for in the OCR text (runtime data, kept verbatim).
    KEYWORDS = ('试验方案编号', '报告完成日期')

    # Tesseract executable used on Windows when no explicit path is supplied.
    DEFAULT_WINDOWS_TESSERACT_CMD = r'E:\pc\tesseract-ocr\tesseract.exe'

    def __init__(self, tesseract_cmd=None, lang='chi_sim'):
        """Configure pytesseract for the current platform.

        Args:
            tesseract_cmd: Optional path to the tesseract executable. When
                omitted, Windows falls back to
                ``DEFAULT_WINDOWS_TESSERACT_CMD`` and other systems leave
                pytesseract's own setting untouched.
            lang: Tesseract language code passed to ``image_to_string``.
        """
        current_os = platform.system()
        if tesseract_cmd is None and current_os == 'Windows':
            tesseract_cmd = self.DEFAULT_WINDOWS_TESSERACT_CMD
        if tesseract_cmd is not None:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        self.lang = lang
        # Report the OS name that was actually detected instead of assuming
        # a particular Linux distribution.
        print(f"当前操作系统是 {current_os}")

    @classmethod
    def _extract_values(cls, text):
        """Return the values found next to ``KEYWORDS`` in OCR *text*.

        Values are returned in the order their lines appear in *text*; a
        line contributes one entry per keyword it contains.
        """
        values = []
        # splitlines() also copes with CRLF and the form feed Tesseract emits.
        for line in text.splitlines():
            # split() without arguments ignores leading/trailing/repeated
            # whitespace, so the last token is never an empty string.
            tokens = line.split()
            if not tokens:
                continue
            for keyword in cls.KEYWORDS:
                if keyword in line:
                    values.append(tokens[-1])
        return values

    def image_text_ocr(self, path):
        """Run OCR on the image at *path* and return the extracted values.

        Args:
            path: Location of the image file to recognise.

        Returns:
            A list of strings, one per keyword match, in order of appearance.
        """
        # The context manager guarantees the image file handle is released.
        with Image.open(path) as image:
            result = pytesseract.image_to_string(
                image,
                lang=self.lang,
                output_type=pytesseract.Output.STRING,
            )
        return self._extract_values(result)

    def run(self, path):
        """Public entry point; see :meth:`image_text_ocr`."""
        return self.image_text_ocr(path)


if __name__ == '__main__':
    image_text_ocr = ImageTextOcr()
    text_list = image_text_ocr.run('../target_img/image_7.jpg')
    print(text_list)