diff --git a/pdf处理/main.py b/pdf处理/main.py index d84482b..1672e23 100644 --- a/pdf处理/main.py +++ b/pdf处理/main.py @@ -1,4 +1,4 @@ -from program.discern import Discern +from program.testing_agency_report import TestingAgencyReport -discern = Discern() -discern.run('./file_test') +testing_agency_report = TestingAgencyReport() +testing_agency_report.run('./file', './target_img', './docs') diff --git a/pdf处理/program/image_text_ocr.py b/pdf处理/program/image_text_ocr.py deleted file mode 100644 index f73ea6c..0000000 --- a/pdf处理/program/image_text_ocr.py +++ /dev/null @@ -1,65 +0,0 @@ -from datetime import datetime -from base import Base -from PIL import Image -import pytesseract -import platform - - -class ImageTextOcr(Base): - - def __init__(self): - super(ImageTextOcr, self).__init__() - current_os = platform.system() - if current_os == 'Windows': - pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe' - self.log("当前操作系统是 Windows") - elif current_os == 'Linux': - self.log("当前操作系统是 Ubuntu") - else: - self.log(f"当前操作系统是 {current_os}") - - def is_valid_time(self, input_str): - try: - valid_time = datetime.strptime(input_str, "%Y-%m-%d") # 根据实际时间格式调整 - return valid_time - except ValueError: - return False - - def image_text_ocr(self, text_dict, path): - valid_time_list = [] - # 加载图像 - image = Image.open(path) - result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng') - lines = result.split() - for line in lines: - if 'S$T' in line or 'SST' in line: - text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '') - - if 'CNAS' in line: - text_dict['标志'] = 'cnas中文,' - - if '200015344424' in line: - text_dict['标志'] = '国cma,' - - valid_time = self.is_valid_time(line) - if valid_time: - valid_time_list.append(valid_time) - - if valid_time_list: - text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d") - return text_dict - - def run(self, text_dict, path): - res_list = self.image_text_ocr(text_dict, path) - return res_list - - -if __name__ == '__main__': - image_text_ocr = ImageTextOcr() - text_dict = { - '方案编号': '', - '签发日期': '', - '标志': '' - } - res = image_text_ocr.run(text_dict, '../target_img/output_image-003.png') - print(res) diff --git a/pdf处理/program/pdf_base.py b/pdf处理/program/pdf_base.py new file mode 100644 index 0000000..5e1654c --- /dev/null +++ b/pdf处理/program/pdf_base.py @@ -0,0 +1,101 @@ +from datetime import datetime +from base import Base +import subprocess +from PIL import Image +import pandas as pd +import pytesseract +import platform +import os + + +class PDFBase(Base): + + def __init__(self): + super(Base, self).__init__() + current_os = platform.system() + if current_os == 'Windows': + pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe' + self.log("当前操作系统是 Windows") + elif current_os == 'Linux': + self.log("当前操作系统是 Ubuntu") + else: + self.log(f"当前操作系统是 {current_os}") + + def download_img(self, input_pdf, output_image): + """ + 下载pdf中全部图片 + :param input_pdf: + :param output_image: + :return: + """ + try: + subprocess.run(f"pdfimages -png '{input_pdf}' {output_image}/image", shell=True, + capture_output=True, text=True) + except Exception as e: + self.log(f"出现异常:{e}") + + @staticmethod + def read_img_ocr(img_path): + """ + 读取图片中文字内容 + :param img_path: + :return: + """ + image = Image.open(img_path) + result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng') + lines = result.split() + return lines + + def remove_img(self, img_path): + """ + 删除当前文件夹下所有的图片 + :param img_path: + :return: + """ + with os.scandir(img_path) as entries: + for entry in entries: + if entry.is_file(): + file_path = entry.path + try: + os.remove(file_path) + except Exception as e: + self.log(f"错误信息:{e}") + + @staticmethod + def is_valid_time(input_str): + """ + 判断是否是时间格式 + :param input_str: + :return: + """ + try: + valid_time = datetime.strptime(input_str, "%Y-%m-%d") + return valid_time + except ValueError: + return False + + @staticmethod + def export_excel(export, excel_path): + """ + 将字典列表转换为DataFrame + :param export: + :return: + """ + pf = pd.DataFrame(list(export)) + current_time = datetime.now() + formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S') + file_path = pd.ExcelWriter(f'{excel_path}/无源{formatted_time}.xlsx') + # 替换空单元格 + pf.fillna(' ', inplace=True) + # 输出 + pf = pf.sort_values(by='样品名称') + pf.to_excel(file_path, index=False) + # 保存表格 + file_path.close() + + +if __name__ == '__main__': + pdf_base = PDFBase() + # pdf_base.download_img('../file_test/1.pdf', '../target_img/') + res = pdf_base.read_img_ocr('../target_img/image-017.png') + print(res) diff --git a/pdf处理/program/discern.py b/pdf处理/program/testing_agency_report.py similarity index 52% rename from pdf处理/program/discern.py rename to pdf处理/program/testing_agency_report.py index 954690d..5dc441c 100644 --- a/pdf处理/program/discern.py +++ b/pdf处理/program/testing_agency_report.py @@ -1,67 +1,56 @@ -import pandas as pd -from PIL import Image +from program.pdf_base import PDFBase import pdfplumber -import re -import PyPDF2 -from datetime import datetime -from base import Base -from program.image_text_ocr import ImageTextOcr -import os import cv2 +import os +import re -class Discern(Base): +class TestingAgencyReport(PDFBase): def __init__(self): - super(Discern, self).__init__() - self.image_text_ocr = ImageTextOcr() + super(TestingAgencyReport, self).__init__() self.xlsx_keys = {} self.xlsx_keys_list = [] self.num = 0 - def export_excel(self, export): - # 将字典列表转换为DataFrame - pf = pd.DataFrame(list(export)) - current_time = datetime.now() - formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S') - file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx') - # 替换空单元格 - pf.fillna(' ', inplace=True) - # 输出 - pf = pf.sort_values(by='样品名称') - pf.to_excel(file_path, index=False) - # 保存表格 - file_path.close() + def pdf_images(self, pdf_path, img_path): + self.download_img(pdf_path, img_path) + with os.scandir(img_path) as entries: + for entry in entries: + if entry.is_file(): + text_dict = { + '方案编号': '', + '签发日期': '', + '标志': '' + } - def is_valid_time(self, input_str): - try: - valid_time = datetime.strptime(input_str, "%Y-%m-%d") # 根据实际时间格式调整 - return valid_time - except ValueError: - return False + try: + lines = self.read_img_ocr(entry.path) + valid_time_list = [] + for line in lines: + if 'S$T' in line or 'SST' in line: + text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '') - def pdf_all_text(self, pdf_path): - with pdfplumber.open(pdf_path) as pdf: - for page in pdf.pages[1:]: - # 提取页面文本 - text = page.extract_text() - lines = text.split() - valid_time_list = [] - for line in lines: - if 'SST' in line and not self.xlsx_keys['方案编号']: - self.xlsx_keys['方案编号'] = line + if 'CNAS' in line: + text_dict['标志'] = 'cnas中文,' - if '签发日期' in line and not self.xlsx_keys['签发日期']: - self.xlsx_keys['签发日期'] = line.replace('签发日期', '') + if '200015344424' in line: + text_dict['标志'] = '国cma,' - if 'GLP' in line: - self.xlsx_keys['标志'] += 'GLP,' + valid_time = self.is_valid_time(line) + if valid_time: + valid_time_list.append(valid_time) + if valid_time_list: + text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d") + except cv2.error as c: + self.log(c) - valid_time = self.is_valid_time(line) - if valid_time: - valid_time_list.append(valid_time) - if valid_time_list: - self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d") + if text_dict.get('标志'): + self.xlsx_keys['标志'] += text_dict.get('标志') + if text_dict.get('方案编号'): + self.xlsx_keys['方案编号'] = text_dict.get('方案编号') + if text_dict.get('签发日期'): + self.xlsx_keys['签发日期'] = text_dict.get('签发日期') def pdf_text(self, pdf_path): with pdfplumber.open(pdf_path) as pdf: @@ -97,67 +86,32 @@ class Discern(Base): self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \ replace('中检华通威国际检验(苏州)有限公司', '') - def processing_image(self, img_file, standard=205): - """ 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """ - img = Image.open(img_file) - img = img.convert('L') - pixels = img.load() - for x in range(img.width): - for y in range(img.height): - if pixels[x, y] > standard: - pixels[x, y] = 255 - else: - pixels[x, y] = 0 - img.save(img_file) + def pdf_all_text(self, pdf_path): + with pdfplumber.open(pdf_path) as pdf: + for page in pdf.pages[1:]: + # 提取页面文本 + text = page.extract_text() + lines = text.split() + valid_time_list = [] + for line in lines: - def pdf_images(self, pdf_path): - self.num = 0 - pdf_reader = PyPDF2.PdfReader(pdf_path) - for page_num in range(len(pdf_reader.pages)): - page = pdf_reader.pages[page_num] - xObject = page['/Resources']['/XObject'].get_object() + if 'SST' in line and not self.xlsx_keys['方案编号']: + self.xlsx_keys['方案编号'] = line - for obj in xObject: - if xObject[obj]['/Subtype'] == '/Image': - self.num += 1 - image_file = f"./target_img/image_{self.num}.png" - with open(image_file, "wb") as f: - f.write(xObject[obj].get_data()) - if page_num != 0: - self.processing_image(image_file) + if '签发日期' in line and not self.xlsx_keys['签发日期']: + self.xlsx_keys['签发日期'] = line.replace('签发日期', '') - def get_images_text(self): - for i in range(1, self.num + 1): - text_dict = { - '方案编号': '', - '签发日期': '', - '标志': '' - } - try: - text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png') - except cv2.error as c: - self.log(c) - pass + if 'GLP' in line: + self.xlsx_keys['标志'] += 'GLP,' - if text_dict.get('标志'): - self.xlsx_keys['标志'] += text_dict.get('标志') - if text_dict.get('方案编号'): - self.xlsx_keys['方案编号'] = text_dict.get('方案编号') - if text_dict.get('签发日期'): - self.xlsx_keys['签发日期'] = text_dict.get('签发日期') + valid_time = self.is_valid_time(line) + if valid_time: + valid_time_list.append(valid_time) + if valid_time_list: + self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d") - def remove_file(self, folder_path): - with os.scandir(folder_path) as entries: - for entry in entries: - if entry.is_file(): - file_path = entry.path - try: - os.remove(file_path) - except Exception as e: - pass - - def run(self, folder_path): - with os.scandir(folder_path) as entries: + def discern(self, pdf_path, img_path, excel_path): + with os.scandir(pdf_path) as entries: for entry in entries: self.xlsx_keys = { '登记日期': '', @@ -170,26 +124,30 @@ class Discern(Base): '签发日期': '', '公司名称': '' } - self.remove_file('./target_img') + self.remove_img(img_path) if entry.is_file(): - file_path = entry.path + pdf_path = entry.path file_name = entry.name self.log(file_name) self.xlsx_keys['文件名'] = file_name - self.pdf_text(file_path) - self.pdf_images(file_path) - self.get_images_text() - self.pdf_all_text(file_path) + self.pdf_text(pdf_path) + self.pdf_images(pdf_path, img_path) + + self.pdf_all_text(pdf_path) if not self.xlsx_keys['方案编号']: matches = re.findall(r'SST\d+BB', file_name) if matches: self.xlsx_keys['方案编号'] = matches[0] else: self.log("未找到匹配的模式方案编号") + self.xlsx_keys_list.append(self.xlsx_keys) - self.export_excel(self.xlsx_keys_list) + self.export_excel(self.xlsx_keys_list, excel_path) + + def run(self, pdf_path, img_path, excel_path): + self.discern(pdf_path, img_path, excel_path) if __name__ == '__main__': - discern = Discern() - discern.run('./file_test') + testing_agency_report = TestingAgencyReport() + testing_agency_report.run('../file_test', '../target_img', '../docs')