# js_reverse/pdf处理/program/pdf_base.py

from datetime import datetime
from program.extract_from_pages import read_pdf
from base import Base
from PIL import Image
import pandas as pd
import pytesseract
import platform
import os


class PDFBase(Base):
    """Shared helpers for PDF processing: image extraction, OCR and Excel export."""

    # Tesseract options shared by every OCR call
    # (engine mode 3, page segmentation mode 6, chi_sim + eng languages).
    _OCR_CONFIG = r'--oem 3 --psm 6 -l chi_sim+eng'

    def __init__(self):
        """
        Initialise the base class and configure pytesseract for the current OS.

        On Windows the Tesseract executable path is set explicitly; on every
        other system pytesseract's default command is left untouched.
        """
        # super(Base, self) starts the MRO lookup *after* Base and therefore
        # skips Base.__init__; the zero-argument form runs it as intended.
        super().__init__()
        current_os = platform.system()
        if current_os == 'Windows':
            # NOTE(review): machine-specific install path - confirm before deploying elsewhere.
            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
            self.log("当前操作系统是 Windows")
        elif current_os == 'Linux':
            self.log("当前操作系统是 Ubuntu")
        else:
            self.log(f"当前操作系统是 {current_os}")

    def download_img(self, input_pdf, output_image):
        """
        Extract all images from a PDF by delegating to ``read_pdf``.

        Any exception raised by ``read_pdf`` is logged and swallowed, so this
        method is best-effort and never raises.

        :param input_pdf: PDF source passed through to ``read_pdf``
        :param output_image: output target passed through to ``read_pdf``
        :return: None
        """
        try:
            read_pdf(input_pdf, output_image)
        except Exception as e:
            self.log(f"出现异常：{e}")

    @staticmethod
    def _ocr_tokens(img):
        """Run Tesseract on ``img`` and return the recognised text split on whitespace."""
        text = pytesseract.image_to_string(img, config=PDFBase._OCR_CONFIG)
        return text.split()

    @staticmethod
    def read_img_ocr_binarization(img_path, standard=205):
        """
        Binarize an image and read the text it contains.

        The image is converted to grayscale (mode ``'L'``); pixels whose value
        is greater than ``standard`` become 255 and all others become 0.

        :param img_path: path of the image file to read
        :param standard: grayscale threshold; values above it map to 255
        :return: list of whitespace-separated tokens recognised by OCR
        """
        # The context manager closes the underlying file handle; convert()
        # returns an independent in-memory image, so it stays usable afterwards.
        with Image.open(img_path) as img:
            gray = img.convert('L')
        # point() applies the threshold to every pixel without a Python-level
        # loop over x/y coordinates.
        binary = gray.point(lambda value: 255 if value > standard else 0)
        return PDFBase._ocr_tokens(binary)

    @staticmethod
    def read_img_ocr(img_path):
        """
        Read the text contained in an image without any preprocessing.

        :param img_path: path of the image file to read
        :return: list of whitespace-separated tokens recognised by OCR
        """
        # OCR must run inside the block: the image data is read lazily from the file.
        with Image.open(img_path) as img:
            return PDFBase._ocr_tokens(img)

    def remove_img(self, img_path):
        """
        Delete every regular file directly inside a directory.

        Sub-directories are left alone. A failure to delete one file is logged
        and does not stop the remaining files from being processed.

        :param img_path: directory whose files are removed
        :return: None
        """
        with os.scandir(img_path) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                try:
                    os.remove(entry.path)
                except OSError as e:
                    self.log(f"错误信息：{e}")

    @staticmethod
    def is_valid_time(input_str):
        """
        Check whether a value is a date in ``YYYY-MM-DD`` format.

        :param input_str: candidate date string
        :return: the parsed ``datetime`` on success, otherwise ``False``
        """
        try:
            return datetime.strptime(input_str, "%Y-%m-%d")
        except (ValueError, TypeError):
            # TypeError covers non-string input such as None.
            return False

    @staticmethod
    def export_excel(export, excel_path):
        """
        Write an iterable of dict records to a timestamped Excel file.

        Empty cells are replaced with a single space and rows are sorted by the
        ``文件名`` column when that column exists. The file is created as
        ``<excel_path>/无源<YYYY-mm-dd-HH-MM-SS>.xlsx``.

        :param export: iterable of dicts, one per output row
        :param excel_path: directory in which the workbook is created
        :return: None
        """
        frame = pd.DataFrame(list(export))
        # Replace empty cells.
        frame = frame.fillna(' ')
        # Empty input (or records without the column) has nothing to sort by;
        # sorting unconditionally would raise KeyError.
        if '文件名' in frame.columns:
            frame = frame.sort_values(by='文件名')
        formatted_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # The context manager saves and closes the workbook even if writing fails.
        with pd.ExcelWriter(f'{excel_path}/无源{formatted_time}.xlsx') as writer:
            frame.to_excel(writer, index=False)


if __name__ == '__main__':
    # Manual smoke test: OCR one previously extracted image and print the tokens.
    extractor = PDFBase()
    # To regenerate the images first:
    # extractor.download_img('../file_test/1.pdf', '../target_img/')
    print(extractor.read_img_ocr('../target_img/image-017.png'))