# Listing metadata captured with this file (not part of the program):
#   2023-08-01 14:24:53 +08:00
#   102 lines, 2.9 KiB, Python

from datetime import datetime
from base import Base
import subprocess
from PIL import Image
import pandas as pd
import pytesseract
import platform
import os
class PDFBase(Base):
    """Utility layer for PDF processing: image extraction, OCR and Excel export.

    Logging goes through ``self.log``, which is not defined in this class
    (presumably inherited from ``Base`` -- confirm against ``base.py``).
    """

    # Tesseract executable assigned to pytesseract when platform.system()
    # reports 'Windows'. Kept as a class attribute so a subclass or deployment
    # can override it without editing __init__.
    WINDOWS_TESSERACT_CMD = r'E:\pc\tesseract-ocr\tesseract.exe'

    def __init__(self):
        """Initialise the base class and log which operating system is in use."""
        # NOTE: the previous ``super(Base, self).__init__()`` began the MRO
        # lookup *after* Base, so Base.__init__ was silently skipped.
        super().__init__()
        current_os = platform.system()
        if current_os == 'Windows':
            pytesseract.pytesseract.tesseract_cmd = self.WINDOWS_TESSERACT_CMD
            self.log("当前操作系统是 Windows")
        elif current_os == 'Linux':
            self.log("当前操作系统是 Ubuntu")
        else:
            self.log(f"当前操作系统是 {current_os}")

    def download_img(self, input_pdf, output_image):
        """Extract all images embedded in a PDF as PNG files.

        Runs the external ``pdfimages`` command. Output files are written with
        the prefix ``<output_image>/image``.

        :param input_pdf: path of the PDF file to read
        :param output_image: directory that receives the extracted images
        :return: None. Failures are logged via ``self.log`` instead of raised.
        """
        # An argument list (no shell) keeps paths containing spaces or quotes
        # intact and prevents shell injection through either path.
        command = ['pdfimages', '-png', str(input_pdf), f'{output_image}/image']
        try:
            completed = subprocess.run(command, capture_output=True, text=True)
        except (OSError, ValueError) as e:
            # OSError: pdfimages missing / not executable.
            # ValueError: invalid argument or undecodable tool output.
            self.log(f"出现异常:{e}")
            return
        if completed.returncode != 0:
            # A failing pdfimages run used to go completely unnoticed.
            self.log(f"出现异常:{completed.stderr.strip()}")

    @staticmethod
    def read_img_ocr(img_path):
        """Run OCR on an image and return the recognised text as tokens.

        Tesseract is invoked with ``--oem 3 --psm 6 -l chi_sim+eng``.

        :param img_path: path of the image file to read
        :return: list of strings obtained by splitting the OCR output on
            whitespace (individual tokens, not whole lines)
        """
        # The context manager releases the underlying file handle promptly.
        with Image.open(img_path) as image:
            result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
        return result.split()

    def remove_img(self, img_path):
        """Delete every regular file directly inside a directory.

        Despite the name, files are not filtered by type: anything for which
        ``is_file()`` is true is removed. Sub-directories are left untouched.

        :param img_path: directory to clean
        :return: None. A file that cannot be removed is logged and skipped.
        """
        with os.scandir(img_path) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                try:
                    os.remove(entry.path)
                except OSError as e:
                    self.log(f"错误信息:{e}")

    @staticmethod
    def is_valid_time(input_str):
        """Parse a ``YYYY-MM-DD`` date string.

        :param input_str: candidate date string
        :return: the parsed ``datetime`` when the input matches ``%Y-%m-%d``,
            otherwise ``False`` (including for non-string input such as None)
        """
        try:
            return datetime.strptime(input_str, "%Y-%m-%d")
        except (ValueError, TypeError):
            # TypeError covers non-string input, which previously propagated.
            return False

    @staticmethod
    def export_excel(export, excel_path):
        """Write a collection of row dictionaries to a timestamped Excel file.

        The file is created as ``<excel_path>/无源<YYYY-mm-dd-HH-MM-SS>.xlsx``.
        Missing values are replaced by a single space and rows are sorted by
        the ``样品名称`` column when that column is present.

        :param export: iterable of row dictionaries
        :param excel_path: directory in which the workbook is created
        :return: None
        """
        pf = pd.DataFrame(list(export))
        # Replace empty cells before sorting, as the original did.
        pf = pf.fillna(' ')
        # Empty input (or rows lacking the column) used to raise KeyError here.
        if '样品名称' in pf.columns:
            pf = pf.sort_values(by='样品名称')
        formatted_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # Open the writer last and as a context manager so it is always
        # closed, even if writing fails.
        with pd.ExcelWriter(f'{excel_path}/无源{formatted_time}.xlsx') as writer:
            pf.to_excel(writer, index=False)
if __name__ == '__main__':
    # Manual smoke test: OCR one previously extracted image and print the tokens.
    # To regenerate the images first, call:
    #   extractor.download_img('../file_test/1.pdf', '../target_img/')
    extractor = PDFBase()
    print(extractor.read_img_ocr('../target_img/image-017.png'))