# Mirror of https://github.com/luzhisheng/js_reverse.git
# Synced 2025-04-21 15:05:18 +08:00
# 102 lines, 2.9 KiB, Python
from datetime import datetime
|
|
from base import Base
|
|
import subprocess
|
|
from PIL import Image
|
|
import pandas as pd
|
|
import pytesseract
|
|
import platform
|
|
import os
|
|
|
|
|
|
class PDFBase(Base):
    """Utilities for extracting images from PDFs, OCR-ing them and exporting rows.

    ``self.log`` is not defined in this class; it is expected to be supplied
    by ``Base``.
    """

    def __init__(self):
        """Initialise the parent class and configure Tesseract for the host OS."""
        # ``super(Base, self)`` starts the MRO lookup *after* ``Base`` and so
        # never ran ``Base.__init__``; the zero-argument form runs it.
        super().__init__()
        current_os = platform.system()
        if current_os == 'Windows':
            # Point pytesseract at the hard-coded local Tesseract executable.
            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
            self.log("当前操作系统是 Windows")
        elif current_os == 'Linux':
            self.log("当前操作系统是 Ubuntu")
        else:
            self.log(f"当前操作系统是 {current_os}")

    def download_img(self, input_pdf, output_image):
        """Extract all images embedded in a PDF as PNG files.

        Runs the external ``pdfimages`` tool with the output prefix
        ``<output_image>/image``. Failures are logged via ``self.log`` and
        never raised to the caller.

        :param input_pdf: path of the PDF file to read.
        :param output_image: directory that receives the extracted images.
        :return: None
        """
        # Pass an argument list without a shell so that paths containing
        # spaces, quotes or shell metacharacters are handed over verbatim
        # (the previous shell string was injectable and its single quotes
        # did not work on Windows).
        command = ['pdfimages', '-png', str(input_pdf), f'{output_image}/image']
        try:
            completed = subprocess.run(command, capture_output=True, text=True)
        except Exception as e:
            # Typically raised when the ``pdfimages`` executable is missing.
            self.log(f"出现异常:{e}")
            return
        if completed.returncode != 0:
            # Output is captured, so surface the tool's own error text
            # instead of dropping it silently.
            self.log(f"出现异常:{completed.stderr.strip()}")

    @staticmethod
    def read_img_ocr(img_path):
        """Run OCR on an image and return the recognised text as tokens.

        :param img_path: path of the image file to read.
        :return: list of whitespace-separated strings recognised by Tesseract
            (configured for ``chi_sim+eng``).
        """
        # The context manager closes the underlying file handle once OCR is
        # done; previously the handle stayed open.
        with Image.open(img_path) as image:
            text = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
        return text.split()

    def remove_img(self, img_path):
        """Delete every file directly inside a directory.

        Despite the name, all entries for which ``is_file()`` is true are
        removed, not only images. Sub-directories are left untouched.
        Deletion errors are logged and do not stop the loop.

        :param img_path: directory whose files are removed.
        :return: None
        """
        with os.scandir(img_path) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                try:
                    os.remove(entry.path)
                except OSError as e:
                    self.log(f"错误信息:{e}")

    @staticmethod
    def is_valid_time(input_str):
        """Parse a ``YYYY-MM-DD`` date string.

        :param input_str: candidate date string.
        :return: the parsed ``datetime`` on success, otherwise ``False``
            (including when ``input_str`` is not a string at all).
        """
        try:
            return datetime.strptime(input_str, "%Y-%m-%d")
        except (ValueError, TypeError):
            # TypeError covers None / non-string input, which previously
            # escaped as an exception instead of yielding False.
            return False

    @staticmethod
    def export_excel(export, excel_path):
        """Write an iterable of row dicts to a timestamped Excel workbook.

        The file is created as ``<excel_path>/无源<timestamp>.xlsx``. Empty
        cells are replaced by a single space and rows are sorted by the
        ``样品名称`` column when that column exists.

        :param export: iterable of dicts, one per row.
        :param excel_path: directory in which the workbook is created.
        :return: None
        """
        frame = pd.DataFrame(list(export))
        # Replace empty cells before sorting, as the original did.
        frame = frame.fillna(' ')
        # An empty export has no columns; skip the sort rather than fail
        # with KeyError.
        if '样品名称' in frame.columns:
            frame = frame.sort_values(by='样品名称')

        formatted_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # Open the writer only after the data is ready, and let the context
        # manager save/close it even if writing raises.
        with pd.ExcelWriter(f'{excel_path}/无源{formatted_time}.xlsx') as writer:
            frame.to_excel(writer, index=False)
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: OCR one previously extracted image and print the tokens.
    extractor = PDFBase()
    # To regenerate the images first:
    # extractor.download_img('../file_test/1.pdf', '../target_img/')
    print(extractor.read_img_ocr('../target_img/image-017.png'))
|