mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-22 14:05:22 +08:00
pdf处理
This commit is contained in:
parent
f336578fe2
commit
afc5efc175
@ -1,6 +1,7 @@
|
|||||||
import image_compare
|
import image_compare
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
from image_text_ocr import ImageTextOcr
|
||||||
import os
|
import os
|
||||||
import cv2
|
import cv2
|
||||||
|
|
||||||
@ -8,13 +9,13 @@ import cv2
|
|||||||
class Discern(object):
|
class Discern(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
self.image_text_ocr = ImageTextOcr()
|
||||||
self.xlsx_keys = {}
|
self.xlsx_keys = {}
|
||||||
self.xlsx_keys_list = []
|
|
||||||
self.num = 0
|
self.num = 0
|
||||||
|
|
||||||
def export_excel(self, export):
|
def export_excel(self, export):
|
||||||
# 将字典列表转换为DataFrame
|
# 将字典列表转换为DataFrame
|
||||||
pf = pd.DataFrame(list(export))
|
pf = pd.DataFrame(list([export]))
|
||||||
file_path = pd.ExcelWriter('../docs/结果.xlsx')
|
file_path = pd.ExcelWriter('../docs/结果.xlsx')
|
||||||
# 替换空单元格
|
# 替换空单元格
|
||||||
pf.fillna(' ', inplace=True)
|
pf.fillna(' ', inplace=True)
|
||||||
@ -59,17 +60,21 @@ class Discern(object):
|
|||||||
with open(image_file, "wb") as f:
|
with open(image_file, "wb") as f:
|
||||||
f.write(img['stream'].get_data())
|
f.write(img['stream'].get_data())
|
||||||
|
|
||||||
def get_sign(self):
|
def get_images_text(self):
|
||||||
for i in range(1, self.num + 1):
|
for i in range(1, self.num + 1):
|
||||||
try:
|
try:
|
||||||
cma_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cma.jpg')
|
cma_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cma.jpg')
|
||||||
cnas_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cnas.jpg')
|
cnas_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cnas.jpg')
|
||||||
|
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.jpg')
|
||||||
except cv2.error as c:
|
except cv2.error as c:
|
||||||
pass
|
pass
|
||||||
if cma_flag:
|
if cma_flag:
|
||||||
self.xlsx_keys['标志'] = '国cma'
|
self.xlsx_keys['标志'] = '国cma'
|
||||||
if cnas_flag:
|
if cnas_flag:
|
||||||
self.xlsx_keys['标志'] += ',cnas中文'
|
self.xlsx_keys['标志'] += ',cnas中文'
|
||||||
|
if text_list:
|
||||||
|
self.xlsx_keys['方案编号'] = text_list[0]
|
||||||
|
self.xlsx_keys['签发日期'] = text_list[1]
|
||||||
|
|
||||||
def remove_file(self, folder_path):
|
def remove_file(self, folder_path):
|
||||||
with os.scandir(folder_path) as entries:
|
with os.scandir(folder_path) as entries:
|
||||||
@ -102,9 +107,8 @@ class Discern(object):
|
|||||||
self.xlsx_keys['文件名'] = file_name
|
self.xlsx_keys['文件名'] = file_name
|
||||||
self.pdf_text(file_path)
|
self.pdf_text(file_path)
|
||||||
self.pdf_images(file_path)
|
self.pdf_images(file_path)
|
||||||
self.get_sign()
|
self.get_images_text()
|
||||||
self.xlsx_keys_list.append(self.xlsx_keys)
|
self.export_excel(self.xlsx_keys)
|
||||||
self.export_excel(self.xlsx_keys_list)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -1,9 +0,0 @@
|
|||||||
from PIL import Image
|
|
||||||
import pytesseract
|
|
||||||
|
|
||||||
# 加载图像
|
|
||||||
image = Image.open('../target_img/image_7.jpg')
|
|
||||||
# 列出支持的语言
|
|
||||||
print(pytesseract.get_languages(config=''))
|
|
||||||
text = pytesseract.image_to_string(image, lang='chi_sim')
|
|
||||||
print(text)
|
|
41
pdf处理/program/image_text_ocr.py
Normal file
41
pdf处理/program/image_text_ocr.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
from PIL import Image
|
||||||
|
import pytesseract
|
||||||
|
import platform
|
||||||
|
|
||||||
|
|
||||||
|
class ImageTextOcr(object):
    """Run Tesseract OCR on an image and pull out selected labelled values.

    The OCR output is scanned line by line for the labels in ``_KEYWORDS``;
    for every label found on a line, the last whitespace-separated token of
    that line is collected as the label's value.
    """

    # Labels looked for in the OCR text (scheme number, report completion date).
    _KEYWORDS = ('试验方案编号', '报告完成日期')

    def __init__(self):
        """Configure pytesseract for the current operating system."""
        current_os = platform.system()
        if current_os == 'Windows':
            # NOTE(review): machine-specific install path of the tesseract
            # binary; other Windows hosts will need a different location.
            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
            print("当前操作系统是 Windows")
        elif current_os == 'Linux':
            print("当前操作系统是 Ubuntu")
        else:
            print(f"当前操作系统是 {current_os}")

    @staticmethod
    def _extract_fields(ocr_text):
        """Return the values that follow the known labels in *ocr_text*.

        Args:
            ocr_text: Raw multi-line text produced by the OCR engine.

        Returns:
            A list of strings in order of appearance: one entry per
            (line, label) match, each being the last token of that line.
            Empty when no label is present.
        """
        found = []
        for line in ocr_text.splitlines():
            # split() without arguments collapses runs of whitespace and
            # ignores trailing blanks, so the last token is never ''.
            tokens = line.split()
            if not tokens:
                continue
            for keyword in ImageTextOcr._KEYWORDS:
                # any() records a label once per line even when several
                # tokens contain it, avoiding duplicate entries.
                if any(keyword in token for token in tokens):
                    found.append(tokens[-1])
        return found

    def image_text_ocr(self, path):
        """OCR the image at *path* and return the extracted label values.

        Args:
            path: Filesystem path of the image to recognise.

        Returns:
            The list produced by ``_extract_fields`` for the recognised text.
        """
        # The context manager closes the underlying file once OCR is done.
        with Image.open(path) as image:
            result = pytesseract.image_to_string(
                image, lang='chi_sim', output_type=pytesseract.Output.STRING
            )
        return self._extract_fields(result)

    def run(self, path):
        """Entry point used by callers; delegates to ``image_text_ocr``."""
        return self.image_text_ocr(path)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual smoke test: OCR one sample image and print the extracted values.
    ocr = ImageTextOcr()
    extracted = ocr.run('../target_img/image_7.jpg')
    print(extracted)
|
Loading…
x
Reference in New Issue
Block a user