mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-19 18:24:51 +08:00
pdf 处理
This commit is contained in:
parent
c6ddc2c570
commit
dc82b5c03d
@ -75,6 +75,9 @@ class Discern(object):
|
||||
cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
|
||||
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
|
||||
except cv2.error as c:
|
||||
cma_flag = ''
|
||||
cnas_flag = ''
|
||||
text_list = ''
|
||||
pass
|
||||
if cma_flag:
|
||||
self.xlsx_keys['标志'] = '国cma'
|
||||
|
@ -1,3 +1,4 @@
|
||||
from datetime import datetime
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
import platform
|
||||
@ -15,27 +16,41 @@ class ImageTextOcr(object):
|
||||
else:
|
||||
print(f"当前操作系统是 {current_os}")
|
||||
|
||||
def is_valid_time(self, input_str):
    """Parse ``input_str`` as a ``YYYY-MM-DD`` date.

    Args:
        input_str: Candidate token to test; surrounding whitespace is
            ignored.

    Returns:
        The parsed ``datetime`` when the token matches ``%Y-%m-%d``,
        otherwise ``False``.
    """
    # strptime raises TypeError (not ValueError) for non-string input such
    # as None; report that as "not a date" instead of crashing the caller.
    if not isinstance(input_str, str):
        return False
    try:
        # Adjust the format string if the real date layout differs.
        # Tokens produced by splitting on ' ' can still carry stray
        # whitespace (e.g. a trailing '\r'), so strip before parsing.
        return datetime.strptime(input_str.strip(), "%Y-%m-%d")
    except ValueError:
        return False
|
||||
|
||||
def image_text_ocr(self, path):
    """Run OCR on the image at ``path`` and extract an SST token and a date.

    The recognised text is split into lines and then on single spaces.
    Every token containing ``'S$T'`` or ``'SST'`` is collected (with
    ``'S$T'`` rewritten to ``'SST'``), and every token accepted by
    ``is_valid_time`` is treated as a date. The latest date found is
    appended after the collected tokens, formatted as ``YYYY-MM-DD``.

    Args:
        path: Path of the image file to read.

    Returns:
        The collected list when it holds exactly two entries, otherwise an
        empty list.
    """
    text_list = []
    valid_time_list = []

    # Load the image; the context manager releases the file handle even if
    # the OCR call raises.
    with Image.open(path) as image:
        # Single OCR pass using the combined Chinese + English models.
        result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')

    for line in result.split('\n'):
        for data in line.split(' '):
            # NOTE(review): presumably the OCR sometimes misreads 'SST' as
            # 'S$T'; both spellings are normalised to 'SST'.
            if 'S$T' in data or 'SST' in data:
                text_list.append(data.replace('S$T', 'SST'))
            valid_time = self.is_valid_time(data)
            if valid_time:
                valid_time_list.append(valid_time)

    # Only the most recent date on the page is reported.
    if valid_time_list:
        text_list.append(max(valid_time_list).strftime("%Y-%m-%d"))

    # Any other count means a field was missed or matched more than once,
    # so the result is discarded.
    if len(text_list) == 2:
        return text_list
    return []
|
||||
|
||||
def run(self, path):
    """Extract OCR fields from the image at ``path``.

    Args:
        path: Path of the image file to process.

    Returns:
        Whatever ``image_text_ocr`` returns for ``path``.
    """
    return self.image_text_ocr(path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: OCR one sample image and print the extracted fields.
    image_text_ocr = ImageTextOcr()
    res = image_text_ocr.run('../target_img/image_3.png')
    print(res)
|
||||
|
Loading…
x
Reference in New Issue
Block a user