pdf 处理

This commit is contained in:
aiyingfeng 2023-07-25 12:20:16 +08:00
parent c6ddc2c570
commit dc82b5c03d
2 changed files with 28 additions and 10 deletions

View File

@ -75,6 +75,9 @@ class Discern(object):
cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png') cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png') text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
except cv2.error as c: except cv2.error as c:
cma_flag = ''
cnas_flag = ''
text_list = ''
pass pass
if cma_flag: if cma_flag:
self.xlsx_keys['标志'] = '国cma' self.xlsx_keys['标志'] = '国cma'

View File

@ -1,3 +1,4 @@
from datetime import datetime
from PIL import Image from PIL import Image
import pytesseract import pytesseract
import platform import platform
@ -15,27 +16,41 @@ class ImageTextOcr(object):
else: else:
print(f"当前操作系统是 {current_os}") print(f"当前操作系统是 {current_os}")
def is_valid_time(self, input_str):
    """Parse ``input_str`` as a ``YYYY-MM-DD`` date.

    Args:
        input_str: Candidate text to test.

    Returns:
        The parsed ``datetime`` when ``input_str`` matches ``%Y-%m-%d``,
        otherwise ``False``.
    """
    try:
        # Adjust the format string if the real dates use another layout.
        return datetime.strptime(input_str, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: the text is not a date in this format.
        # TypeError: a non-string (e.g. None) was passed; report it as
        # "not a date" instead of letting the exception escape.
        return False
def image_text_ocr(self, path): def image_text_ocr(self, path):
text_list = [] text_list = []
valid_time_list = []
# 加载图像 # 加载图像
image = Image.open(path) image = Image.open(path)
result = pytesseract.image_to_string(image, lang='chi_sim', output_type=pytesseract.Output.STRING) result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
lines = result.split('\n') lines = result.split('\n')
for line in lines: for line in lines:
data_list = line.split(' ') data_list = line.split(' ')
for data in data_list: for data in data_list:
if '试验方案编号' in data: if 'S$T' in data or 'SST' in data:
text_list.append(data_list[-1]) text_list.append(data.replace('S$T', 'SST'))
if '报告完成日期' in data: valid_time = self.is_valid_time(data)
text_list.append(data_list[-1]) if valid_time:
return text_list valid_time_list.append(valid_time)
if valid_time_list:
text_list.append(max(valid_time_list).strftime("%Y-%m-%d"))
if len(text_list) == 2:
return text_list
else:
return []
def run(self, path): def run(self, path):
text_list = self.image_text_ocr(path) res_list = self.image_text_ocr(path)
return text_list return res_list
if __name__ == '__main__': if __name__ == '__main__':
image_text_ocr = ImageTextOcr() image_text_ocr = ImageTextOcr()
text_list = image_text_ocr.run('../target_img/image_7.jpg') res = image_text_ocr.run('../target_img/image_3.png')
print(text_list) print(res)