pdf处理

This commit is contained in:
luzhisheng 2023-07-26 01:16:23 +08:00
parent df6cf1faa5
commit d96dde1f9c
3 changed files with 67 additions and 23 deletions

10
pdf处理/base.py Normal file
View File

@ -0,0 +1,10 @@
from datetime import datetime
class Base(object):
    """Shared base class that gives subclasses a timestamped ``log`` helper."""

    def __init__(self):
        pass

    def log(self, s):
        """Print ``s`` to stdout, prefixed with the current time.

        Args:
            s: Message to print; anything ``%s`` formatting accepts
                (strings, exceptions, ...).
        """
        # A space separates the timestamp from the message so the two do not
        # run together; flush=True pushes each line out immediately.
        print('%s %s' % (datetime.now(), s), flush=True)

View File

@ -2,14 +2,16 @@ import pandas as pd
import pdfplumber import pdfplumber
import PyPDF2 import PyPDF2
from datetime import datetime from datetime import datetime
from base import Base
from image_text_ocr import ImageTextOcr from image_text_ocr import ImageTextOcr
import os import os
import cv2 import cv2
class Discern(object): class Discern(Base):
def __init__(self): def __init__(self):
super(Discern, self).__init__()
self.image_text_ocr = ImageTextOcr() self.image_text_ocr = ImageTextOcr()
self.xlsx_keys = {} self.xlsx_keys = {}
self.xlsx_keys_list = [] self.xlsx_keys_list = []
@ -19,7 +21,7 @@ class Discern(object):
# 将字典列表转换为DataFrame # 将字典列表转换为DataFrame
pf = pd.DataFrame(list(export)) pf = pd.DataFrame(list(export))
current_time = datetime.now() current_time = datetime.now()
formatted_time = current_time.strftime('%Y-%m-%d %H:%M:%S') formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
file_path = pd.ExcelWriter(f'../docs/{formatted_time}.xlsx') file_path = pd.ExcelWriter(f'../docs/{formatted_time}.xlsx')
# 替换空单元格 # 替换空单元格
pf.fillna(' ', inplace=True) pf.fillna(' ', inplace=True)
@ -28,14 +30,38 @@ class Discern(object):
# 保存表格 # 保存表格
file_path.close() file_path.close()
def is_valid_time(self, input_str):
    """Try to parse ``input_str`` as a ``YYYY-MM-DD`` date.

    Args:
        input_str: Candidate token, normally a string taken from PDF text.

    Returns:
        The parsed ``datetime`` when the token matches the format,
        otherwise ``False``. A parsed ``datetime`` is always truthy, so
        callers can use the result directly in an ``if``.
    """
    try:
        # Adjust the format string to match the actual date format.
        return datetime.strptime(input_str, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: text that is not a date in this format.
        # TypeError: a non-string token such as None.
        return False
def pdf_all_text(self, pdf_path):
    """Scan every page after the first for a scheme number and issue date.

    Each page's text is split on whitespace. Any token containing ``SST``
    is stored under ``self.xlsx_keys['方案编号']``. Tokens that parse as
    ``YYYY-MM-DD`` are collected per page, and the latest date on that page
    is stored under ``self.xlsx_keys['签发日期']``. Later pages overwrite
    values found on earlier ones.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[1:]:
            # Extract the page text. A page without a text layer may give
            # None instead of a string, so fall back to an empty string.
            text = page.extract_text() or ''
            valid_time_list = []
            for token in text.split():
                if 'SST' in token:
                    self.xlsx_keys['方案编号'] = token
                valid_time = self.is_valid_time(token)
                if valid_time:
                    valid_time_list.append(valid_time)

            if valid_time_list:
                self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
def pdf_text(self, pdf_path): def pdf_text(self, pdf_path):
with pdfplumber.open(pdf_path) as pdf: with pdfplumber.open(pdf_path) as pdf:
# 遍历每个页面
page = pdf.pages[0] page = pdf.pages[0]
# 提取页面文本 # 提取页面文本
text = page.extract_text() text = page.extract_text()
lines = text.split("\n") lines = text.split("\n")
line_str = '' line_str = ''
company = ''
for line in lines: for line in lines:
line_str += line line_str += line
if '报告编号:' in line: if '报告编号:' in line:
@ -46,12 +72,17 @@ class Discern(object):
self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip().replace(': ', '') self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip().replace(': ', '')
if 'Article Name:' in line: if 'Article Name:' in line:
self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '') self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '')
if '公司' in line and '中检华通' not in line and '制造商' not in line: if '公司' in line:
self.xlsx_keys['公司名称'] = line.strip() company += line.strip()
if '最终报告' in line: if '最终报告' in line:
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '') self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
self.xlsx_keys['标志'] = '' self.xlsx_keys['标志'] = ''
company_list = company.split(" ")
for company_str in company_list:
if '中检华通' not in company_str and '制造商' not in company_str:
self.xlsx_keys['公司名称'] += company_str
def pdf_images(self, pdf_path): def pdf_images(self, pdf_path):
self.num = 0 self.num = 0
pdf_reader = PyPDF2.PdfReader(pdf_path) pdf_reader = PyPDF2.PdfReader(pdf_path)
@ -77,7 +108,7 @@ class Discern(object):
try: try:
text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png') text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
except cv2.error as c: except cv2.error as c:
print(c) self.log(c)
pass pass
if text_dict.get('标志'): if text_dict.get('标志'):
@ -115,10 +146,13 @@ class Discern(object):
if entry.is_file(): if entry.is_file():
file_path = entry.path file_path = entry.path
file_name = entry.name file_name = entry.name
self.log(file_name)
self.xlsx_keys['文件名'] = file_name self.xlsx_keys['文件名'] = file_name
self.pdf_text(file_path) self.pdf_text(file_path)
self.pdf_images(file_path) self.pdf_images(file_path)
self.get_images_text() self.get_images_text()
if not self.xlsx_keys['方案编号'] and not self.xlsx_keys['签发日期']:
self.pdf_all_text(file_path)
self.xlsx_keys_list.append(self.xlsx_keys) self.xlsx_keys_list.append(self.xlsx_keys)
self.export_excel(self.xlsx_keys_list) self.export_excel(self.xlsx_keys_list)

View File

@ -1,20 +1,22 @@
from datetime import datetime from datetime import datetime
from base import Base
from PIL import Image from PIL import Image
import pytesseract import pytesseract
import platform import platform
class ImageTextOcr(object): class ImageTextOcr(Base):
def __init__(self): def __init__(self):
super(ImageTextOcr, self).__init__()
current_os = platform.system() current_os = platform.system()
if current_os == 'Windows': if current_os == 'Windows':
pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe' pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
print("当前操作系统是 Windows") self.log("当前操作系统是 Windows")
elif current_os == 'Linux': elif current_os == 'Linux':
print("当前操作系统是 Ubuntu") self.log("当前操作系统是 Ubuntu")
else: else:
print(f"当前操作系统是 {current_os}") self.log(f"当前操作系统是 {current_os}")
def is_valid_time(self, input_str): def is_valid_time(self, input_str):
try: try:
@ -28,20 +30,18 @@ class ImageTextOcr(object):
# 加载图像 # 加载图像
image = Image.open(path) image = Image.open(path)
result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng') result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
lines = result.split('\n') lines = result.split()
for line in lines: for line in lines:
data_list = line.split(' ') if 'S$T' in line or 'SST' in line:
for data in data_list: text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
if 'S$T' in data or 'SST' in data:
text_dict['方案编号'] = data.replace('S$T', 'SST')
if 'CNAS' in data: if 'CNAS' in line:
text_dict['标志'] = 'cnas中文,' text_dict['标志'] = 'cnas中文,'
if '200015344424' in data: if '200015344424' in line:
text_dict['标志'] = '国cma,' text_dict['标志'] = '国cma,'
valid_time = self.is_valid_time(data) valid_time = self.is_valid_time(line)
if valid_time: if valid_time:
valid_time_list.append(valid_time) valid_time_list.append(valid_time)
@ -61,5 +61,5 @@ if __name__ == '__main__':
'签发日期': '', '签发日期': '',
'标志': '' '标志': ''
} }
res = image_text_ocr.run(text_dict, '../target_img/image_3.png') res = image_text_ocr.run(text_dict, '../target_img/image_5.png')
print(res) print(res)