mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-19 22:49:54 +08:00
pdf处理
This commit is contained in:
parent
df6cf1faa5
commit
d96dde1f9c
10
pdf处理/base.py
Normal file
10
pdf处理/base.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
class Base(object):
    """Common base class that gives subclasses a timestamped ``log`` helper."""

    def __init__(self):
        # No state of its own; present so subclasses can call
        # ``super().__init__()`` uniformly.
        pass

    def log(self, s):
        """Print ``s`` to stdout prefixed with the current local time.

        Args:
            s: The message (any object; it is rendered with ``%s``).

        The output is flushed immediately so messages appear in order
        even when stdout is buffered.
        """
        print('【%s】 %s' % (datetime.now(), s), flush=True)
|
@ -2,14 +2,16 @@ import pandas as pd
|
|||||||
import pdfplumber
|
import pdfplumber
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from base import Base
|
||||||
from image_text_ocr import ImageTextOcr
|
from image_text_ocr import ImageTextOcr
|
||||||
import os
|
import os
|
||||||
import cv2
|
import cv2
|
||||||
|
|
||||||
|
|
||||||
class Discern(object):
|
class Discern(Base):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
super(Discern, self).__init__()
|
||||||
self.image_text_ocr = ImageTextOcr()
|
self.image_text_ocr = ImageTextOcr()
|
||||||
self.xlsx_keys = {}
|
self.xlsx_keys = {}
|
||||||
self.xlsx_keys_list = []
|
self.xlsx_keys_list = []
|
||||||
@ -19,7 +21,7 @@ class Discern(object):
|
|||||||
# 将字典列表转换为DataFrame
|
# 将字典列表转换为DataFrame
|
||||||
pf = pd.DataFrame(list(export))
|
pf = pd.DataFrame(list(export))
|
||||||
current_time = datetime.now()
|
current_time = datetime.now()
|
||||||
formatted_time = current_time.strftime('%Y-%m-%d %H:%M:%S')
|
formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
|
||||||
file_path = pd.ExcelWriter(f'../docs/{formatted_time}.xlsx')
|
file_path = pd.ExcelWriter(f'../docs/{formatted_time}.xlsx')
|
||||||
# 替换空单元格
|
# 替换空单元格
|
||||||
pf.fillna(' ', inplace=True)
|
pf.fillna(' ', inplace=True)
|
||||||
@ -28,14 +30,38 @@ class Discern(object):
|
|||||||
# 保存表格
|
# 保存表格
|
||||||
file_path.close()
|
file_path.close()
|
||||||
|
|
||||||
|
def is_valid_time(self, input_str):
    """Parse ``input_str`` as a ``YYYY-MM-DD`` date.

    Args:
        input_str: Candidate token to test.

    Returns:
        The parsed ``datetime`` when ``input_str`` matches ``%Y-%m-%d``,
        otherwise ``False``. Callers rely on the truthiness of the result.
    """
    try:
        # Adjust the format string to match the actual date format in use.
        return datetime.strptime(input_str, "%Y-%m-%d")
    except (ValueError, TypeError):
        # ValueError: text that is not a date in this format.
        # TypeError: non-string input such as None; treat it as "not a date"
        # instead of letting it propagate.
        return False
|
||||||
|
|
||||||
|
def pdf_all_text(self, pdf_path):
    """Scan every page after the first for a scheme number and issue date.

    Each whitespace-separated token of a page's text is checked:
    a token containing ``'SST'`` is stored under ``'方案编号'``, and tokens
    that parse via ``is_valid_time`` are collected; the latest such date on
    the page is stored under ``'签发日期'`` formatted as ``YYYY-MM-DD``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Results are written into ``self.xlsx_keys``; nothing is returned.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[1:]:
            # Extract the page text. A page with no text layer can yield
            # None, so fall back to an empty string rather than failing
            # on ``.split()``.
            text = page.extract_text() or ''

            # NOTE(review): the date list is reset for every page, so a
            # later page that contains dates overwrites the value taken
            # from an earlier page — confirm this is intended rather than
            # the latest date across the whole document.
            valid_time_list = []
            for token in text.split():
                if 'SST' in token:
                    self.xlsx_keys['方案编号'] = token

                valid_time = self.is_valid_time(token)
                if valid_time:
                    valid_time_list.append(valid_time)

            if valid_time_list:
                self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||||||
|
|
||||||
def pdf_text(self, pdf_path):
|
def pdf_text(self, pdf_path):
|
||||||
with pdfplumber.open(pdf_path) as pdf:
|
with pdfplumber.open(pdf_path) as pdf:
|
||||||
# 遍历每个页面
|
|
||||||
page = pdf.pages[0]
|
page = pdf.pages[0]
|
||||||
# 提取页面文本
|
# 提取页面文本
|
||||||
text = page.extract_text()
|
text = page.extract_text()
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
line_str = ''
|
line_str = ''
|
||||||
|
company = ''
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line_str += line
|
line_str += line
|
||||||
if '报告编号:' in line:
|
if '报告编号:' in line:
|
||||||
@ -46,12 +72,17 @@ class Discern(object):
|
|||||||
self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip().replace(': ', '')
|
self.xlsx_keys['样品名称'] = line.split("样品名称")[1].strip().replace(': ', '')
|
||||||
if 'Article Name:' in line:
|
if 'Article Name:' in line:
|
||||||
self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '')
|
self.xlsx_keys['样品名称'] = line.split("Article Name")[1].strip().replace(': ', '')
|
||||||
if '公司' in line and '中检华通' not in line and '制造商' not in line:
|
if '公司' in line:
|
||||||
self.xlsx_keys['公司名称'] = line.strip()
|
company += line.strip()
|
||||||
if '最终报告' in line:
|
if '最终报告' in line:
|
||||||
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
|
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '')
|
||||||
self.xlsx_keys['标志'] = ''
|
self.xlsx_keys['标志'] = ''
|
||||||
|
|
||||||
|
company_list = company.split(" ")
|
||||||
|
for company_str in company_list:
|
||||||
|
if '中检华通' not in company_str and '制造商' not in company_str:
|
||||||
|
self.xlsx_keys['公司名称'] += company_str
|
||||||
|
|
||||||
def pdf_images(self, pdf_path):
|
def pdf_images(self, pdf_path):
|
||||||
self.num = 0
|
self.num = 0
|
||||||
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
||||||
@ -77,7 +108,7 @@ class Discern(object):
|
|||||||
try:
|
try:
|
||||||
text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
|
text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
|
||||||
except cv2.error as c:
|
except cv2.error as c:
|
||||||
print(c)
|
self.log(c)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if text_dict.get('标志'):
|
if text_dict.get('标志'):
|
||||||
@ -115,10 +146,13 @@ class Discern(object):
|
|||||||
if entry.is_file():
|
if entry.is_file():
|
||||||
file_path = entry.path
|
file_path = entry.path
|
||||||
file_name = entry.name
|
file_name = entry.name
|
||||||
|
self.log(file_name)
|
||||||
self.xlsx_keys['文件名'] = file_name
|
self.xlsx_keys['文件名'] = file_name
|
||||||
self.pdf_text(file_path)
|
self.pdf_text(file_path)
|
||||||
self.pdf_images(file_path)
|
self.pdf_images(file_path)
|
||||||
self.get_images_text()
|
self.get_images_text()
|
||||||
|
if not self.xlsx_keys['方案编号'] and not self.xlsx_keys['签发日期']:
|
||||||
|
self.pdf_all_text(file_path)
|
||||||
self.xlsx_keys_list.append(self.xlsx_keys)
|
self.xlsx_keys_list.append(self.xlsx_keys)
|
||||||
self.export_excel(self.xlsx_keys_list)
|
self.export_excel(self.xlsx_keys_list)
|
||||||
|
|
||||||
|
@ -1,20 +1,22 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from base import Base
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import pytesseract
|
import pytesseract
|
||||||
import platform
|
import platform
|
||||||
|
|
||||||
|
|
||||||
class ImageTextOcr(object):
|
class ImageTextOcr(Base):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
super(ImageTextOcr, self).__init__()
|
||||||
current_os = platform.system()
|
current_os = platform.system()
|
||||||
if current_os == 'Windows':
|
if current_os == 'Windows':
|
||||||
pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
|
pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
|
||||||
print("当前操作系统是 Windows")
|
self.log("当前操作系统是 Windows")
|
||||||
elif current_os == 'Linux':
|
elif current_os == 'Linux':
|
||||||
print("当前操作系统是 Ubuntu")
|
self.log("当前操作系统是 Ubuntu")
|
||||||
else:
|
else:
|
||||||
print(f"当前操作系统是 {current_os}")
|
self.log(f"当前操作系统是 {current_os}")
|
||||||
|
|
||||||
def is_valid_time(self, input_str):
|
def is_valid_time(self, input_str):
|
||||||
try:
|
try:
|
||||||
@ -28,20 +30,18 @@ class ImageTextOcr(object):
|
|||||||
# 加载图像
|
# 加载图像
|
||||||
image = Image.open(path)
|
image = Image.open(path)
|
||||||
result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
|
result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
|
||||||
lines = result.split('\n')
|
lines = result.split()
|
||||||
for line in lines:
|
for line in lines:
|
||||||
data_list = line.split(' ')
|
if 'S$T' in line or 'SST' in line:
|
||||||
for data in data_list:
|
text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
|
||||||
if 'S$T' in data or 'SST' in data:
|
|
||||||
text_dict['方案编号'] = data.replace('S$T', 'SST')
|
|
||||||
|
|
||||||
if 'CNAS' in data:
|
if 'CNAS' in line:
|
||||||
text_dict['标志'] = 'cnas中文,'
|
text_dict['标志'] = 'cnas中文,'
|
||||||
|
|
||||||
if '200015344424' in data:
|
if '200015344424' in line:
|
||||||
text_dict['标志'] = '国cma,'
|
text_dict['标志'] = '国cma,'
|
||||||
|
|
||||||
valid_time = self.is_valid_time(data)
|
valid_time = self.is_valid_time(line)
|
||||||
if valid_time:
|
if valid_time:
|
||||||
valid_time_list.append(valid_time)
|
valid_time_list.append(valid_time)
|
||||||
|
|
||||||
@ -61,5 +61,5 @@ if __name__ == '__main__':
|
|||||||
'签发日期': '',
|
'签发日期': '',
|
||||||
'标志': ''
|
'标志': ''
|
||||||
}
|
}
|
||||||
res = image_text_ocr.run(text_dict, '../target_img/image_3.png')
|
res = image_text_ocr.run(text_dict, '../target_img/image_5.png')
|
||||||
print(res)
|
print(res)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user