mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 03:59:57 +08:00
提取pdf文件中图片和文字
This commit is contained in:
parent
749cf70dcd
commit
b13155d7f2
@ -1,4 +1,4 @@
|
||||
from program.discern import Discern
|
||||
from program.testing_agency_report import TestingAgencyReport
|
||||
|
||||
discern = Discern()
|
||||
discern.run('./file_test')
|
||||
testing_agency_report = TestingAgencyReport()
|
||||
testing_agency_report.run('./file', './target_img', './docs')
|
||||
|
@ -1,65 +0,0 @@
|
||||
from datetime import datetime
|
||||
from base import Base
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
import platform
|
||||
|
||||
|
||||
class ImageTextOcr(Base):
    """OCR helper that pulls report fields out of a single image.

    Fills the keys '方案编号', '签发日期' and '标志' of a caller-supplied dict
    from the text that Tesseract recognises in the image.
    """

    def __init__(self):
        """Initialise the base class and log the detected operating system.

        On Windows the Tesseract executable path is pointed at a fixed
        install location; other platforms rely on pytesseract's default.
        """
        super(ImageTextOcr, self).__init__()
        current_os = platform.system()
        if current_os == 'Windows':
            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
            self.log("当前操作系统是 Windows")
        elif current_os == 'Linux':
            self.log("当前操作系统是 Ubuntu")
        else:
            self.log(f"当前操作系统是 {current_os}")

    def is_valid_time(self, input_str):
        """Parse ``input_str`` as a ``YYYY-MM-DD`` date.

        :param input_str: candidate text token
        :return: the parsed ``datetime`` on success, ``False`` otherwise
        """
        try:
            # Adjust the format string if the reports use another date layout.
            return datetime.strptime(input_str, "%Y-%m-%d")
        except ValueError:
            return False

    def image_text_ocr(self, text_dict, path):
        """Run OCR on the image at ``path`` and update ``text_dict`` in place.

        :param text_dict: dict with the keys '方案编号', '签发日期', '标志'
        :param path: path of the image file to read
        :return: the same ``text_dict`` object, updated
        """
        valid_time_list = []
        # Load the image inside a context manager so the underlying file
        # handle is released as soon as OCR is done (the original leaked it).
        with Image.open(path) as image:
            result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')

        # str.split() with no argument splits on any whitespace, so each
        # "line" below is really one whitespace-delimited token.
        for line in result.split():
            if 'S$T' in line or 'SST' in line:
                # OCR sometimes reads "SST" as "S$T"; normalise it and drop the label.
                text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')

            if 'CNAS' in line:
                text_dict['标志'] = 'cnas中文,'

            # NOTE(review): presumably a certificate number printed next to
            # the CMA mark - confirm. It overwrites any CNAS value set above.
            if '200015344424' in line:
                text_dict['标志'] = '国cma,'

            valid_time = self.is_valid_time(line)
            if valid_time:
                valid_time_list.append(valid_time)

        if valid_time_list:
            # The latest date found in the image is used as the issue date.
            text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
        return text_dict

    def run(self, text_dict, path):
        """Entry point: delegate to :meth:`image_text_ocr` and return its dict."""
        return self.image_text_ocr(text_dict, path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: OCR one sample image and print the extracted fields.
    ocr = ImageTextOcr()
    empty_fields = dict.fromkeys(('方案编号', '签发日期', '标志'), '')
    print(ocr.run(empty_fields, '../target_img/output_image-003.png'))
|
101
pdf处理/program/pdf_base.py
Normal file
101
pdf处理/program/pdf_base.py
Normal file
@ -0,0 +1,101 @@
|
||||
from datetime import datetime
|
||||
from base import Base
|
||||
import subprocess
|
||||
from PIL import Image
|
||||
import pandas as pd
|
||||
import pytesseract
|
||||
import platform
|
||||
import os
|
||||
|
||||
|
||||
class PDFBase(Base):
    """Shared helpers for PDF report processing.

    Provides image extraction via the ``pdfimages`` command, OCR of single
    images, cleanup of the image directory, date detection and Excel export.
    """

    def __init__(self):
        """Initialise the base class and log the detected operating system.

        On Windows the Tesseract executable path is pointed at a fixed
        install location; other platforms rely on pytesseract's default.
        """
        # Fix: the original called super(Base, self).__init__(), which starts
        # the MRO lookup *after* Base and therefore never ran Base.__init__.
        super(PDFBase, self).__init__()
        current_os = platform.system()
        if current_os == 'Windows':
            pytesseract.pytesseract.tesseract_cmd = r'E:\pc\tesseract-ocr\tesseract.exe'
            self.log("当前操作系统是 Windows")
        elif current_os == 'Linux':
            self.log("当前操作系统是 Ubuntu")
        else:
            self.log(f"当前操作系统是 {current_os}")

    def download_img(self, input_pdf, output_image):
        """Extract every image embedded in a PDF as PNG files.

        Runs ``pdfimages -png <input_pdf> <output_image>/image``; failures
        are logged, never raised.

        :param input_pdf: path of the PDF file
        :param output_image: directory that receives the ``image-*`` files
        :return: None
        """
        # An argument list (no shell) keeps paths containing spaces or quotes
        # intact and avoids shell injection through file names.
        command = ["pdfimages", "-png", str(input_pdf), f"{output_image}/image"]
        try:
            completed = subprocess.run(command, capture_output=True, text=True)
        except Exception as e:
            # Best effort by design: e.g. pdfimages is not installed.
            self.log(f"出现异常:{e}")
            return
        if completed.returncode != 0:
            # subprocess.run does not raise on a failing exit status, so
            # surface the tool's own error output instead of dropping it.
            self.log(f"出现异常:{completed.stderr}")

    @staticmethod
    def read_img_ocr(img_path):
        """Read the text contained in an image.

        :param img_path: path of the image file
        :return: list of whitespace-separated tokens recognised by Tesseract
        """
        # The context manager closes the image file once OCR has finished.
        with Image.open(img_path) as image:
            result = pytesseract.image_to_string(image, config=r'--oem 3 --psm 6 -l chi_sim+eng')
        return result.split()

    def remove_img(self, img_path):
        """Delete every regular file directly inside ``img_path``.

        Sub-directories are left alone; a file that cannot be removed is
        logged and skipped.

        :param img_path: directory to empty
        :return: None
        """
        with os.scandir(img_path) as entries:
            for entry in entries:
                if not entry.is_file():
                    continue
                try:
                    os.remove(entry.path)
                except Exception as e:
                    self.log(f"错误信息:{e}")

    @staticmethod
    def is_valid_time(input_str):
        """Check whether ``input_str`` is a ``YYYY-MM-DD`` date.

        :param input_str: candidate text token
        :return: the parsed ``datetime`` on success, ``False`` otherwise
        """
        try:
            return datetime.strptime(input_str, "%Y-%m-%d")
        except ValueError:
            return False

    @staticmethod
    def export_excel(export, excel_path):
        """Write a list of dicts to a timestamped Excel workbook.

        The rows are sorted by the '样品名称' column and empty cells are
        replaced with a single space.

        :param export: iterable of dicts, one per output row
        :param excel_path: directory in which the ``.xlsx`` file is created
        :return: None
        :raises KeyError: if the data has no '样品名称' column
        """
        pf = pd.DataFrame(list(export))
        formatted_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        # Replace empty cells.
        pf.fillna(' ', inplace=True)
        pf = pf.sort_values(by='样品名称')
        # Open the writer only after the data is ready and let the context
        # manager save and close it, so a failure cannot leave it dangling.
        with pd.ExcelWriter(f'{excel_path}/无源{formatted_time}.xlsx') as writer:
            pf.to_excel(writer, index=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: OCR one previously extracted image and print its tokens.
    base = PDFBase()
    # base.download_img('../file_test/1.pdf', '../target_img/')
    print(base.read_img_ocr('../target_img/image-017.png'))
|
@ -1,67 +1,56 @@
|
||||
import pandas as pd
|
||||
from PIL import Image
|
||||
from program.pdf_base import PDFBase
|
||||
import pdfplumber
|
||||
import re
|
||||
import PyPDF2
|
||||
from datetime import datetime
|
||||
from base import Base
|
||||
from program.image_text_ocr import ImageTextOcr
|
||||
import os
|
||||
import cv2
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
class Discern(Base):
|
||||
class TestingAgencyReport(PDFBase):
|
||||
|
||||
def __init__(self):
|
||||
super(Discern, self).__init__()
|
||||
self.image_text_ocr = ImageTextOcr()
|
||||
super(TestingAgencyReport, self).__init__()
|
||||
self.xlsx_keys = {}
|
||||
self.xlsx_keys_list = []
|
||||
self.num = 0
|
||||
|
||||
def export_excel(self, export):
|
||||
# 将字典列表转换为DataFrame
|
||||
pf = pd.DataFrame(list(export))
|
||||
current_time = datetime.now()
|
||||
formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
|
||||
file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx')
|
||||
# 替换空单元格
|
||||
pf.fillna(' ', inplace=True)
|
||||
# 输出
|
||||
pf = pf.sort_values(by='样品名称')
|
||||
pf.to_excel(file_path, index=False)
|
||||
# 保存表格
|
||||
file_path.close()
|
||||
def pdf_images(self, pdf_path, img_path):
|
||||
self.download_img(pdf_path, img_path)
|
||||
with os.scandir(img_path) as entries:
|
||||
for entry in entries:
|
||||
if entry.is_file():
|
||||
text_dict = {
|
||||
'方案编号': '',
|
||||
'签发日期': '',
|
||||
'标志': ''
|
||||
}
|
||||
|
||||
def is_valid_time(self, input_str):
|
||||
try:
|
||||
valid_time = datetime.strptime(input_str, "%Y-%m-%d") # 根据实际时间格式调整
|
||||
return valid_time
|
||||
except ValueError:
|
||||
return False
|
||||
try:
|
||||
lines = self.read_img_ocr(entry.path)
|
||||
valid_time_list = []
|
||||
for line in lines:
|
||||
if 'S$T' in line or 'SST' in line:
|
||||
text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
|
||||
|
||||
def pdf_all_text(self, pdf_path):
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
for page in pdf.pages[1:]:
|
||||
# 提取页面文本
|
||||
text = page.extract_text()
|
||||
lines = text.split()
|
||||
valid_time_list = []
|
||||
for line in lines:
|
||||
if 'SST' in line and not self.xlsx_keys['方案编号']:
|
||||
self.xlsx_keys['方案编号'] = line
|
||||
if 'CNAS' in line:
|
||||
text_dict['标志'] = 'cnas中文,'
|
||||
|
||||
if '签发日期' in line and not self.xlsx_keys['签发日期']:
|
||||
self.xlsx_keys['签发日期'] = line.replace('签发日期', '')
|
||||
if '200015344424' in line:
|
||||
text_dict['标志'] = '国cma,'
|
||||
|
||||
if 'GLP' in line:
|
||||
self.xlsx_keys['标志'] += 'GLP,'
|
||||
valid_time = self.is_valid_time(line)
|
||||
if valid_time:
|
||||
valid_time_list.append(valid_time)
|
||||
if valid_time_list:
|
||||
text_dict['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||||
except cv2.error as c:
|
||||
self.log(c)
|
||||
|
||||
valid_time = self.is_valid_time(line)
|
||||
if valid_time:
|
||||
valid_time_list.append(valid_time)
|
||||
if valid_time_list:
|
||||
self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||||
if text_dict.get('标志'):
|
||||
self.xlsx_keys['标志'] += text_dict.get('标志')
|
||||
if text_dict.get('方案编号'):
|
||||
self.xlsx_keys['方案编号'] = text_dict.get('方案编号')
|
||||
if text_dict.get('签发日期'):
|
||||
self.xlsx_keys['签发日期'] = text_dict.get('签发日期')
|
||||
|
||||
def pdf_text(self, pdf_path):
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
@ -97,67 +86,32 @@ class Discern(Base):
|
||||
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
|
||||
replace('中检华通威国际检验(苏州)有限公司', '')
|
||||
|
||||
def processing_image(self, img_file, standard=205):
|
||||
""" 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """
|
||||
img = Image.open(img_file)
|
||||
img = img.convert('L')
|
||||
pixels = img.load()
|
||||
for x in range(img.width):
|
||||
for y in range(img.height):
|
||||
if pixels[x, y] > standard:
|
||||
pixels[x, y] = 255
|
||||
else:
|
||||
pixels[x, y] = 0
|
||||
img.save(img_file)
|
||||
def pdf_all_text(self, pdf_path):
|
||||
with pdfplumber.open(pdf_path) as pdf:
|
||||
for page in pdf.pages[1:]:
|
||||
# 提取页面文本
|
||||
text = page.extract_text()
|
||||
lines = text.split()
|
||||
valid_time_list = []
|
||||
for line in lines:
|
||||
|
||||
def pdf_images(self, pdf_path):
|
||||
self.num = 0
|
||||
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
||||
for page_num in range(len(pdf_reader.pages)):
|
||||
page = pdf_reader.pages[page_num]
|
||||
xObject = page['/Resources']['/XObject'].get_object()
|
||||
if 'SST' in line and not self.xlsx_keys['方案编号']:
|
||||
self.xlsx_keys['方案编号'] = line
|
||||
|
||||
for obj in xObject:
|
||||
if xObject[obj]['/Subtype'] == '/Image':
|
||||
self.num += 1
|
||||
image_file = f"./target_img/image_{self.num}.png"
|
||||
with open(image_file, "wb") as f:
|
||||
f.write(xObject[obj].get_data())
|
||||
if page_num != 0:
|
||||
self.processing_image(image_file)
|
||||
if '签发日期' in line and not self.xlsx_keys['签发日期']:
|
||||
self.xlsx_keys['签发日期'] = line.replace('签发日期', '')
|
||||
|
||||
def get_images_text(self):
|
||||
for i in range(1, self.num + 1):
|
||||
text_dict = {
|
||||
'方案编号': '',
|
||||
'签发日期': '',
|
||||
'标志': ''
|
||||
}
|
||||
try:
|
||||
text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png')
|
||||
except cv2.error as c:
|
||||
self.log(c)
|
||||
pass
|
||||
if 'GLP' in line:
|
||||
self.xlsx_keys['标志'] += 'GLP,'
|
||||
|
||||
if text_dict.get('标志'):
|
||||
self.xlsx_keys['标志'] += text_dict.get('标志')
|
||||
if text_dict.get('方案编号'):
|
||||
self.xlsx_keys['方案编号'] = text_dict.get('方案编号')
|
||||
if text_dict.get('签发日期'):
|
||||
self.xlsx_keys['签发日期'] = text_dict.get('签发日期')
|
||||
valid_time = self.is_valid_time(line)
|
||||
if valid_time:
|
||||
valid_time_list.append(valid_time)
|
||||
if valid_time_list:
|
||||
self.xlsx_keys['签发日期'] = max(valid_time_list).strftime("%Y-%m-%d")
|
||||
|
||||
def remove_file(self, folder_path):
|
||||
with os.scandir(folder_path) as entries:
|
||||
for entry in entries:
|
||||
if entry.is_file():
|
||||
file_path = entry.path
|
||||
try:
|
||||
os.remove(file_path)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
def run(self, folder_path):
|
||||
with os.scandir(folder_path) as entries:
|
||||
def discern(self, pdf_path, img_path, excel_path):
|
||||
with os.scandir(pdf_path) as entries:
|
||||
for entry in entries:
|
||||
self.xlsx_keys = {
|
||||
'登记日期': '',
|
||||
@ -170,26 +124,30 @@ class Discern(Base):
|
||||
'签发日期': '',
|
||||
'公司名称': ''
|
||||
}
|
||||
self.remove_file('./target_img')
|
||||
self.remove_img(img_path)
|
||||
if entry.is_file():
|
||||
file_path = entry.path
|
||||
pdf_path = entry.path
|
||||
file_name = entry.name
|
||||
self.log(file_name)
|
||||
self.xlsx_keys['文件名'] = file_name
|
||||
self.pdf_text(file_path)
|
||||
self.pdf_images(file_path)
|
||||
self.get_images_text()
|
||||
self.pdf_all_text(file_path)
|
||||
self.pdf_text(pdf_path)
|
||||
self.pdf_images(pdf_path, img_path)
|
||||
|
||||
self.pdf_all_text(pdf_path)
|
||||
if not self.xlsx_keys['方案编号']:
|
||||
matches = re.findall(r'SST\d+BB', file_name)
|
||||
if matches:
|
||||
self.xlsx_keys['方案编号'] = matches[0]
|
||||
else:
|
||||
self.log("未找到匹配的模式方案编号")
|
||||
|
||||
self.xlsx_keys_list.append(self.xlsx_keys)
|
||||
self.export_excel(self.xlsx_keys_list)
|
||||
self.export_excel(self.xlsx_keys_list, excel_path)
|
||||
|
||||
def run(self, pdf_path, img_path, excel_path):
|
||||
self.discern(pdf_path, img_path, excel_path)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
discern = Discern()
|
||||
discern.run('./file_test')
|
||||
testing_agency_report = TestingAgencyReport()
|
||||
testing_agency_report.run('../file_test', '../target_img', '../docs')
|
Loading…
x
Reference in New Issue
Block a user