pdf 处理

This commit is contained in:
aiyingfeng 2023-07-28 14:45:01 +08:00
parent c7d2099d3b
commit 9210226e28
3 changed files with 29 additions and 10 deletions

View File

@ -0,0 +1,4 @@
from program.discern import Discern
discern = Discern()
discern.run('./file_test')

View File

@ -1,10 +1,11 @@
import pandas as pd
from PIL import Image
import pdfplumber
import re
import PyPDF2
from datetime import datetime
from base import Base
from image_text_ocr import ImageTextOcr
from program.image_text_ocr import ImageTextOcr
import os
import cv2
@ -23,7 +24,7 @@ class Discern(Base):
pf = pd.DataFrame(list(export))
current_time = datetime.now()
formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
file_path = pd.ExcelWriter(f'../docs/无源{formatted_time}.xlsx')
file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx')
# 替换空单元格
pf.fillna(' ', inplace=True)
# 输出
@ -87,15 +88,28 @@ class Discern(Base):
if '公司' in line:
company += line.strip()
if '最终报告' in line or 'Final Report' in line:
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '')\
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '') \
.replace('中国认可国际互认检测', '')
company_list = company.split()
for company_str in company_list:
self.xlsx_keys['公司名称'] += company_str
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', '').\
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
replace('中检华通威国际检验(苏州)有限公司', '')
def processing_image(self, img_file, standard=205):
""" 1.将图片进行降噪处理, 通过二值化去掉后面的背景色并加深文字对比度 """
img = Image.open(img_file)
img = img.convert('L')
pixels = img.load()
for x in range(img.width):
for y in range(img.height):
if pixels[x, y] > standard:
pixels[x, y] = 255
else:
pixels[x, y] = 0
img.save(img_file)
def pdf_images(self, pdf_path):
self.num = 0
pdf_reader = PyPDF2.PdfReader(pdf_path)
@ -105,11 +119,12 @@ class Discern(Base):
for obj in xObject:
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
self.num += 1
image_file = f"../target_img/image_{self.num}.png"
image_file = f"./target_img/image_{self.num}.png"
with open(image_file, "wb") as f:
f.write(xObject[obj].get_data())
if page_num != 0:
self.processing_image(image_file)
def get_images_text(self):
for i in range(1, self.num + 1):
@ -119,7 +134,7 @@ class Discern(Base):
'标志': ''
}
try:
text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png')
except cv2.error as c:
self.log(c)
pass
@ -155,7 +170,7 @@ class Discern(Base):
'签发日期': '',
'公司名称': ''
}
self.remove_file('../target_img')
self.remove_file('./target_img')
if entry.is_file():
file_path = entry.path
file_name = entry.name
@ -177,4 +192,4 @@ class Discern(Base):
if __name__ == '__main__':
discern = Discern()
discern.run('../file')
discern.run('./file_test')

View File

@ -61,5 +61,5 @@ if __name__ == '__main__':
'签发日期': '',
'标志': ''
}
res = image_text_ocr.run(text_dict, '../target_img/image_5.png')
res = image_text_ocr.run(text_dict, '../target_img/image_2.jpg')
print(res)