pdf 处理

This commit is contained in:
aiyingfeng 2023-07-28 14:45:01 +08:00
parent c7d2099d3b
commit 9210226e28
3 changed files with 29 additions and 10 deletions

View File

@ -0,0 +1,4 @@
from program.discern import Discern
if __name__ == '__main__':
    # Guarded so that importing this module does not start processing;
    # executing the file as a script behaves exactly as before.
    discern = Discern()
    # Hands the './file_test' path to Discern.run.
    discern.run('./file_test')

View File

@ -1,10 +1,11 @@
import pandas as pd import pandas as pd
from PIL import Image
import pdfplumber import pdfplumber
import re import re
import PyPDF2 import PyPDF2
from datetime import datetime from datetime import datetime
from base import Base from base import Base
from image_text_ocr import ImageTextOcr from program.image_text_ocr import ImageTextOcr
import os import os
import cv2 import cv2
@ -23,7 +24,7 @@ class Discern(Base):
pf = pd.DataFrame(list(export)) pf = pd.DataFrame(list(export))
current_time = datetime.now() current_time = datetime.now()
formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S') formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
file_path = pd.ExcelWriter(f'../docs/无源{formatted_time}.xlsx') file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx')
# 替换空单元格 # 替换空单元格
pf.fillna(' ', inplace=True) pf.fillna(' ', inplace=True)
# 输出 # 输出
@ -87,15 +88,28 @@ class Discern(Base):
if '公司' in line: if '公司' in line:
company += line.strip() company += line.strip()
if '最终报告' in line or 'Final Report' in line: if '最终报告' in line or 'Final Report' in line:
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '')\ self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '') \
.replace('中国认可国际互认检测', '') .replace('中国认可国际互认检测', '')
company_list = company.split() company_list = company.split()
for company_str in company_list: for company_str in company_list:
self.xlsx_keys['公司名称'] += company_str self.xlsx_keys['公司名称'] += company_str
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', '').\ self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
replace('中检华通威国际检验(苏州)有限公司', '') replace('中检华通威国际检验(苏州)有限公司', '')
def processing_image(self, img_file, standard=205):
    """Denoise an image in place by binarizing it.

    The image is converted to 8-bit grayscale, then every pixel whose
    value is strictly greater than ``standard`` is set to white (255) and
    every other pixel is set to black (0). This strips the background
    color and deepens the contrast of the text. The result is saved back
    to ``img_file``, overwriting the original.

    Args:
        img_file: Image to read and then overwrite.
        standard: Grayscale threshold; values above it become white.
    """
    # Release the source file handle before saving back to the same path.
    with Image.open(img_file) as source:
        gray = source.convert('L')
    # For mode 'L', point() evaluates the function once per possible value
    # (0-255) to build a lookup table and applies it in native code, which
    # replaces the per-pixel Python loop while keeping the same rule.
    binary = gray.point(lambda value: 255 if value > standard else 0)
    binary.save(img_file)
def pdf_images(self, pdf_path): def pdf_images(self, pdf_path):
self.num = 0 self.num = 0
pdf_reader = PyPDF2.PdfReader(pdf_path) pdf_reader = PyPDF2.PdfReader(pdf_path)
@ -105,11 +119,12 @@ class Discern(Base):
for obj in xObject: for obj in xObject:
if xObject[obj]['/Subtype'] == '/Image': if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
self.num += 1 self.num += 1
image_file = f"../target_img/image_{self.num}.png" image_file = f"./target_img/image_{self.num}.png"
with open(image_file, "wb") as f: with open(image_file, "wb") as f:
f.write(xObject[obj].get_data()) f.write(xObject[obj].get_data())
if page_num != 0:
self.processing_image(image_file)
def get_images_text(self): def get_images_text(self):
for i in range(1, self.num + 1): for i in range(1, self.num + 1):
@ -119,7 +134,7 @@ class Discern(Base):
'标志': '' '标志': ''
} }
try: try:
text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png') text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png')
except cv2.error as c: except cv2.error as c:
self.log(c) self.log(c)
pass pass
@ -155,7 +170,7 @@ class Discern(Base):
'签发日期': '', '签发日期': '',
'公司名称': '' '公司名称': ''
} }
self.remove_file('../target_img') self.remove_file('./target_img')
if entry.is_file(): if entry.is_file():
file_path = entry.path file_path = entry.path
file_name = entry.name file_name = entry.name
@ -177,4 +192,4 @@ class Discern(Base):
if __name__ == '__main__': if __name__ == '__main__':
discern = Discern() discern = Discern()
discern.run('../file') discern.run('./file_test')

View File

@ -61,5 +61,5 @@ if __name__ == '__main__':
'签发日期': '', '签发日期': '',
'标志': '' '标志': ''
} }
res = image_text_ocr.run(text_dict, '../target_img/image_5.png') res = image_text_ocr.run(text_dict, '../target_img/image_2.jpg')
print(res) print(res)