mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-22 18:47:51 +08:00
pdf 处理
This commit is contained in:
parent
c7d2099d3b
commit
9210226e28
@ -0,0 +1,4 @@
|
|||||||
|
from program.discern import Discern

# Entry script: build the recognizer and run it against the ./file_test path.
discern = Discern()
discern.run('./file_test')
|
@ -1,10 +1,11 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from PIL import Image
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
import re
|
import re
|
||||||
import PyPDF2
|
import PyPDF2
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from base import Base
|
from base import Base
|
||||||
from image_text_ocr import ImageTextOcr
|
from program.image_text_ocr import ImageTextOcr
|
||||||
import os
|
import os
|
||||||
import cv2
|
import cv2
|
||||||
|
|
||||||
@ -23,7 +24,7 @@ class Discern(Base):
|
|||||||
pf = pd.DataFrame(list(export))
|
pf = pd.DataFrame(list(export))
|
||||||
current_time = datetime.now()
|
current_time = datetime.now()
|
||||||
formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
|
formatted_time = current_time.strftime('%Y-%m-%d-%H-%M-%S')
|
||||||
file_path = pd.ExcelWriter(f'../docs/无源{formatted_time}.xlsx')
|
file_path = pd.ExcelWriter(f'./docs/无源{formatted_time}.xlsx')
|
||||||
# 替换空单元格
|
# 替换空单元格
|
||||||
pf.fillna(' ', inplace=True)
|
pf.fillna(' ', inplace=True)
|
||||||
# 输出
|
# 输出
|
||||||
@ -87,15 +88,28 @@ class Discern(Base):
|
|||||||
if '公司' in line:
|
if '公司' in line:
|
||||||
company += line.strip()
|
company += line.strip()
|
||||||
if '最终报告' in line or 'Final Report' in line:
|
if '最终报告' in line or 'Final Report' in line:
|
||||||
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '')\
|
self.xlsx_keys['检测项目'] = line_str.replace('最终报告', '').replace('Final Report', '') \
|
||||||
.replace('中国认可国际互认检测', '')
|
.replace('中国认可国际互认检测', '')
|
||||||
|
|
||||||
company_list = company.split()
|
company_list = company.split()
|
||||||
for company_str in company_list:
|
for company_str in company_list:
|
||||||
self.xlsx_keys['公司名称'] += company_str
|
self.xlsx_keys['公司名称'] += company_str
|
||||||
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', '').\
|
self.xlsx_keys['公司名称'] = self.xlsx_keys['公司名称'].replace('中检华通威国际检验(苏州)有限公司', ''). \
|
||||||
replace('中检华通威国际检验(苏州)有限公司', '')
|
replace('中检华通威国际检验(苏州)有限公司', '')
|
||||||
|
|
||||||
|
def processing_image(self, img_file, standard=205):
    """Denoise an image file in place by binarizing it.

    The image is converted to 8-bit grayscale, then every pixel whose value
    is strictly greater than ``standard`` becomes white (255) and every other
    pixel becomes black (0). This removes the background color and deepens
    the contrast of the text. The result overwrites ``img_file``.

    :param img_file: path of the image to read and then overwrite.
    :param standard: grayscale threshold; values above it are turned white.
    """
    # Release the source file handle before writing back to the same path.
    with Image.open(img_file) as source:
        gray = source.convert('L')
    # point() calls the function once per possible gray level and applies the
    # resulting lookup table natively, replacing a Python loop over every
    # single pixel while producing the same thresholded image.
    binary = gray.point(lambda level: 255 if level > standard else 0)
    binary.save(img_file)
|
||||||
|
|
||||||
def pdf_images(self, pdf_path):
|
def pdf_images(self, pdf_path):
|
||||||
self.num = 0
|
self.num = 0
|
||||||
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
pdf_reader = PyPDF2.PdfReader(pdf_path)
|
||||||
@ -105,11 +119,12 @@ class Discern(Base):
|
|||||||
|
|
||||||
for obj in xObject:
|
for obj in xObject:
|
||||||
if xObject[obj]['/Subtype'] == '/Image':
|
if xObject[obj]['/Subtype'] == '/Image':
|
||||||
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
|
|
||||||
self.num += 1
|
self.num += 1
|
||||||
image_file = f"../target_img/image_{self.num}.png"
|
image_file = f"./target_img/image_{self.num}.png"
|
||||||
with open(image_file, "wb") as f:
|
with open(image_file, "wb") as f:
|
||||||
f.write(xObject[obj].get_data())
|
f.write(xObject[obj].get_data())
|
||||||
|
if page_num != 0:
|
||||||
|
self.processing_image(image_file)
|
||||||
|
|
||||||
def get_images_text(self):
|
def get_images_text(self):
|
||||||
for i in range(1, self.num + 1):
|
for i in range(1, self.num + 1):
|
||||||
@ -119,7 +134,7 @@ class Discern(Base):
|
|||||||
'标志': ''
|
'标志': ''
|
||||||
}
|
}
|
||||||
try:
|
try:
|
||||||
text_dict = self.image_text_ocr.run(text_dict, f'../target_img/image_{i}.png')
|
text_dict = self.image_text_ocr.run(text_dict, f'./target_img/image_{i}.png')
|
||||||
except cv2.error as c:
|
except cv2.error as c:
|
||||||
self.log(c)
|
self.log(c)
|
||||||
pass
|
pass
|
||||||
@ -155,7 +170,7 @@ class Discern(Base):
|
|||||||
'签发日期': '',
|
'签发日期': '',
|
||||||
'公司名称': ''
|
'公司名称': ''
|
||||||
}
|
}
|
||||||
self.remove_file('../target_img')
|
self.remove_file('./target_img')
|
||||||
if entry.is_file():
|
if entry.is_file():
|
||||||
file_path = entry.path
|
file_path = entry.path
|
||||||
file_name = entry.name
|
file_name = entry.name
|
||||||
@ -177,4 +192,4 @@ class Discern(Base):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
discern = Discern()
|
discern = Discern()
|
||||||
discern.run('../file')
|
discern.run('./file_test')
|
||||||
|
@ -61,5 +61,5 @@ if __name__ == '__main__':
|
|||||||
'签发日期': '',
|
'签发日期': '',
|
||||||
'标志': ''
|
'标志': ''
|
||||||
}
|
}
|
||||||
res = image_text_ocr.run(text_dict, '../target_img/image_5.png')
|
res = image_text_ocr.run(text_dict, '../target_img/image_2.jpg')
|
||||||
print(res)
|
print(res)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user