提取pdf文件中图片和文字

This commit is contained in:
aiyingfeng 2023-08-02 16:17:26 +08:00
parent 2dc7cb4a6d
commit 97fccbd604
2 changed files with 4 additions and 3 deletions

View File

@@ -1,5 +1,5 @@
from datetime import datetime
from extract_from_pages import read_pdf
from program.extract_from_pages import read_pdf
from base import Base
from PIL import Image
import pandas as pd

View File

@@ -29,7 +29,8 @@ class TestingAgencyReport(PDFBase):
valid_time_list = []
for line in lines:
if 'S$T' in line or 'SST' in line:
text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
text_dict['方案编号'] = line.replace('S$T', 'SST')\
.replace('试验方案编号:', '').replace('$', '')
if 'CNAS' in line:
text_dict['标志'] = 'cnas中文,'
valid_time = self.is_valid_time(line)
@@ -65,7 +66,7 @@ class TestingAgencyReport(PDFBase):
if 'CSTBB' in line:
for li in line.split():
if 'CSTBB' in li:
self.xlsx_keys['报告编号'] = li.strip().replace('报告编号:', '')
self.xlsx_keys['报告编号'] = li.strip().replace('报告编号:', '').replace('', '')
if '样品名称' in line:
try:
self.xlsx_keys['样品名称'] = line.split()[1].strip().replace(': ', '')