提取pdf文件中图片和文字

2025-04-22 08:11:29 +08:00 · 2023-08-02 16:17:26 +08:00 · 2023-08-02 16:17:26 +08:00 · 97fccbd604
commit 97fccbd604
parent 2dc7cb4a6d
2 changed files with 4 additions and 3 deletions
--- a/pdf处理/program/pdf_base.py
+++ b/pdf处理/program/pdf_base.py
@@ -1,5 +1,5 @@
 from datetime import datetime
-from extract_from_pages import read_pdf
+from program.extract_from_pages import read_pdf
 from base import Base
 from PIL import Image
 import pandas as pd
--- a/pdf处理/program/testing_agency_report.py
+++ b/pdf处理/program/testing_agency_report.py
@@ -29,7 +29,8 @@ class TestingAgencyReport(PDFBase):
                        valid_time_list = []
                        for line in lines:
                            if 'S$T' in line or 'SST' in line:
-                                text_dict['方案编号'] = line.replace('S$T', 'SST').replace('试验方案编号:', '')
+                                text_dict['方案编号'] = line.replace('S$T', 'SST')\
+                                    .replace('试验方案编号:', '').replace('$', '')
                            if 'CNAS' in line:
                                text_dict['标志'] = 'cnas中文,'
                            valid_time = self.is_valid_time(line)
@@ -65,7 +66,7 @@ class TestingAgencyReport(PDFBase):
                if 'CSTBB' in line:
                    for li in line.split():
                        if 'CSTBB' in li:
-                            self.xlsx_keys['报告编号'] = li.strip().replace('报告编号：', '')
+                            self.xlsx_keys['报告编号'] = li.strip().replace('报告编号：', '').replace('）', '')
                if '样品名称' in line:
                    try:
                        self.xlsx_keys['样品名称'] = line.split()[1].strip().replace(': ', '')