# js_reverse/pdf处理/program/discern.py

import pdfplumber


class Discern(object):
    """Extract report metadata from the text layer of a PDF.

    Recognised fields are accumulated in ``self.xlsx_table`` under the keys
    ``报告编号`` (report number), ``样品名称`` (sample name) and ``公司名称``
    (company name). Later matches overwrite earlier ones.
    """

    # PDF parsed by ``run`` when the caller does not supply a path.
    DEFAULT_PDF_PATH = '../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验（中文）.pdf'

    # (label as it appears in the text, key used in ``xlsx_table``).
    # The label includes its trailing ASCII colon.
    _LABELLED_FIELDS = (
        ('报告编号:', '报告编号'),
        ('样品名称:', '样品名称'),
    )

    def __init__(self):
        # Field name -> extracted value; filled in by ``pdf_text``.
        self.xlsx_table = {}

    def _parse_line(self, line):
        """Record every field recognised in one text line.

        Args:
            line: A single line of text extracted from a page.
        """
        for label, key in self._LABELLED_FIELDS:
            if label in line:
                # Split on the full label, colon included, so the stored
                # value does not begin with a stray ':'.
                self.xlsx_table[key] = line.split(label)[1].strip()
        # Any line mentioning a company ('公司') is taken as the company name,
        # except lines containing '中检华通' (presumably the issuing lab's own
        # name -- TODO confirm).
        if '公司' in line and '中检华通' not in line:
            print(line)
            self.xlsx_table['公司名称'] = line.strip()

    def _parse_page_text(self, text):
        """Split one page's text into lines and parse each of them.

        Args:
            text: Text of a page, or ``None`` when the page produced no text.
        """
        # Guard against a missing text layer: treat ``None`` like an empty
        # page instead of failing on ``None.split``.
        lines = (text or "").split("\n")
        print(lines)
        for line in lines:
            self._parse_line(line)

    def pdf_text(self, path_pdf):
        """Parse every page of the PDF at ``path_pdf`` into ``xlsx_table``.

        Args:
            path_pdf: Path of the PDF file, passed to ``pdfplumber.open``.

        Returns:
            ``self.xlsx_table`` after all pages have been processed.
        """
        with pdfplumber.open(path_pdf) as pdf:
            # Walk every page and parse its extracted text.
            for page in pdf.pages:
                self._parse_page_text(page.extract_text())
            print(self.xlsx_table)
        return self.xlsx_table

    def run(self, path_pdf=None):
        """Parse ``path_pdf``, or ``DEFAULT_PDF_PATH`` when it is omitted.

        Args:
            path_pdf: Optional path of the PDF to parse.

        Returns:
            The table returned by ``pdf_text``.
        """
        if path_pdf is None:
            path_pdf = self.DEFAULT_PDF_PATH
        return self.pdf_text(path_pdf)


if __name__ == '__main__':
    # Script entry point: build a Discern instance and run it straight away.
    Discern().run()