diff --git a/pdf处理/main.py b/pdf处理/main.py new file mode 100644 index 0000000..e69de29 diff --git a/pdf处理/program/__init__.py b/pdf处理/program/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pdf处理/program/discern.py b/pdf处理/program/discern.py new file mode 100644 index 0000000..9a42dc7 --- /dev/null +++ b/pdf处理/program/discern.py @@ -0,0 +1,34 @@ +import pdfplumber + + +class Discern(object): + + def __init__(self): + self.xlsx_table = {} + + def pdf_text(self, path_pdf): + with pdfplumber.open(path_pdf) as pdf: + # 遍历每个页面 + for page in pdf.pages: + # 提取页面文本 + text = page.extract_text() + lines = text.split("\n") + print(lines) + for line in lines: + # print(line) + if '报告编号:' in line: + self.xlsx_table['报告编号'] = line.split("报告编号")[1].strip() + if '样品名称:' in line: + self.xlsx_table['样品名称'] = line.split("样品名称")[1].strip() + if '公司' in line and '中检华通' not in line: + print(line) + self.xlsx_table['公司名称'] = line.strip() + print(self.xlsx_table) + + def run(self): + self.pdf_text('../docs/CSTBB23040149+SST2304000301BB+英姿医疗科技(湖南)有限公司+宫腔内窥镜+低温等离子灭菌+耐受性试验(中文).pdf') + + +if __name__ == '__main__': + discern = Discern() + discern.run()