From 1b246234b50f51fc07316de8ecd59abf46975c0f Mon Sep 17 00:00:00 2001 From: aiyingfeng Date: Sat, 29 Jul 2023 15:01:50 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E5=8F=96pdf=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E4=B8=AD=E5=9B=BE=E7=89=87=E5=92=8C=E6=96=87=E5=AD=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pdf处理/README.md | 24 ++++++++++++++++++++++++ pdf处理/program/image_text_ocr.py | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 pdf处理/README.md diff --git a/pdf处理/README.md b/pdf处理/README.md new file mode 100644 index 0000000..16ca017 --- /dev/null +++ b/pdf处理/README.md @@ -0,0 +1,24 @@ +# 提取pdf文件中图片和文字 + +## pdfimages提取图片 + +pdfimages 是 Poppler 工具包的一部分,可以用于从 PDF 文件中提取图像。要使用 pdfimages,您需要安装 Poppler 工具包。 + +1.在 Ubuntu 或 Debian 上安装 Poppler 工具包: +```shell +sudo apt-get update +sudo apt-get install poppler-utils +``` + +2.提取所有图像并保存为 PNG 格式: +```shell +pdfimages -png ./file_test/1.pdf ./target_img/output_image +``` + +## pdfplumber提取文字 +```python +with pdfplumber.open(pdf_path) as pdf: + page = pdf.pages[0] + # 提取页面文本 + text = page.extract_text() +``` \ No newline at end of file diff --git a/pdf处理/program/image_text_ocr.py b/pdf处理/program/image_text_ocr.py index 8e1fc7a..f73ea6c 100644 --- a/pdf处理/program/image_text_ocr.py +++ b/pdf处理/program/image_text_ocr.py @@ -61,5 +61,5 @@ if __name__ == '__main__': '签发日期': '', '标志': '' } - res = image_text_ocr.run(text_dict, '../target_img/image_2.jpg') + res = image_text_ocr.run(text_dict, '../target_img/output_image-003.png') print(res)