From 1b246234b50f51fc07316de8ecd59abf46975c0f Mon Sep 17 00:00:00 2001
From: aiyingfeng <aiyingfeng110@qq.com>
Date: Sat, 29 Jul 2023 15:01:50 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E5=8F=96pdf=E6=96=87=E4=BB=B6?=
 =?UTF-8?q?=E4=B8=AD=E5=9B=BE=E7=89=87=E5=92=8C=E6=96=87=E5=AD=97?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pdf处理/README.md                 | 24 ++++++++++++++++++++++++
 pdf处理/program/image_text_ocr.py |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 pdf处理/README.md

diff --git a/pdf处理/README.md b/pdf处理/README.md
new file mode 100644
index 0000000..16ca017
--- /dev/null
+++ b/pdf处理/README.md
@@ -0,0 +1,24 @@
+# 提取pdf文件中图片和文字
+
+## pdfimages提取图片
+
+pdfimages 是 Poppler 工具包的一部分，可以用于从 PDF 文件中提取图像。要使用 pdfimages，您需要安装 Poppler 工具包。
+
+1. 在 Ubuntu 或 Debian 上安装 Poppler 工具包：
+```shell
+sudo apt-get update
+sudo apt-get install poppler-utils
+```
+
+2. 提取所有图像并保存为 PNG 格式：
+```shell
+pdfimages -png ./file_test/1.pdf ./target_img/output_image
+```
+
+## pdfplumber提取文字
+```python
+with pdfplumber.open(pdf_path) as pdf:
+    page = pdf.pages[0]
+    # 提取页面文本
+    text = page.extract_text()
+```
\ No newline at end of file
diff --git a/pdf处理/program/image_text_ocr.py b/pdf处理/program/image_text_ocr.py
index 8e1fc7a..f73ea6c 100644
--- a/pdf处理/program/image_text_ocr.py
+++ b/pdf处理/program/image_text_ocr.py
@@ -61,5 +61,5 @@ if __name__ == '__main__':
         '签发日期': '',
         '标志': ''
     }
-    res = image_text_ocr.run(text_dict, '../target_img/image_2.jpg')
+    res = image_text_ocr.run(text_dict, '../target_img/output_image-003.png')
     print(res)