pdf 处理

This commit is contained in:
aiyingfeng 2023-07-24 17:46:55 +08:00
parent afc5efc175
commit 6ff0236524
4 changed files with 9 additions and 27 deletions

View File

Before

Width:  |  Height:  |  Size: 4.4 KiB

After

Width:  |  Height:  |  Size: 4.4 KiB

View File

Before

Width:  |  Height:  |  Size: 4.6 KiB

After

Width:  |  Height:  |  Size: 4.6 KiB

View File

@ -11,11 +11,12 @@ class Discern(object):
def __init__(self):
self.image_text_ocr = ImageTextOcr()
self.xlsx_keys = {}
self.xlsx_keys_list = []
self.num = 0
def export_excel(self, export):
# 将字典列表转换为DataFrame
pf = pd.DataFrame(list([export]))
pf = pd.DataFrame(list(export))
file_path = pd.ExcelWriter('../docs/结果.xlsx')
# 替换空单元格
pf.fillna(' ', inplace=True)
@ -56,16 +57,16 @@ class Discern(object):
for img in images:
# 获取图片的二进制流
self.num += 1
image_file = f"../target_img/image_{self.num}.jpg"
image_file = f"../target_img/image_{self.num}.png"
with open(image_file, "wb") as f:
f.write(img['stream'].get_data())
def get_images_text(self):
for i in range(1, self.num + 1):
try:
cma_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cma.jpg')
cnas_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cnas.jpg')
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.jpg')
cma_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cma.png')
cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
except cv2.error as c:
pass
if cma_flag:
@ -108,7 +109,8 @@ class Discern(object):
self.pdf_text(file_path)
self.pdf_images(file_path)
self.get_images_text()
self.export_excel(self.xlsx_keys)
self.xlsx_keys_list.append(self.xlsx_keys)
self.export_excel(self.xlsx_keys_list)
if __name__ == '__main__':

View File

@ -272,26 +272,6 @@ CREATE TABLE `project_buyin_authorStatData` (
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `project_daduoduo_dy_Tiktok_search_Keyword`
--
DROP TABLE IF EXISTS `project_daduoduo_dy_Tiktok_search_Keyword`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `project_daduoduo_dy_Tiktok_search_Keyword` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`payload_get` text COMMENT 'get请求参数',
`payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数',
`deduplication` varchar(50) DEFAULT '' COMMENT '去重字段',
`weight` tinyint(1) DEFAULT '0' COMMENT '权重',
`status` tinyint(1) DEFAULT '0',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
@ -302,4 +282,4 @@ CREATE TABLE `project_daduoduo_dy_Tiktok_search_Keyword` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2023-07-19 17:35:07
-- Dump completed on 2023-07-24 11:36:31