mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-12 03:27:07 +08:00
pdf 处理
This commit is contained in:
parent
afc5efc175
commit
6ff0236524
Before Width: | Height: | Size: 4.4 KiB After Width: | Height: | Size: 4.4 KiB |
Before Width: | Height: | Size: 4.6 KiB After Width: | Height: | Size: 4.6 KiB |
@@ -11,11 +11,12 @@ class Discern(object):
|
||||
def __init__(self):
|
||||
self.image_text_ocr = ImageTextOcr()
|
||||
self.xlsx_keys = {}
|
||||
self.xlsx_keys_list = []
|
||||
self.num = 0
|
||||
|
||||
def export_excel(self, export):
|
||||
# 将字典列表转换为DataFrame
|
||||
pf = pd.DataFrame(list([export]))
|
||||
pf = pd.DataFrame(list(export))
|
||||
file_path = pd.ExcelWriter('../docs/结果.xlsx')
|
||||
# 替换空单元格
|
||||
pf.fillna(' ', inplace=True)
|
||||
@@ -56,16 +57,16 @@ class Discern(object):
|
||||
for img in images:
|
||||
# 获取图片的二进制流
|
||||
self.num += 1
|
||||
image_file = f"../target_img/image_{self.num}.jpg"
|
||||
image_file = f"../target_img/image_{self.num}.png"
|
||||
with open(image_file, "wb") as f:
|
||||
f.write(img['stream'].get_data())
|
||||
|
||||
def get_images_text(self):
|
||||
for i in range(1, self.num + 1):
|
||||
try:
|
||||
cma_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cma.jpg')
|
||||
cnas_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cnas.jpg')
|
||||
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.jpg')
|
||||
cma_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cma.png')
|
||||
cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
|
||||
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
|
||||
except cv2.error as c:
|
||||
pass
|
||||
if cma_flag:
|
||||
@@ -108,7 +109,8 @@ class Discern(object):
|
||||
self.pdf_text(file_path)
|
||||
self.pdf_images(file_path)
|
||||
self.get_images_text()
|
||||
self.export_excel(self.xlsx_keys)
|
||||
self.xlsx_keys_list.append(self.xlsx_keys)
|
||||
self.export_excel(self.xlsx_keys_list)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@@ -272,26 +272,6 @@ CREATE TABLE `project_buyin_authorStatData` (
|
||||
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
|
||||
--
|
||||
-- Table structure for table `project_daduoduo_dy_Tiktok_search_Keyword`
|
||||
--
|
||||
|
||||
DROP TABLE IF EXISTS `project_daduoduo_dy_Tiktok_search_Keyword`;
|
||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||
/*!50503 SET character_set_client = utf8mb4 */;
|
||||
CREATE TABLE `project_daduoduo_dy_Tiktok_search_Keyword` (
|
||||
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
|
||||
`payload_get` text COMMENT 'get请求参数',
|
||||
`payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数',
|
||||
`deduplication` varchar(50) DEFAULT '' COMMENT '去重字段',
|
||||
`weight` tinyint(1) DEFAULT '0' COMMENT '权重',
|
||||
`status` tinyint(1) DEFAULT '0',
|
||||
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
||||
|
||||
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
||||
@@ -302,4 +282,4 @@ CREATE TABLE `project_daduoduo_dy_Tiktok_search_Keyword` (
|
||||
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
||||
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
||||
|
||||
-- Dump completed on 2023-07-19 17:35:07
|
||||
-- Dump completed on 2023-07-24 11:36:31
|
||||
|
Loading…
x
Reference in New Issue
Block a user