mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-20 03:59:57 +08:00
pdf 处理
This commit is contained in:
parent
afc5efc175
commit
6ff0236524
Before Width: | Height: | Size: 4.4 KiB After Width: | Height: | Size: 4.4 KiB |
Before Width: | Height: | Size: 4.6 KiB After Width: | Height: | Size: 4.6 KiB |
@ -11,11 +11,12 @@ class Discern(object):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.image_text_ocr = ImageTextOcr()
|
self.image_text_ocr = ImageTextOcr()
|
||||||
self.xlsx_keys = {}
|
self.xlsx_keys = {}
|
||||||
|
self.xlsx_keys_list = []
|
||||||
self.num = 0
|
self.num = 0
|
||||||
|
|
||||||
def export_excel(self, export):
|
def export_excel(self, export):
|
||||||
# 将字典列表转换为DataFrame
|
# 将字典列表转换为DataFrame
|
||||||
pf = pd.DataFrame(list([export]))
|
pf = pd.DataFrame(list(export))
|
||||||
file_path = pd.ExcelWriter('../docs/结果.xlsx')
|
file_path = pd.ExcelWriter('../docs/结果.xlsx')
|
||||||
# 替换空单元格
|
# 替换空单元格
|
||||||
pf.fillna(' ', inplace=True)
|
pf.fillna(' ', inplace=True)
|
||||||
@ -56,16 +57,16 @@ class Discern(object):
|
|||||||
for img in images:
|
for img in images:
|
||||||
# 获取图片的二进制流
|
# 获取图片的二进制流
|
||||||
self.num += 1
|
self.num += 1
|
||||||
image_file = f"../target_img/image_{self.num}.jpg"
|
image_file = f"../target_img/image_{self.num}.png"
|
||||||
with open(image_file, "wb") as f:
|
with open(image_file, "wb") as f:
|
||||||
f.write(img['stream'].get_data())
|
f.write(img['stream'].get_data())
|
||||||
|
|
||||||
def get_images_text(self):
|
def get_images_text(self):
|
||||||
for i in range(1, self.num + 1):
|
for i in range(1, self.num + 1):
|
||||||
try:
|
try:
|
||||||
cma_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cma.jpg')
|
cma_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cma.png')
|
||||||
cnas_flag = image_compare.run(f'../target_img/image_{i}.jpg', '../img/cnas.jpg')
|
cnas_flag = image_compare.run(f'../target_img/image_{i}.png', '../img/cnas.png')
|
||||||
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.jpg')
|
text_list = self.image_text_ocr.run(f'../target_img/image_{i}.png')
|
||||||
except cv2.error as c:
|
except cv2.error as c:
|
||||||
pass
|
pass
|
||||||
if cma_flag:
|
if cma_flag:
|
||||||
@ -108,7 +109,8 @@ class Discern(object):
|
|||||||
self.pdf_text(file_path)
|
self.pdf_text(file_path)
|
||||||
self.pdf_images(file_path)
|
self.pdf_images(file_path)
|
||||||
self.get_images_text()
|
self.get_images_text()
|
||||||
self.export_excel(self.xlsx_keys)
|
self.xlsx_keys_list.append(self.xlsx_keys)
|
||||||
|
self.export_excel(self.xlsx_keys_list)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -272,26 +272,6 @@ CREATE TABLE `project_buyin_authorStatData` (
|
|||||||
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||||
|
|
||||||
--
|
|
||||||
-- Table structure for table `project_daduoduo_dy_Tiktok_search_Keyword`
|
|
||||||
--
|
|
||||||
|
|
||||||
DROP TABLE IF EXISTS `project_daduoduo_dy_Tiktok_search_Keyword`;
|
|
||||||
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
|
||||||
/*!50503 SET character_set_client = utf8mb4 */;
|
|
||||||
CREATE TABLE `project_daduoduo_dy_Tiktok_search_Keyword` (
|
|
||||||
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
|
|
||||||
`payload_get` text COMMENT 'get请求参数',
|
|
||||||
`payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数',
|
|
||||||
`deduplication` varchar(50) DEFAULT '' COMMENT '去重字段',
|
|
||||||
`weight` tinyint(1) DEFAULT '0' COMMENT '权重',
|
|
||||||
`status` tinyint(1) DEFAULT '0',
|
|
||||||
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
|
||||||
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
|
||||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
|
||||||
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
||||||
|
|
||||||
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
||||||
@ -302,4 +282,4 @@ CREATE TABLE `project_daduoduo_dy_Tiktok_search_Keyword` (
|
|||||||
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
||||||
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
||||||
|
|
||||||
-- Dump completed on 2023-07-19 17:35:07
|
-- Dump completed on 2023-07-24 11:36:31
|
||||||
|
Loading…
x
Reference in New Issue
Block a user