From 5bff450e38bfe2f75626f64336d9c4c597044931 Mon Sep 17 00:00:00 2001 From: aiyingfeng Date: Tue, 11 Jul 2023 21:00:24 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8A=96=E5=BA=97=E7=B2=BE=E9=80=89=E8=81=94?= =?UTF-8?q?=E7=9B=9F=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 抖音js逆向学习/抖店精选联盟数据/README.md | 14 ++++- .../抖店精选联盟数据/{dispatch => }/base.py | 0 .../dispatch/创建巨量百应主播详情爬虫.py | 2 +- .../extractors/clean_buyin_authorStatData_seekAuthor.py | 59 ++++++++++++++++++ .../extractors/clean_buyin_contact_info.py | 56 +++++++++++++++++ .../抖店精选联盟数据/spider/base.py | 12 ---- .../抖店精选联盟数据/spider/browser_baiyin.py | 2 +- .../spider/buyin_author_statData_mitm.py | 2 +- .../sql/baiyin/eb_supports_baiyin.sql | 61 ++++++++++++++++++- 9 files changed, 190 insertions(+), 18 deletions(-) rename 抖音js逆向学习/抖店精选联盟数据/{dispatch => }/base.py (100%) create mode 100644 抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py create mode 100644 抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py delete mode 100644 抖音js逆向学习/抖店精选联盟数据/spider/base.py diff --git a/抖音js逆向学习/抖店精选联盟数据/README.md b/抖音js逆向学习/抖店精选联盟数据/README.md index ce87004..d98a427 100644 --- a/抖音js逆向学习/抖店精选联盟数据/README.md +++ b/抖音js逆向学习/抖店精选联盟数据/README.md @@ -1,3 +1,15 @@ # 文档 - mitmdump -s ./spider/buyin_authorStatData_seekAuthor_mitm.py -p 9999 -q \ No newline at end of file +## 如何运行 + +启动代理 + + mitmdump -s ./main_spider.py -q -p 9999 + +启动浏览器 + + google-chrome --remote-debugging-port=9222 --user-data-dir='/home/ayf/project/js_reverse/抖音js逆向学习/抖店精选联盟数据/cn' + +启动脚本控制浏览器 + + browser_baiyin.py \ No newline at end of file diff --git a/抖音js逆向学习/抖店精选联盟数据/dispatch/base.py b/抖音js逆向学习/抖店精选联盟数据/base.py similarity index 100% rename from 抖音js逆向学习/抖店精选联盟数据/dispatch/base.py rename to 抖音js逆向学习/抖店精选联盟数据/base.py diff --git a/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py b/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py index cd8d6f1..7a4eecc 100644 --- a/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py +++ b/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py @@ -1,4 +1,4 @@ -from dispatch.base import Base +from base import Base import datetime import json diff --git a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py new file mode 100644 index 0000000..c475bef --- /dev/null +++ b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py @@ -0,0 +1,59 @@ +from base import Base +import json + + +class CleanBuyinAuthorStatDataSeekAutho(Base): + name = 'buyin_authorStatData_seekAuthor' + + def __init__(self): + super(CleanBuyinAuthorStatDataSeekAutho, self).__init__() + self.table = self.name + self.clean_table = "clean_" + self.table + + def process_item(self, resp): + list_res = [] + if not resp: + self.log(f'清洗{self.table}数据-不存在') + return '' + + for task_id, data, deduplication, update_time in resp: + contact_info = json.loads(data).get('contact_info') + item = { + "task_id": task_id, + "author_base_uid": deduplication.replace("uid=", ""), + "author_base_nickname": contact_info.get('times_left'), + "author_base_avatar": contact_info.get('contact_value'), + "author_base_fans_num": contact_info.get('contact_value'), + "author_base_gender": contact_info.get('contact_value'), + "author_base_city": contact_info.get('contact_value'), + "deduplication": deduplication + '×_left=' + str(contact_info.get('times_left')), + "spider_time": update_time + } + list_res.append(item) + db_res = self.eb_supports.insert_many(self.clean_table, + list_res, + conflict=[ + "task_id", "uid", "times_left", + "contact_value", "contact_value", "deduplication", + "spider_time" + ] + ) + if db_res >= 0: + return True, self.table, db_res + else: + return False, self.table, db_res + + +if __name__ == '__main__': + offset = 0 + qc = CleanBuyinAuthorStatDataSeekAutho() + while True: + sql = f""" + select task_id, data, deduplication, update_time from buyin_authorStatData_seekAuthor where + date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset}; + """ + res = qc.eb_supports.query(sql) + if not res: + break + qc.process_item(res) + offset += 1000 diff --git a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py new file mode 100644 index 0000000..bcd9e3b --- /dev/null +++ b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py @@ -0,0 +1,56 @@ +from base import Base +import json + + +class CleanBuyinContactInfo(Base): + name = 'buyin_contact_info' + + def __init__(self): + super(CleanBuyinContactInfo, self).__init__() + self.table = self.name + self.clean_table = "clean_" + self.table + + def process_item(self, resp): + list_res = [] + if not resp: + self.log(f'清洗{self.table}数据-不存在') + return '' + + for task_id, data, deduplication, update_time in resp: + contact_info = json.loads(data).get('contact_info') + item = { + "task_id": task_id, + "uid": deduplication.replace("uid=", ""), + "times_left": contact_info.get('times_left'), + "contact_value": contact_info.get('contact_value'), + "deduplication": deduplication + '×_left=' + str(contact_info.get('times_left')), + "spider_time": update_time + } + list_res.append(item) + db_res = self.eb_supports.insert_many(self.clean_table, + list_res, + conflict=[ + "task_id", "uid", "times_left", + "contact_value", "contact_value", "deduplication", + "spider_time" + ] + ) + if db_res >= 0: + return True, self.table, db_res + else: + return False, self.table, db_res + + +if __name__ == '__main__': + offset = 0 + qc = CleanBuyinContactInfo() + while True: + sql = f""" + select task_id, data, deduplication, update_time from buyin_contact_info where + date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset}; + """ + res = qc.eb_supports.query(sql) + if not res: + break + qc.process_item(res) + offset += 1000 diff --git a/抖音js逆向学习/抖店精选联盟数据/spider/base.py b/抖音js逆向学习/抖店精选联盟数据/spider/base.py deleted file mode 100644 index 0e525d9..0000000 --- a/抖音js逆向学习/抖店精选联盟数据/spider/base.py +++ /dev/null @@ -1,12 +0,0 @@ -from dao.mysql_dao import StoreMysqlPool -from datetime import datetime -import settings - - -class Base(object): - - def __init__(self): - self.eb_supports = StoreMysqlPool(**settings.mysql_server_baiyin) - - def log(self, s): - print('【%s】 %s' % (datetime.now(), s), flush=True) diff --git a/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py b/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py index 22ee8ef..b54a904 100644 --- a/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py +++ b/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py @@ -1,7 +1,7 @@ from multiprocessing import Queue from selenium.webdriver.common.by import By from selenium import webdriver -from spider.base import Base +from base import Base import time diff --git a/抖音js逆向学习/抖店精选联盟数据/spider/buyin_author_statData_mitm.py b/抖音js逆向学习/抖店精选联盟数据/spider/buyin_author_statData_mitm.py index 06fa362..98e7fa5 100644 --- a/抖音js逆向学习/抖店精选联盟数据/spider/buyin_author_statData_mitm.py +++ b/抖音js逆向学习/抖店精选联盟数据/spider/buyin_author_statData_mitm.py @@ -1,4 +1,4 @@ -from spider.base import Base +from base import Base from urllib.parse import parse_qsl, urlsplit import json diff --git a/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql b/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql index 0e3d521..fc88436 100644 --- a/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql +++ b/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql @@ -25,7 +25,7 @@ DROP TABLE IF EXISTS `buyin_authorStatData_authorOverviewV2`; CREATE TABLE `buyin_authorStatData_authorOverviewV2` ( `task_id` varchar(100) DEFAULT NULL COMMENT '项目id', `data` mediumtext COMMENT '数据结果', - `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段', + `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段', `status` smallint(6) DEFAULT '0' COMMENT '状态', `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP, `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, @@ -50,6 +50,63 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` ( UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; /*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `buyin_contact_info` +-- + +DROP TABLE IF EXISTS `buyin_contact_info`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `buyin_contact_info` ( + `task_id` varchar(100) DEFAULT NULL COMMENT '项目id', + `data` mediumtext COMMENT '数据结果', + `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段', + `status` smallint(6) DEFAULT '0' COMMENT '状态', + `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP, + `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + KEY `task_id` (`task_id`,`deduplication`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `clean_buyin_contact_info` +-- + +DROP TABLE IF EXISTS `clean_buyin_contact_info`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `clean_buyin_contact_info` ( + `task_id` varchar(100) DEFAULT NULL COMMENT '项目id', + `uid` varchar(100) DEFAULT '' COMMENT '唯一标识符', + `times_left` varchar(50) DEFAULT '', + `contact_value` varchar(50) DEFAULT '' COMMENT '联系方式', + `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段', + `spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间', + UNIQUE KEY `task_id` (`deduplication`) USING BTREE, + KEY `uid` (`uid`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Table structure for table `project_buyin_authorStatData` +-- + +DROP TABLE IF EXISTS `project_buyin_authorStatData`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `project_buyin_authorStatData` ( + `task_id` varchar(100) DEFAULT NULL COMMENT '项目id', + `payload_get` text COMMENT 'get请求参数', + `payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数', + `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段', + `weight` tinyint(4) DEFAULT NULL COMMENT '权重', + `status` tinyint(1) DEFAULT '0', + `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP, + `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; +/*!40101 SET character_set_client = @saved_cs_client */; /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; @@ -60,4 +117,4 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` ( /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2023-07-11 10:53:37 +-- Dump completed on 2023-07-11 20:41:43