From 7b861e73c9169dbf3f7fff1767089ee659ac2894 Mon Sep 17 00:00:00 2001 From: aiyingfeng Date: Wed, 12 Jul 2023 17:45:51 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8A=96=E5=BA=97=E7=B2=BE=E9=80=89=E8=81=94?= =?UTF-8?q?=E7=9B=9F=E6=95=B0=E6=8D=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../clean_buyin_authorStatData_authorOverviewV2.py | 116 ++++++++++++++++++ .../extractors/clean_buyin_authorStatData_seekAuthor.py | 45 +++++-- .../sql/baiyin/eb_supports_baiyin.sql | 38 +++++- .../抖店精选联盟数据/tool/精选联盟达人清单.py | 0 4 files changed, 185 insertions(+), 14 deletions(-) create mode 100644 抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorOverviewV2.py create mode 100644 抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py diff --git a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorOverviewV2.py b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorOverviewV2.py new file mode 100644 index 0000000..ec033c4 --- /dev/null +++ b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorOverviewV2.py @@ -0,0 +1,116 @@ +from base import Base +import json + + +class CleanBuyinAuthorStatDataAuthorOverviewV2(Base): + name = 'buyin_authorStatData_authorOverviewV2' + + def __init__(self): + super(CleanBuyinAuthorStatDataAuthorOverviewV2, self).__init__() + self.table = self.name + self.clean_table = "clean_" + self.table + + def process_item(self, resp): + list_res = [] + if not resp: + self.log(f'清洗{self.table}数据-不存在') + return '' + + for task_id, data, deduplication, update_time in resp: + live_data = json.loads(data).get('live_data') + video_data = json.loads(data).get('video_data') + item = { + "task_id": task_id, + "uid": deduplication.replace("uid=", ""), + "live_data_percentage": live_data.get('percentage'), + "live_data_count": live_data.get('count'), + "live_data_watching_num": live_data.get('watching_num'), + "live_data_sale_low": live_data.get('sale_low'), + "live_data_sale_high": live_data.get('sale_high'), + "live_data_sale_value": live_data.get('sale_value'), + "live_data_GPM_low": live_data.get('GPM_low'), + "live_data_GPM_high": live_data.get('GPM_high'), + "live_data_GPM_value": live_data.get('GPM_value'), + "live_data_sale_status": live_data.get('sale_status'), + "live_data_GPM_status": live_data.get('GPM_status'), + "live_data_recommend_rate": int(live_data.get('recommend_rate')), + "live_data_high_recommend_rate": 1 if live_data.get('high_recommend_rate') else 0, + "live_data_high_live_day": live_data.get('live_day'), + "live_data_high_live_product_num": live_data.get('live_product_num'), + "live_data_high_avg_live_online_dur": live_data.get('avg_live_online_dur'), + "live_data_high_avg_live_avg_peru_watch_dur": live_data.get('live_avg_peru_watch_dur'), + "live_data_high_avg_live_med_peru_watch_dur": live_data.get('live_med_peru_watch_dur'), + "live_data_high_avg_live_avg_price": live_data.get('live_avg_price'), + "video_data_percentage": video_data.get('percentage'), + "video_data_count": video_data.get('count'), + "video_data_watching_num": video_data.get('watching_num'), + "video_data_sale_low": video_data.get('sale_low'), + "video_data_sale_high": video_data.get('sale_high'), + "video_data_sale_value": video_data.get('sale_value'), + "video_data_GPM_low": video_data.get('GPM_low'), + "video_data_GPM_high": video_data.get('GPM_high'), + "video_data_GPM_value": video_data.get('GPM_value'), + "video_data_sale_status": video_data.get('sale_status'), + "video_data_GPM_status": video_data.get('GPM_status'), + "video_data_recommend_rate": int(video_data.get('recommend_rate')), + "video_data_high_recommend_rate": 1 if video_data.get('high_recommend_rate') else 0, + "video_data_live_day": video_data.get('live_day'), + "video_data_live_product_num": video_data.get('live_product_num'), + "video_data_avg_live_online_dur": video_data.get('avg_live_online_dur'), + "video_data_live_avg_peru_watch_dur": video_data.get('live_avg_peru_watch_dur'), + "video_data_live_med_peru_watch_dur": video_data.get('live_med_peru_watch_dur'), + "video_data_live_avg_price": video_data.get('live_avg_price'), + "deduplication": deduplication, + "spider_time": update_time + } + list_res.append(item) + db_res = self.eb_supports.insert_many(self.clean_table, + list_res, + conflict=[ + "task_id", "uid", "live_data_percentage", "live_data_count", + "live_data_watching_num", "live_data_sale_low", "live_data_sale_high", + "live_data_sale_value", "live_data_GPM_low", "live_data_GPM_high", + "live_data_GPM_value", "live_data_sale_status", + "live_data_GPM_status", + "live_data_recommend_rate", "live_data_high_recommend_rate", + "live_data_high_live_day", + "live_data_high_live_product_num", "live_data_high_live_product_num", + "live_data_high_avg_live_online_dur", + "live_data_high_avg_live_avg_peru_watch_dur", + "live_data_high_avg_live_med_peru_watch_dur", + "live_data_high_avg_live_avg_price", + "video_data_percentage", "video_data_count", + "video_data_watching_num", + "video_data_sale_low", "video_data_sale_high", + "video_data_sale_value", + "video_data_GPM_low", "video_data_GPM_high", + "video_data_GPM_value", + "video_data_sale_status", "video_data_sale_status", + "video_data_GPM_status", + "video_data_recommend_rate", "video_data_high_recommend_rate", + "video_data_live_day", + "video_data_live_product_num", "video_data_avg_live_online_dur", + "video_data_live_avg_peru_watch_dur", + "video_data_live_med_peru_watch_dur", "video_data_live_avg_price", + "deduplication", "spider_time" + ] + ) + if db_res >= 0: + return True, self.table, db_res + else: + return False, self.table, db_res + + +if __name__ == '__main__': + offset = 0 + qc = CleanBuyinAuthorStatDataAuthorOverviewV2() + while True: + sql = f""" + select task_id, data, deduplication, update_time from buyin_authorStatData_authorOverviewV2 where + date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset}; + """ + res = qc.eb_supports.query(sql) + if not res: + break + qc.process_item(res) + offset += 1000 diff --git a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py index c475bef..96ecfee 100644 --- a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py +++ b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py @@ -17,25 +17,46 @@ class CleanBuyinAuthorStatDataSeekAutho(Base): return '' for task_id, data, deduplication, update_time in resp: - contact_info = json.loads(data).get('contact_info') + author_base = json.loads(data).get('author_base') + author_tag = json.loads(data).get('author_tag') + uid = deduplication.split('&')[1] item = { "task_id": task_id, - "author_base_uid": deduplication.replace("uid=", ""), - "author_base_nickname": contact_info.get('times_left'), - "author_base_avatar": contact_info.get('contact_value'), - "author_base_fans_num": contact_info.get('contact_value'), - "author_base_gender": contact_info.get('contact_value'), - "author_base_city": contact_info.get('contact_value'), - "deduplication": deduplication + '×_left=' + str(contact_info.get('times_left')), + "author_base_uid": uid.replace("uid=", ""), + "author_base_nickname": author_base.get('nickname'), + "author_base_avatar": author_base.get('avatar'), + "author_base_fans_num": author_base.get('fans_num'), + "author_base_gender": author_base.get('gender'), + "author_base_city": author_base.get('city'), + "author_base_author_level": author_base.get('author_level'), + "author_base_avatar_big": author_base.get('avatar_big'), + "author_tag_work_cate": json.dumps(author_tag.get('work_cate')) if author_tag.get('work_cate') else '', + "author_tag_main_cate": json.dumps(author_tag.get('main_cate')), + "author_tag_dark_horse": author_tag.get('dark_horse'), + "author_tag_contact_icon": author_tag.get('contact_icon'), + "author_tag_high_reply": author_tag.get('high_reply'), + "author_tag_invitation_status": author_tag.get('invitation_status'), + "author_tag_invite_status": author_tag.get('invite_status'), + "author_tag_satisfy_requirement": author_tag.get('satisfy_requirement'), + "author_tag_already_cooperated": 1 if author_tag.get('already_cooperated') else 0, + "author_tag_is_star": 1 if author_tag.get('is_star') else 0, + "deduplication": uid, "spider_time": update_time } list_res.append(item) db_res = self.eb_supports.insert_many(self.clean_table, list_res, conflict=[ - "task_id", "uid", "times_left", - "contact_value", "contact_value", "deduplication", - "spider_time" + "task_id", "author_base_uid", "author_base_nickname", + "author_base_avatar", "author_base_fans_num", + "author_base_gender", "author_base_city", + "author_base_author_level", "author_base_avatar_big", + "author_tag_work_cate", "author_tag_main_cate", + "author_tag_dark_horse", "author_tag_contact_icon", + "author_tag_high_reply", "author_tag_invitation_status", + "author_tag_invite_status", "author_tag_satisfy_requirement", + "author_tag_already_cooperated", "author_tag_is_star", + "deduplication", "spider_time" ] ) if db_res >= 0: @@ -50,7 +71,7 @@ if __name__ == '__main__': while True: sql = f""" select task_id, data, deduplication, update_time from buyin_authorStatData_seekAuthor where - date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset}; + date_sub(CURDATE(),INTERVAL 5 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset}; """ res = qc.eb_supports.query(sql) if not res: diff --git a/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql b/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql index fc88436..51c7e58 100644 --- a/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql +++ b/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql @@ -29,7 +29,7 @@ CREATE TABLE `buyin_authorStatData_authorOverviewV2` ( `status` smallint(6) DEFAULT '0' COMMENT '状态', `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP, `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, - UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE + KEY `task_id` (`task_id`,`deduplication`) USING BTREE ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; /*!40101 SET character_set_client = @saved_cs_client */; @@ -69,6 +69,40 @@ CREATE TABLE `buyin_contact_info` ( ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; /*!40101 SET character_set_client = @saved_cs_client */; +-- +-- Table structure for table `clean_buyin_authorStatData_seekAuthor` +-- + +DROP TABLE IF EXISTS `clean_buyin_authorStatData_seekAuthor`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `clean_buyin_authorStatData_seekAuthor` ( + `task_id` varchar(100) DEFAULT NULL COMMENT '项目id', + `author_base_uid` varchar(100) DEFAULT '' COMMENT '唯一标识符', + `author_base_nickname` varchar(100) DEFAULT '' COMMENT '昵称', + `author_base_avatar` varchar(100) DEFAULT '' COMMENT '头像', + `author_base_fans_num` int(11) DEFAULT '0' COMMENT '粉丝数', + `author_base_gender` smallint(11) DEFAULT '0' COMMENT '性别', + `author_base_city` varchar(50) DEFAULT '' COMMENT '城市', + `author_base_author_level` smallint(1) DEFAULT '0' COMMENT '等级', + `author_base_avatar_big` varchar(100) DEFAULT '' COMMENT '头像', + `author_tag_work_cate` varchar(100) DEFAULT '' COMMENT '工作类别', + `author_tag_main_cate` varchar(100) DEFAULT '' COMMENT '主类别', + `author_tag_dark_horse` varchar(50) DEFAULT '' COMMENT '黑马', + `author_tag_contact_icon` varchar(50) DEFAULT '' COMMENT '联系方式', + `author_tag_high_reply` varchar(50) DEFAULT '' COMMENT '高回复', + `author_tag_invitation_status` smallint(1) DEFAULT '0' COMMENT '邀请状态', + `author_tag_invite_status` smallint(1) DEFAULT '0' COMMENT '邀请状态', + `author_tag_satisfy_requirement` smallint(1) DEFAULT '0' COMMENT '满足要求', + `author_tag_already_cooperated` smallint(1) DEFAULT '0' COMMENT '已经合作过', + `author_tag_is_star` smallint(1) DEFAULT '0' COMMENT '是否明星', + `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段', + `spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间', + UNIQUE KEY `task_id` (`deduplication`) USING BTREE, + KEY `author_base_uid` (`author_base_uid`) USING BTREE +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; +/*!40101 SET character_set_client = @saved_cs_client */; + -- -- Table structure for table `clean_buyin_contact_info` -- @@ -117,4 +151,4 @@ CREATE TABLE `project_buyin_authorStatData` ( /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; --- Dump completed on 2023-07-11 20:41:43 +-- Dump completed on 2023-07-12 17:09:39 diff --git a/抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py b/抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py new file mode 100644 index 0000000..e69de29