diff --git a/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py b/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py
index 7a4eecc..f386db2 100644
--- a/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py
+++ b/抖音js逆向学习/抖店精选联盟数据/dispatch/创建巨量百应主播详情爬虫.py
@@ -1,6 +1,5 @@
 from base import Base
 import datetime
-import json
 
 
 class 创建巨量百应主播详情爬虫(Base):
@@ -44,17 +43,17 @@ if __name__ == '__main__':
     while True:
         sql = f"""
         SELECT
-            data, deduplication
+            uid,
+            log_id
         FROM
-            buyin_authorStatData_seekAuthor
+            clean_buyin_authorStatData_seekAuthor
+        WHERE
+            author_base_uid NOT IN (SELECT uid FROM clean_buyin_contact_info)
         LIMIT 1000 OFFSET {offset}
         """
         msg = d.eb_supports.query(sql)
         list_dict = []
-        for data, deduplication in msg:
-            data = json.loads(data)
-            log_id = deduplication.split('&')[0]
-            uid = data.get('author_base').get('uid')
+        for uid, log_id in msg:
             item = {"task_id": task_id, "uid": uid, "log_id": log_id}
             list_dict.append(item)
         if list_dict:
diff --git a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorProfile.py b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorProfile.py
new file mode 100644
index 0000000..d82b1b8
--- /dev/null
+++ b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_authorProfile.py
@@ -0,0 +1,105 @@
+from base import Base
+import json
+
+
+class CleanBuyinAuthorStatDataAuthorProfile(Base):
+    name = 'buyin_authorStatData_authorProfile'
+
+    def __init__(self):
+        super(CleanBuyinAuthorStatDataAuthorProfile, self).__init__()
+        self.table = self.name
+        self.clean_table = "clean_" + self.table
+
+    def process_item(self, resp):
+        list_res = []
+        if not resp:
+            self.log(f'清洗{self.table}数据-不存在')
+            return ''
+
+        for task_id, data, deduplication, update_time in resp:
+            data = json.loads(data)
+            item = {
+                "task_id": task_id,
+                "uid": deduplication.replace("uid=", ""),
+                "level": data.get('level'),
+                "account_douyin": data.get('account_douyin'),
+                "gender": data.get('gender'),
+                "city": data.get('city'),
+                "bind_lark_status": 1 if data.get('bind_lark_status') else 0,
+                "nickname": data.get('nickname'),
+                "fans_sum": data.get('fans_sum'),
+                "works_type": str(data.get('works_type')) if data.get('works_type') else '',
+                "agency": data.get('agency'),
+                "product_main_type": str(data.get('product_main_type')) if data.get('product_main_type') else '',
+                "product_main_type_array": str(data.get('product_main_type_array')) if data.get('product_main_type_array') else '',
+                "score": data.get('score'),
+                "reputation_level": data.get('reputation_level'),
+                "special_price": data.get('special_price'),
+                "join_price": data.get('join_price'),
+                "bargaining": data.get('bargaining'),
+                "duration": data.get('duration'),
+                "in_business": data.get('in_business'),
+                "introduction": data.get('introduction'),
+                "sale_type": data.get('sale_type'),
+                "rec_reason": str(data.get('rec_reason')) if data.get('rec_reason') else '',
+                "daren_plaza_rec_reason": str(data.get('daren_plaza_rec_reason')) if data.get('daren_plaza_rec_reason') else '',
+                "avatar": data.get('avatar'),
+                "error_msg": data.get('error_msg'),
+                "share_url_douyin": data.get('share_url_douyin'),
+                "recommend_reasons": str(data.get('recommend_reasons')) if data.get('recommend_reasons') else '',
+                "credit_score": data.get('credit_score'),
+                "intention_catgory": str(data.get('intention_catgory')) if data.get('intention_catgory') else '',
+                "cooperate_mode": data.get('cooperate_mode'),
+                "commission_ratio": data.get('commission_ratio'),
+                "sell_requirement": str(data.get('sell_requirement')) if data.get('sell_requirement') else '',
+                "dark_horses": str(data.get('dark_horses')) if data.get('dark_horses') else '',
+                "high_online_reply_rate": 1 if data.get('high_online_reply_rate') else 0,
+                "high_invitation_reply_rate": 1 if data.get('high_invitation_reply_rate') else 0,
+                "insitution_id": data.get('insitution_id'),
+                "web_homepage_url": data.get('web_homepage_url'),
+                "high_cooperation": 1 if data.get('high_cooperation') else 0,
+                "is_star": 1 if data.get('is_star') else 0,
+                "tags": str(data.get('tags')) if data.get('tags') else '',
+                "act_info": data.get('act_info') if data.get('act_info') else '',
+                "deduplication": deduplication,
+                "spider_time": update_time
+            }
+            list_res.append(item)
+        db_res = self.eb_supports.insert_many(self.clean_table,
+                                              list_res,
+                                              conflict=[
+                                                  "task_id", "uid", "level", "account_douyin", "gender",
+                                                  "city", "bind_lark_status", "nickname", "fans_sum",
+                                                  "works_type", "agency", "product_main_type",
+                                                  "product_main_type_array", "score", "reputation_level",
+                                                  "special_price", "join_price",
+                                                  "bargaining", "duration", "in_business", "introduction",
+                                                  "sale_type", "rec_reason", "daren_plaza_rec_reason", "avatar",
+                                                  "error_msg", "share_url_douyin", "recommend_reasons", "credit_score",
+                                                  "intention_catgory", "cooperate_mode", "commission_ratio",
+                                                  "sell_requirement",
+                                                  "dark_horses", "high_online_reply_rate",
+                                                  "high_invitation_reply_rate",
+                                                  "insitution_id", "web_homepage_url", "high_cooperation",
+                                                  "is_star", "tags", "act_info", "deduplication", "spider_time"
+                                              ]
+                                              )
+        if db_res >= 0:
+            return True, self.table, db_res
+        else:
+            return False, self.table, db_res
+
+
+if __name__ == '__main__':
+    offset = 0
+    qc = CleanBuyinAuthorStatDataAuthorProfile()
+    while True:
+        sql = f"""
+        select task_id, data, deduplication, update_time from buyin_authorStatData_authorProfile where
+        date_sub(CURDATE(),INTERVAL 5 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset};
+        """
+        res = qc.eb_supports.query(sql)
+        if not res:
+            break
+        qc.process_item(res)
+        offset += 1000
diff --git a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py
index 96ecfee..f179bca 100644
--- a/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py
+++ b/抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_authorStatData_seekAuthor.py
@@ -20,8 +20,11 @@ class CleanBuyinAuthorStatDataSeekAutho(Base):
             author_base = json.loads(data).get('author_base')
             author_tag = json.loads(data).get('author_tag')
             uid = deduplication.split('&')[1]
+            log_id = deduplication.split('&')[0]
             item = {
                 "task_id": task_id,
+                "uid": author_base.get('uid'),
+                "log_id": log_id,
                 "author_base_uid": uid.replace("uid=", ""),
                 "author_base_nickname": author_base.get('nickname'),
                 "author_base_avatar": author_base.get('avatar'),
@@ -47,7 +50,7 @@ class CleanBuyinAuthorStatDataSeekAutho(Base):
         db_res = self.eb_supports.insert_many(self.clean_table,
                                               list_res,
                                               conflict=[
-                                                  "task_id", "author_base_uid", "author_base_nickname",
+                                                  "task_id", "uid", "log_id", "author_base_uid", "author_base_nickname",
                                                   "author_base_avatar", "author_base_fans_num",
                                                   "author_base_gender", "author_base_city",
                                                   "author_base_author_level", "author_base_avatar_big",
diff --git a/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py b/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py
index 6b4ebfc..b54a904 100644
--- a/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py
+++ b/抖音js逆向学习/抖店精选联盟数据/spider/browser_baiyin.py
@@ -57,18 +57,18 @@ class BrowserBaiyin(Base):
             deduplication = project_item[3]
             self.browser.get(payload_get)
             time.sleep(15)
-            # elements_img = self.browser.find_elements(By.XPATH, '//div[@class="contact_way_info_block_item"]'
-            #                                                     '//img[@elementtiming="element-timing"]')
-            # elements_img[0].click()
-            # time.sleep(5)
-            # elements_ck = self.browser.find_elements(By.XPATH, '//button[@class="auxo-btn auxo-btn-primary"]'
-            #                                                    '/span[text()="查看"]')
-            # if elements_ck:
-            #     elements_ck[0].click()
-            #
-            # if len(elements_img) > 1:
-            #     elements_img[1].click()
-            #     time.sleep(5)
+            elements_img = self.browser.find_elements(By.XPATH, '//div[@class="contact_way_info_block_item"]'
+                                                                '//img[@elementtiming="element-timing"]')
+            elements_img[0].click()
+            time.sleep(5)
+            elements_ck = self.browser.find_elements(By.XPATH, '//button[@class="auxo-btn auxo-btn-primary"]'
+                                                               '/span[text()="查看"]')
+            if elements_ck:
+                elements_ck[0].click()
+
+            if len(elements_img) > 1:
+                elements_img[1].click()
+                time.sleep(5)
             sql = f"SELECT task_id FROM {self.table} where deduplication = '{deduplication}' limit 1"
             msg = self.eb_supports.query(sql)
             if msg:
diff --git a/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql b/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql
index 51c7e58..fc86503 100644
--- a/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql
+++ b/抖音js逆向学习/抖店精选联盟数据/sql/baiyin/eb_supports_baiyin.sql
@@ -33,6 +33,24 @@ CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
 /*!40101 SET character_set_client = @saved_cs_client */;
 
+--
+-- Table structure for table `buyin_authorStatData_authorProfile`
+--
+
+DROP TABLE IF EXISTS `buyin_authorStatData_authorProfile`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `buyin_authorStatData_authorProfile` (
+  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
+  `data` mediumtext COMMENT '数据结果',
+  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
+  `status` smallint(6) DEFAULT '0' COMMENT '状态',
+  `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
+  `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
+  KEY `task_id` (`task_id`,`deduplication`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
 --
 -- Table structure for table `buyin_authorStatData_seekAuthor`
 --
@@ -69,6 +87,118 @@ CREATE TABLE `buyin_contact_info` (
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
 /*!40101 SET character_set_client = @saved_cs_client */;
 
+--
+-- Table structure for table `clean_buyin_authorStatData_authorOverviewV2`
+--
+
+DROP TABLE IF EXISTS `clean_buyin_authorStatData_authorOverviewV2`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `clean_buyin_authorStatData_authorOverviewV2` (
+  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
+  `uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
+  `live_data_percentage` int(11) DEFAULT '0',
+  `live_data_count` int(11) DEFAULT '0',
+  `live_data_watching_num` int(11) DEFAULT '0',
+  `live_data_sale_low` int(11) DEFAULT '0',
+  `live_data_sale_high` int(11) DEFAULT '0',
+  `live_data_sale_value` int(11) DEFAULT '0',
+  `live_data_GPM_low` int(11) DEFAULT '0',
+  `live_data_GPM_high` int(11) DEFAULT '0',
+  `live_data_GPM_value` int(11) DEFAULT '0',
+  `live_data_sale_status` int(11) DEFAULT '0',
+  `live_data_GPM_status` int(11) DEFAULT '0',
+  `live_data_recommend_rate` int(11) DEFAULT '0',
+  `live_data_high_recommend_rate` int(11) DEFAULT '0',
+  `live_data_high_live_day` int(11) DEFAULT '0',
+  `live_data_high_live_product_num` int(11) DEFAULT '0',
+  `live_data_high_avg_live_online_dur` int(11) DEFAULT '0',
+  `live_data_high_avg_live_avg_peru_watch_dur` int(11) DEFAULT '0',
+  `live_data_high_avg_live_med_peru_watch_dur` int(11) DEFAULT '0',
+  `live_data_high_avg_live_avg_price` int(11) DEFAULT '0',
+  `video_data_percentage` int(11) DEFAULT '0',
+  `video_data_count` int(11) DEFAULT '0',
+  `video_data_watching_num` int(11) DEFAULT '0',
+  `video_data_sale_low` int(11) DEFAULT '0',
+  `video_data_sale_high` int(11) DEFAULT '0',
+  `video_data_sale_value` int(11) DEFAULT '0',
+  `video_data_GPM_low` int(11) DEFAULT '0',
+  `video_data_GPM_high` int(11) DEFAULT '0',
+  `video_data_GPM_value` int(11) DEFAULT '0',
+  `video_data_sale_status` int(11) DEFAULT '0',
+  `video_data_GPM_status` int(11) DEFAULT '0',
+  `video_data_recommend_rate` int(11) DEFAULT '0',
+  `video_data_high_recommend_rate` int(11) DEFAULT '0',
+  `video_data_live_day` int(11) DEFAULT '0',
+  `video_data_live_product_num` int(11) DEFAULT '0',
+  `video_data_avg_live_online_dur` int(11) DEFAULT '0',
+  `video_data_live_avg_peru_watch_dur` int(11) DEFAULT '0',
+  `video_data_live_med_peru_watch_dur` int(11) DEFAULT '0',
+  `video_data_live_avg_price` int(11) DEFAULT '0',
+  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
+  `spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
+  UNIQUE KEY `deduplication` (`deduplication`) USING BTREE,
+  KEY `uid` (`uid`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
+--
+-- Table structure for table `clean_buyin_authorStatData_authorProfile`
+--
+
+DROP TABLE IF EXISTS `clean_buyin_authorStatData_authorProfile`;
+/*!40101 SET @saved_cs_client = @@character_set_client */;
+/*!50503 SET character_set_client = utf8mb4 */;
+CREATE TABLE `clean_buyin_authorStatData_authorProfile` (
+  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
+  `uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
+  `level` smallint(1) DEFAULT '0' COMMENT '等级',
+  `account_douyin` varchar(50) DEFAULT '' COMMENT 'douyin_id',
+  `gender` smallint(1) DEFAULT '0' COMMENT '性别',
+  `city` varchar(50) DEFAULT '' COMMENT '城市',
+  `bind_lark_status` smallint(1) DEFAULT '0',
+  `nickname` varchar(50) DEFAULT '' COMMENT '昵称',
+  `fans_sum` varchar(50) DEFAULT '' COMMENT '粉丝',
+  `works_type` varchar(100) DEFAULT '' COMMENT '工作类别',
+  `agency` varchar(100) DEFAULT '',
+  `product_main_type` varchar(255) DEFAULT '' COMMENT '类别',
+  `product_main_type_array` varchar(255) DEFAULT '' COMMENT '主类别',
+  `score` varchar(10) DEFAULT '',
+  `reputation_level` smallint(1) DEFAULT '0',
+  `special_price` varchar(10) DEFAULT '',
+  `join_price` varchar(10) DEFAULT '',
+  `bargaining` int(11) DEFAULT '0',
+  `duration` int(11) DEFAULT '0',
+  `in_business` int(11) DEFAULT '0',
+  `introduction` varchar(100) DEFAULT '',
+  `sale_type` varchar(100) DEFAULT '',
+  `rec_reason` varchar(255) DEFAULT '',
+  `daren_plaza_rec_reason` varchar(255) DEFAULT '',
+  `avatar` varchar(255) DEFAULT '',
+  `error_msg` varchar(50) DEFAULT '',
+  `share_url_douyin` varchar(50) DEFAULT '',
+  `recommend_reasons` varchar(255) DEFAULT '',
+  `credit_score` int(11) DEFAULT '0',
+  `intention_catgory` varchar(255) DEFAULT '',
+  `cooperate_mode` varchar(255) DEFAULT '',
+  `commission_ratio` varchar(255) DEFAULT '',
+  `sell_requirement` varchar(255) DEFAULT '',
+  `dark_horses` varchar(255) DEFAULT '',
+  `high_online_reply_rate` smallint(1) DEFAULT '0',
+  `high_invitation_reply_rate` smallint(1) DEFAULT '0',
+  `insitution_id` varchar(255) DEFAULT '',
+  `web_homepage_url` varchar(255) DEFAULT '',
+  `high_cooperation` smallint(1) DEFAULT '0',
+  `is_star` smallint(1) DEFAULT '0',
+  `tags` varchar(255) DEFAULT '',
+  `act_info` varchar(255) DEFAULT '',
+  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
+  `spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
+  UNIQUE KEY `task_id` (`deduplication`) USING BTREE,
+  KEY `uid` (`uid`) USING BTREE
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
+/*!40101 SET character_set_client = @saved_cs_client */;
+
 --
 -- Table structure for table `clean_buyin_authorStatData_seekAuthor`
 --
@@ -78,6 +208,8 @@ DROP TABLE IF EXISTS `clean_buyin_authorStatData_seekAuthor`;
 /*!50503 SET character_set_client = utf8mb4 */;
 CREATE TABLE `clean_buyin_authorStatData_seekAuthor` (
   `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
+  `uid` text COMMENT 'uid',
+  `log_id` varchar(100) DEFAULT '',
   `author_base_uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
   `author_base_nickname` varchar(100) DEFAULT '' COMMENT '昵称',
   `author_base_avatar` varchar(100) DEFAULT '' COMMENT '头像',
@@ -151,4 +283,4 @@ CREATE TABLE `project_buyin_authorStatData` (
 /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
 /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
 
--- Dump completed on 2023-07-12 17:09:39
+-- Dump completed on 2023-07-13 12:09:55
diff --git a/抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py b/抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py
index 9ba3226..f75464b 100644
--- a/抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py
+++ b/抖音js逆向学习/抖店精选联盟数据/tool/精选联盟达人清单.py
@@ -1,4 +1,5 @@
 from base import Base
+import pandas as pd
 
 
 class 精选联盟达人清单(Base):
@@ -8,17 +9,69 @@ class 精选联盟达人清单(Base):
         self.clean_buyin_authorStatData_authorOverviewV2 = 'clean_buyin_authorStatData_authorOverviewV2'
         self.clean_buyin_contact_info = 'clean_buyin_contact_info'
         self.clean_buyin_authorStatData_seekAuthor = 'clean_buyin_authorStatData_seekAuthor'
+        self.clean_buyin_authorStatData_authorProfile = 'clean_buyin_authorStatData_authorProfile'
 
-    def get_excel(self):
+    def export_excel(self, export):
+        # Convert the list of dicts to a DataFrame
+        pf = pd.DataFrame(list(export))
+        columns = ['抖音账户', '抖音ID', '等级LV', '粉丝数', '地址', '主推类目', '直播带货销售占比', '带货直播场次',
+                   '带货直播观看人数', '场均销售额', '直播GPM', '视频带货销售额占比', '带货视频数量', '带货视频播放量',
+                   '单视频销售额', '视频GPM']
+        pf.columns = columns
+        file_path = pd.ExcelWriter('../file/name.xlsx')
+        # Replace empty cells
+        pf.fillna(' ', inplace=True)
+        # Write the output
+        pf.to_excel(file_path, index=False)
+        # Save the workbook
+        file_path.close()
+
+    def get_res(self):
         sql = f"""
-        select author_base_uid, author_base_nickname, author_base_fans_num, author_base_author_level from
-        {self.clean_buyin_authorStatData_seekAuthor};
+        SELECT
+            nickname as '抖音账户',
+            account_douyin as '抖音ID',
+            LEVEL as '等级LV',
+            fans_sum as '粉丝数',
+            city as '地址',
+            product_main_type as '主推类目',
+            b.`直播带货销售占比`,
+            b.`带货直播场次`,
+            b.`带货直播观看人数`,
+            b.`场均销售额`,
+            b.`直播GPM`,
+            b.`视频带货销售额占比`,
+            b.`带货视频数量`,
+            b.`带货视频播放量`,
+            b.`单视频销售额`,
+            b.`视频GPM`
+        FROM
+            clean_buyin_authorStatData_authorProfile c
+        RIGHT JOIN (
+            SELECT
+                uid,
+                live_data_percentage as '直播带货销售占比',
+                live_data_count as '带货直播场次',
+                live_data_watching_num as '带货直播观看人数',
+                concat_ws('-', live_data_sale_low, live_data_sale_high) as '场均销售额',
+                concat_ws('-', live_data_GPM_low, live_data_GPM_high) as '直播GPM',
+                video_data_percentage as '视频带货销售额占比',
+                video_data_count as '带货视频数量',
+                video_data_watching_num as '带货视频播放量',
+                concat_ws('-', video_data_sale_low, video_data_sale_high) as '单视频销售额',
+                concat_ws('-', video_data_GPM_low, video_data_GPM_high) as '视频GPM'
+            FROM
+                clean_buyin_authorStatData_authorOverviewV2
+            ) b ON c.uid = b.uid
+        ORDER BY
+            c.LEVEL DESC
         """
-        res = self.eb_supports.query(sql)
-        print(res)
+        res_list = self.eb_supports.query(sql)
+        return res_list
 
     def run(self):
-        self.get_excel()
+        res_list = self.get_res()
+        self.export_excel(res_list)
 
 
 if __name__ == '__main__':