Doudian Selected Alliance (抖店精选联盟) data

aiyingfeng 2023-07-13 20:51:45 +08:00
parent d4dbfd4ad2
commit 6afa60b5d6
6 changed files with 319 additions and 27 deletions

View File

@@ -1,6 +1,5 @@
 from base import Base
 import datetime
-import json
 class 创建巨量百应主播详情爬虫(Base):
@@ -44,17 +43,17 @@ if __name__ == '__main__':
     while True:
         sql = f"""
         SELECT
-            data, deduplication
+            uid,
+            log_id
         FROM
-            buyin_authorStatData_seekAuthor
+            clean_buyin_authorStatData_seekAuthor
+        WHERE
+            author_base_uid NOT IN (SELECT uid FROM clean_buyin_contact_info)
         LIMIT 1000 OFFSET {offset}
         """
         msg = d.eb_supports.query(sql)
         list_dict = []
-        for data, deduplication in msg:
-            data = json.loads(data)
-            log_id = deduplication.split('&')[0]
-            uid = data.get('author_base').get('uid')
+        for uid, log_id in msg:
             item = {"task_id": task_id, "uid": uid, "log_id": log_id}
             list_dict.append(item)
         if list_dict:
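
For reference, this is how the dispatch loop reads after the change, assembled from the added lines above (d, task_id and offset are defined earlier in the original script). Note that if clean_buyin_contact_info grows while this loop runs, LIMIT/OFFSET paging can skip rows, since the filtered result set shifts between queries:

while True:
    sql = f"""
    SELECT
        uid,
        log_id
    FROM
        clean_buyin_authorStatData_seekAuthor
    WHERE
        author_base_uid NOT IN (SELECT uid FROM clean_buyin_contact_info)
    LIMIT 1000 OFFSET {offset}
    """
    msg = d.eb_supports.query(sql)
    list_dict = []
    for uid, log_id in msg:
        item = {"task_id": task_id, "uid": uid, "log_id": log_id}
        list_dict.append(item)
    if list_dict:
        ...  # the remainder of the loop lies outside this hunk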

View File

@@ -0,0 +1,105 @@
from base import Base
import json
class CleanBuyinAuthorStatDataAuthorProfile(Base):
name = 'buyin_authorStatData_authorProfile'
def __init__(self):
super(CleanBuyinAuthorStatDataAuthorProfile, self).__init__()
self.table = self.name
self.clean_table = "clean_" + self.table
def process_item(self, resp):
list_res = []
if not resp:
self.log(f'清洗{self.table}数据-不存在')
return ''
for task_id, data, deduplication, update_time in resp:
data = json.loads(data)
item = {
"task_id": task_id,
"uid": deduplication.replace("uid=", ""),
"level": data.get('level'),
"account_douyin": data.get('account_douyin'),
"gender": data.get('gender'),
"city": data.get('city'),
"bind_lark_status": 1 if data.get('bind_lark_status') else 0,
"nickname": data.get('nickname'),
"fans_sum": data.get('fans_sum'),
"works_type": str(data.get('works_type')) if data.get('works_type') else '',
"agency": data.get('agency'),
"product_main_type": str(data.get('product_main_type')) if data.get('product_main_type') else '',
"product_main_type_array": str(data.get('product_main_type_array')) if data.get('product_main_type_array') else '',
"score": data.get('score'),
"reputation_level": data.get('reputation_level'),
"special_price": data.get('special_price'),
"join_price": data.get('join_price'),
"bargaining": data.get('bargaining'),
"duration": data.get('duration'),
"in_business": data.get('in_business'),
"introduction": data.get('introduction'),
"sale_type": data.get('sale_type'),
"rec_reason": str(data.get('rec_reason')) if data.get('rec_reason') else '',
"daren_plaza_rec_reason": str(data.get('daren_plaza_rec_reason')) if data.get('daren_plaza_rec_reason') else '',
"avatar": data.get('avatar'),
"error_msg": data.get('error_msg'),
"share_url_douyin": data.get('share_url_douyin'),
"recommend_reasons": str(data.get('recommend_reasons')) if data.get('recommend_reasons') else '',
"credit_score": data.get('credit_score'),
"intention_catgory": str(data.get('intention_catgory')) if data.get('intention_catgory') else '',
"cooperate_mode": data.get('cooperate_mode'),
"commission_ratio": data.get('commission_ratio'),
"sell_requirement": str(data.get('sell_requirement')) if data.get('sell_requirement') else '',
"dark_horses": str(data.get('dark_horses')) if data.get('dark_horses') else '',
"high_online_reply_rate": 1 if data.get('high_online_reply_rate') else 0,
"high_invitation_reply_rate": 1 if data.get('high_invitation_reply_rate') else 0,
"insitution_id": data.get('insitution_id'),
"web_homepage_url": data.get('web_homepage_url'),
"high_cooperation": 1 if data.get('high_cooperation') else 0,
"is_star": 1 if data.get('is_star') else 0,
"tags": str(data.get('tags')) if data.get('tags') else '',
"act_info": data.get('act_info') if data.get('act_info') else '',
"deduplication": deduplication,
"spider_time": update_time
}
list_res.append(item)
db_res = self.eb_supports.insert_many(self.clean_table,
list_res,
conflict=[
"task_id", "uid", "level", "account_douyin", "gender",
"city", "bind_lark_status", "nickname", "fans_sum",
"works_type", "agency", "product_main_type",
"product_main_type_array", "score", "reputation_level",
"special_price", "join_price",
"bargaining", "duration", "in_business", "introduction",
"sale_type", "rec_reason", "daren_plaza_rec_reason", "avatar",
"error_msg", "share_url_douyin", "recommend_reasons", "credit_score",
"intention_catgory", "cooperate_mode", "commission_ratio",
"sell_requirement",
"sell_requirement", "dark_horses", "high_online_reply_rate",
"high_invitation_reply_rate", "high_invitation_reply_rate",
"insitution_id", "web_homepage_url", "high_cooperation",
"is_star", "tags", "act_info", "deduplication", "spider_time"
]
)
if db_res >= 0:
return True, self.table, db_res
else:
return False, self.table, db_res
if __name__ == '__main__':
offset = 0
qc = CleanBuyinAuthorStatDataAuthorProfile()
while True:
sql = f"""
select task_id, data, deduplication, update_time from buyin_authorStatData_authorProfile where
date_sub(CURDATE(),INTERVAL 5 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset};
"""
res = qc.eb_supports.query(sql)
if not res:
break
qc.process_item(res)
offset += 1000
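
To make the flattening rules in process_item concrete, here is a minimal, self-contained sketch using a made-up raw row in the same (task_id, data, deduplication, update_time) shape that the __main__ query returns; all field values are hypothetical:

import json

raw = ("task-001",
       json.dumps({"level": 5, "nickname": "示例达人", "tags": ["美妆", "好物"], "bind_lark_status": True}),
       "uid=1234567890",
       "2023-07-13 12:00:00")

task_id, data, deduplication, update_time = raw
data = json.loads(data)
print(deduplication.replace("uid=", ""))         # uid column                  -> 1234567890
print(str(data.get("tags")))                     # list fields are stringified -> "['美妆', '好物']"
print(1 if data.get("bind_lark_status") else 0)  # truthy flags become 0/1     -> 1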

View File

@@ -20,8 +20,11 @@ class CleanBuyinAuthorStatDataSeekAutho(Base):
             author_base = json.loads(data).get('author_base')
             author_tag = json.loads(data).get('author_tag')
             uid = deduplication.split('&')[1]
+            log_id = deduplication.split('&')[0]
             item = {
                 "task_id": task_id,
+                "uid": author_base.get('uid'),
+                "log_id": log_id,
                 "author_base_uid": uid.replace("uid=", ""),
                 "author_base_nickname": author_base.get('nickname'),
                 "author_base_avatar": author_base.get('avatar'),
@@ -47,7 +50,7 @@ class CleanBuyinAuthorStatDataSeekAutho(Base):
         db_res = self.eb_supports.insert_many(self.clean_table,
                                               list_res,
                                               conflict=[
-                                                  "task_id", "author_base_uid", "author_base_nickname",
+                                                  "task_id", "uid", "log_id", "author_base_uid", "author_base_nickname",
                                                   "author_base_avatar", "author_base_fans_num",
                                                   "author_base_gender", "author_base_city",
                                                   "author_base_author_level", "author_base_avatar_big",

View File

@@ -57,18 +57,18 @@ class BrowserBaiyin(Base):
             deduplication = project_item[3]
             self.browser.get(payload_get)
             time.sleep(15)
-            # elements_img = self.browser.find_elements(By.XPATH, '//div[@class="contact_way_info_block_item"]'
-            #                                                     '//img[@elementtiming="element-timing"]')
-            # elements_img[0].click()
-            # time.sleep(5)
-            # elements_ck = self.browser.find_elements(By.XPATH, '//button[@class="auxo-btn auxo-btn-primary"]'
-            #                                                    '/span[text()="查看"]')
-            # if elements_ck:
-            #     elements_ck[0].click()
-            #
-            # if len(elements_img) > 1:
-            #     elements_img[1].click()
-            #     time.sleep(5)
+            elements_img = self.browser.find_elements(By.XPATH, '//div[@class="contact_way_info_block_item"]'
+                                                                '//img[@elementtiming="element-timing"]')
+            elements_img[0].click()
+            time.sleep(5)
+            elements_ck = self.browser.find_elements(By.XPATH, '//button[@class="auxo-btn auxo-btn-primary"]'
+                                                               '/span[text()="查看"]')
+            if elements_ck:
+                elements_ck[0].click()
+
+            if len(elements_img) > 1:
+                elements_img[1].click()
+                time.sleep(5)
             sql = f"SELECT task_id FROM {self.table} where deduplication = '{deduplication}' limit 1"
             msg = self.eb_supports.query(sql)
             if msg:
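
The re-enabled block relies on fixed time.sleep() calls and on elements_img[0] existing. As a hedged alternative, a sketch using Selenium's explicit waits (this is not what the commit does; browser stands for the existing webdriver instance, and the XPaths are copied from the code above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click_contact_images(browser, timeout=20):
    # Wait until the contact-info images are present instead of sleeping a fixed time.
    wait = WebDriverWait(browser, timeout)
    imgs = wait.until(EC.presence_of_all_elements_located(
        (By.XPATH, '//div[@class="contact_way_info_block_item"]'
                   '//img[@elementtiming="element-timing"]')))
    for img in imgs[:2]:  # the original code clicks at most two images
        img.click()
        confirm = browser.find_elements(
            By.XPATH, '//button[@class="auxo-btn auxo-btn-primary"]/span[text()="查看"]')
        if confirm:
            confirm[0].click()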

View File

@@ -33,6 +33,24 @@ CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `buyin_authorStatData_authorProfile`
--
DROP TABLE IF EXISTS `buyin_authorStatData_authorProfile`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `buyin_authorStatData_authorProfile` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`data` mediumtext COMMENT '数据结果',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
`status` smallint(6) DEFAULT '0' COMMENT '状态',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `buyin_authorStatData_seekAuthor`
--
@@ -69,6 +87,118 @@ CREATE TABLE `buyin_contact_info` (
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `clean_buyin_authorStatData_authorOverviewV2`
--
DROP TABLE IF EXISTS `clean_buyin_authorStatData_authorOverviewV2`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `clean_buyin_authorStatData_authorOverviewV2` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
`live_data_percentage` int(11) DEFAULT '0',
`live_data_count` int(11) DEFAULT '0',
`live_data_watching_num` int(11) DEFAULT '0',
`live_data_sale_low` int(11) DEFAULT '0',
`live_data_sale_high` int(11) DEFAULT '0',
`live_data_sale_value` int(11) DEFAULT '0',
`live_data_GPM_low` int(11) DEFAULT '0',
`live_data_GPM_high` int(11) DEFAULT '0',
`live_data_GPM_value` int(11) DEFAULT '0',
`live_data_sale_status` int(11) DEFAULT '0',
`live_data_GPM_status` int(11) DEFAULT '0',
`live_data_recommend_rate` int(11) DEFAULT '0',
`live_data_high_recommend_rate` int(11) DEFAULT '0',
`live_data_high_live_day` int(11) DEFAULT '0',
`live_data_high_live_product_num` int(11) DEFAULT '0',
`live_data_high_avg_live_online_dur` int(11) DEFAULT '0',
`live_data_high_avg_live_avg_peru_watch_dur` int(11) DEFAULT '0',
`live_data_high_avg_live_med_peru_watch_dur` int(11) DEFAULT '0',
`live_data_high_avg_live_avg_price` int(11) DEFAULT '0',
`video_data_percentage` int(11) DEFAULT '0',
`video_data_count` int(11) DEFAULT '0',
`video_data_watching_num` int(11) DEFAULT '0',
`video_data_sale_low` int(11) DEFAULT '0',
`video_data_sale_high` int(11) DEFAULT '0',
`video_data_sale_value` int(11) DEFAULT '0',
`video_data_GPM_low` int(11) DEFAULT '0',
`video_data_GPM_high` int(11) DEFAULT '0',
`video_data_GPM_value` int(11) DEFAULT '0',
`video_data_sale_status` int(11) DEFAULT '0',
`video_data_GPM_status` int(11) DEFAULT '0',
`video_data_recommend_rate` int(11) DEFAULT '0',
`video_data_high_recommend_rate` int(11) DEFAULT '0',
`video_data_live_day` int(11) DEFAULT '0',
`video_data_live_product_num` int(11) DEFAULT '0',
`video_data_avg_live_online_dur` int(11) DEFAULT '0',
`video_data_live_avg_peru_watch_dur` int(11) DEFAULT '0',
`video_data_live_med_peru_watch_dur` int(11) DEFAULT '0',
`video_data_live_avg_price` int(11) DEFAULT '0',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
`spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
UNIQUE KEY `deduplication` (`deduplication`) USING BTREE,
KEY `uid` (`uid`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `clean_buyin_authorStatData_authorProfile`
--
DROP TABLE IF EXISTS `clean_buyin_authorStatData_authorProfile`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `clean_buyin_authorStatData_authorProfile` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
`level` smallint(1) DEFAULT '0' COMMENT '等级',
`account_douyin` varchar(50) DEFAULT '' COMMENT 'douyin_id',
`gender` smallint(1) DEFAULT '0' COMMENT '性别',
`city` varchar(50) DEFAULT '' COMMENT '城市',
`bind_lark_status` smallint(1) DEFAULT '0',
`nickname` varchar(50) DEFAULT '' COMMENT '昵称',
`fans_sum` varchar(50) DEFAULT '' COMMENT '粉丝',
`works_type` varchar(100) DEFAULT '' COMMENT '工作类别',
`agency` varchar(100) DEFAULT '',
`product_main_type` varchar(255) DEFAULT '' COMMENT '类别',
`product_main_type_array` varchar(255) DEFAULT '' COMMENT '主类别',
`score` varchar(10) DEFAULT '',
`reputation_level` smallint(1) DEFAULT '0',
`special_price` varchar(10) DEFAULT '',
`join_price` varchar(10) DEFAULT '',
`bargaining` int(11) DEFAULT '0',
`duration` int(11) DEFAULT '0',
`in_business` int(11) DEFAULT '0',
`introduction` varchar(100) DEFAULT '',
`sale_type` varchar(100) DEFAULT '',
`rec_reason` varchar(255) DEFAULT '',
`daren_plaza_rec_reason` varchar(255) DEFAULT '',
`avatar` varchar(255) DEFAULT '',
`error_msg` varchar(50) DEFAULT '',
`share_url_douyin` varchar(50) DEFAULT '',
`recommend_reasons` varchar(255) DEFAULT '',
`credit_score` int(11) DEFAULT '0',
`intention_catgory` varchar(255) DEFAULT '',
`cooperate_mode` varchar(255) DEFAULT '',
`commission_ratio` varchar(255) DEFAULT '',
`sell_requirement` varchar(255) DEFAULT '',
`dark_horses` varchar(255) DEFAULT '',
`high_online_reply_rate` smallint(1) DEFAULT '0',
`high_invitation_reply_rate` smallint(1) DEFAULT '0',
`insitution_id` varchar(255) DEFAULT '',
`web_homepage_url` varchar(255) DEFAULT '',
`high_cooperation` smallint(1) DEFAULT '0',
`is_star` smallint(1) DEFAULT '0',
`tags` varchar(255) DEFAULT '',
`act_info` varchar(255) DEFAULT '',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
`spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
UNIQUE KEY `task_id` (`deduplication`) USING BTREE,
KEY `uid` (`uid`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `clean_buyin_authorStatData_seekAuthor`
--
@@ -78,6 +208,8 @@ DROP TABLE IF EXISTS `clean_buyin_authorStatData_seekAuthor`;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `clean_buyin_authorStatData_seekAuthor` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`uid` text COMMENT 'uid',
`log_id` varchar(100) DEFAULT '',
`author_base_uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
`author_base_nickname` varchar(100) DEFAULT '' COMMENT '昵称',
`author_base_avatar` varchar(100) DEFAULT '' COMMENT '头像',
@@ -151,4 +283,4 @@ CREATE TABLE `project_buyin_authorStatData` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2023-07-12 17:09:39
-- Dump completed on 2023-07-13 12:09:55
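
Both new clean_* tables carry a UNIQUE KEY on deduplication, so re-running the cleaners should not duplicate rows. A minimal sketch of the kind of idempotent write that key enables, assuming a pymysql connection (connection parameters are placeholders, the sample row is hypothetical, and the insert_many/conflict helper used in the Python files is internal to this repo, so this is illustrative only):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='user', password='***', database='eb_supports')
sql = """
    INSERT INTO clean_buyin_authorStatData_authorProfile
        (task_id, uid, nickname, fans_sum, deduplication)
    VALUES (%s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE nickname = VALUES(nickname), fans_sum = VALUES(fans_sum)
"""
with conn.cursor() as cur:
    # The unique `deduplication` value keeps the write idempotent across re-runs.
    cur.execute(sql, ('task-001', '1234567890', '示例达人', '10w', 'uid=1234567890'))
conn.commit()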

View File

@@ -1,4 +1,5 @@
 from base import Base
+import pandas as pd
 class 精选联盟达人清单(Base):
@@ -8,17 +9,69 @@ class 精选联盟达人清单(Base):
         self.clean_buyin_authorStatData_authorOverviewV2 = 'clean_buyin_authorStatData_authorOverviewV2'
         self.clean_buyin_contact_info = 'clean_buyin_contact_info'
         self.clean_buyin_authorStatData_seekAuthor = 'clean_buyin_authorStatData_seekAuthor'
+        self.clean_buyin_authorStatData_authorProfile = 'clean_buyin_authorStatData_authorProfile'
 
-    def get_excel(self):
+    def export_excel(self, export):
+        # Convert the list of result rows into a DataFrame
+        pf = pd.DataFrame(list(export))
+        columns = ['抖音账户', '抖音ID', '等级LV', '粉丝数', '地址', '主推类目', '直播带货销售占比', '带货直播场次',
+                   '带货直播观看人数', '场均销售额', '直播GPM', '视频带货销售额占比', '带货视频数量', '带货视频播放量',
+                   '单视频销售额', '视频GPM']
+        pf.columns = columns
+        file_path = pd.ExcelWriter('../file/name.xlsx')
+        # Replace empty cells
+        pf.fillna(' ', inplace=True)
+        # Write the sheet
+        pf.to_excel(file_path, index=False)
+        # Save the workbook
+        file_path.close()
+
+    def get_res(self):
         sql = f"""
-        select author_base_uid, author_base_nickname, author_base_fans_num, author_base_author_level from
-        {self.clean_buyin_authorStatData_seekAuthor};
+        SELECT
+            nickname as '抖音账户',
+            account_douyin as '抖音ID',
+            LEVEL as '等级LV',
+            fans_sum as '粉丝数',
+            city as '地址',
+            product_main_type as '主推类目',
+            b.`直播带货销售占比`,
+            b.`带货直播场次`,
+            b.`带货直播观看人数`,
+            b.`场均销售额`,
+            b.`直播GPM`,
+            b.`视频带货销售额占比`,
+            b.`带货视频数量`,
+            b.`带货视频播放量`,
+            b.`单视频销售额`,
+            b.`视频GPM`
+        FROM
+            clean_buyin_authorStatData_authorProfile c
+        RIGHT JOIN (
+            SELECT
+                uid,
+                live_data_percentage as '直播带货销售占比',
+                live_data_count as '带货直播场次',
+                live_data_watching_num as '带货直播观看人数',
+                concat_ws('-', live_data_sale_low, live_data_sale_high) as '场均销售额',
+                concat_ws('-', live_data_GPM_low, live_data_GPM_high) as '直播GPM',
+                video_data_percentage as '视频带货销售额占比',
+                video_data_count as '带货视频数量',
+                video_data_watching_num as '带货视频播放量',
+                concat_ws('-', video_data_sale_low, video_data_sale_high) as '单视频销售额',
+                concat_ws('-', video_data_GPM_low, video_data_GPM_high) as '视频GPM'
+            FROM
+                clean_buyin_authorStatData_authorOverviewV2
+        ) b ON c.uid = b.uid
+        ORDER BY
+            c.LEVEL DESC
         """
-        res = self.eb_supports.query(sql)
-        print(res)
+        res_list = self.eb_supports.query(sql)
+        return res_list
 
     def run(self):
-        self.get_excel()
+        res_list = self.get_res()
+        self.export_excel(res_list)
 
 if __name__ == '__main__':
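
A closing note on export_excel in the last file: pd.DataFrame(list(export)) builds the frame positionally and the Chinese headers are attached afterwards via pf.columns, so the SELECT column order must match the columns list exactly. A small sketch of the same export with the headers passed at construction time (the rows are made-up sample tuples; the output path is the one used above):

import pandas as pd

columns = ['抖音账户', '抖音ID', '等级LV', '粉丝数', '地址', '主推类目', '直播带货销售占比', '带货直播场次',
           '带货直播观看人数', '场均销售额', '直播GPM', '视频带货销售额占比', '带货视频数量', '带货视频播放量',
           '单视频销售额', '视频GPM']
rows = [('示例达人', 'douyin_001', 6, '12.5w', '杭州', '美妆', 30, 12, 4500, '1w-5w',
         '500-800', 70, 25, 90000, '5000-1w', '300-600')]
pf = pd.DataFrame(rows, columns=columns)          # headers attached up front
pf.fillna(' ', inplace=True)
with pd.ExcelWriter('../file/name.xlsx') as writer:
    pf.to_excel(writer, index=False)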