抖店精选联盟数据

This commit is contained in:
aiyingfeng 2023-07-11 21:00:24 +08:00
parent 30faf484dd
commit 5bff450e38
9 changed files with 190 additions and 18 deletions

View File

@ -1,3 +1,15 @@
# 文档
mitmdump -s ./spider/buyin_authorStatData_seekAuthor_mitm.py -p 9999 -q
## 如何运行
1. 启动代理:
`mitmdump -s ./main_spider.py -q -p 9999`
2. 启动浏览器:
`google-chrome --remote-debugging-port=9222 --user-data-dir='/home/ayf/project/js_reverse/抖音js逆向学习/抖店精选联盟数据/cn'`
3. 启动脚本控制浏览器:
`python browser_baiyin.py`

View File

@ -1,4 +1,4 @@
from dispatch.base import Base
from base import Base
import datetime
import json

View File

@ -0,0 +1,59 @@
from base import Base
import json
class CleanBuyinAuthorStatDataSeekAutho(Base):
    """Clean raw buyin_authorStatData_seekAuthor rows into the clean_* table.

    Reads raw JSON payloads scraped by the spider, extracts per-author
    fields, and bulk-inserts them into ``clean_buyin_authorStatData_seekAuthor``.
    """

    name = 'buyin_authorStatData_seekAuthor'

    def __init__(self):
        super(CleanBuyinAuthorStatDataSeekAutho, self).__init__()
        self.table = self.name                    # raw source table
        self.clean_table = "clean_" + self.table  # cleaned destination table

    def process_item(self, resp):
        """Transform raw rows and insert them into the clean table.

        resp: iterable of (task_id, data, deduplication, update_time)
              tuples, where ``data`` is a JSON string.
        Returns '' when resp is empty, otherwise (ok, table, db_result).
        """
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''
        list_res = []
        for task_id, data, deduplication, update_time in resp:
            contact_info = json.loads(data).get('contact_info')
            # NOTE(review): every author_base_* field below is read from
            # 'times_left'/'contact_value' — this looks copy-pasted from the
            # contact-info cleaner. Confirm the real JSON keys (nickname,
            # avatar, fans_num, gender, city) against an actual payload.
            item = {
                "task_id": task_id,
                # raw deduplication is "uid=<id>"; strip the prefix
                "author_base_uid": deduplication.replace("uid=", ""),
                "author_base_nickname": contact_info.get('times_left'),
                "author_base_avatar": contact_info.get('contact_value'),
                "author_base_fans_num": contact_info.get('contact_value'),
                "author_base_gender": contact_info.get('contact_value'),
                "author_base_city": contact_info.get('contact_value'),
                "deduplication": deduplication + '&times_left=' + str(contact_info.get('times_left')),
                "spider_time": update_time,
            }
            list_res.append(item)
        # Fix: the conflict column list previously named columns that are not
        # inserted by this cleaner ("uid", "times_left", and a duplicated
        # "contact_value") — clearly copied from CleanBuyinContactInfo.
        # It must match the keys of the rows actually inserted here.
        db_res = self.eb_supports.insert_many(
            self.clean_table,
            list_res,
            conflict=[
                "task_id", "author_base_uid", "author_base_nickname",
                "author_base_avatar", "author_base_fans_num",
                "author_base_gender", "author_base_city",
                "deduplication", "spider_time",
            ],
        )
        if db_res >= 0:
            return True, self.table, db_res
        return False, self.table, db_res
if __name__ == '__main__':
    # Batch-clean the last two days of raw seekAuthor rows, 1000 at a time.
    offset = 0
    qc = CleanBuyinAuthorStatDataSeekAutho()
    while True:
        # Fix: LIMIT/OFFSET paging without ORDER BY is non-deterministic in
        # MySQL and can skip or repeat rows between pages; order on the
        # table's UNIQUE KEY (task_id, deduplication) for stable paging.
        sql = f"""
        select task_id, data, deduplication, update_time from buyin_authorStatData_seekAuthor where
        date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time)
        ORDER BY task_id, deduplication LIMIT 1000 OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            break
        qc.process_item(res)
        offset += 1000

View File

@ -0,0 +1,56 @@
from base import Base
import json
class CleanBuyinContactInfo(Base):
    """Clean raw buyin_contact_info rows into clean_buyin_contact_info.

    Parses the JSON ``data`` payload of each raw row, flattens its
    ``contact_info`` object, and bulk-inserts the result.
    """

    name = 'buyin_contact_info'

    def __init__(self):
        super(CleanBuyinContactInfo, self).__init__()
        self.table = self.name                    # raw source table
        self.clean_table = "clean_" + self.table  # cleaned destination table

    def process_item(self, resp):
        """Flatten raw rows and insert them into the clean table.

        resp: iterable of (task_id, data, deduplication, update_time)
              tuples, where ``data`` is a JSON string holding 'contact_info'.
        Returns '' when resp is empty, otherwise (ok, table, db_result).
        """
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''
        list_res = []
        for task_id, data, deduplication, update_time in resp:
            contact_info = json.loads(data).get('contact_info')
            item = {
                "task_id": task_id,
                # raw deduplication is "uid=<id>"; strip the prefix
                "uid": deduplication.replace("uid=", ""),
                "times_left": contact_info.get('times_left'),
                "contact_value": contact_info.get('contact_value'),
                # extend the dedup key so a re-fetch with a changed quota
                # counts as a new row
                "deduplication": deduplication + '&times_left=' + str(contact_info.get('times_left')),
                "spider_time": update_time,
            }
            list_res.append(item)
        # Fix: "contact_value" was listed twice in the conflict column list;
        # the list now matches the inserted keys exactly.
        db_res = self.eb_supports.insert_many(
            self.clean_table,
            list_res,
            conflict=[
                "task_id", "uid", "times_left",
                "contact_value", "deduplication", "spider_time",
            ],
        )
        if db_res >= 0:
            return True, self.table, db_res
        return False, self.table, db_res
if __name__ == '__main__':
    # Batch-clean the last two days of raw contact-info rows, 1000 at a time.
    offset = 0
    qc = CleanBuyinContactInfo()
    while True:
        # Fix: LIMIT/OFFSET paging without ORDER BY is non-deterministic in
        # MySQL and can skip or repeat rows between pages. The raw table only
        # has a non-unique KEY (task_id, deduplication), but ordering on it
        # still makes paging stable in practice — TODO confirm uniqueness.
        sql = f"""
        select task_id, data, deduplication, update_time from buyin_contact_info where
        date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time)
        ORDER BY task_id, deduplication LIMIT 1000 OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            break
        qc.process_item(res)
        offset += 1000

View File

@ -1,12 +0,0 @@
from dao.mysql_dao import StoreMysqlPool
from datetime import datetime
import settings
class Base(object):
    """Shared base for spiders/cleaners: MySQL pool plus timestamped logging."""

    def __init__(self):
        # Pooled MySQL connection built from the project settings module.
        self.eb_supports = StoreMysqlPool(**settings.mysql_server_baiyin)

    def log(self, s):
        # Prepend the current timestamp; flush so output shows up immediately
        # when stdout is redirected (e.g. under mitmdump or cron).
        print(f"{datetime.now()}{s}", flush=True)

View File

@ -1,7 +1,7 @@
from multiprocessing import Queue
from selenium.webdriver.common.by import By
from selenium import webdriver
from spider.base import Base
from base import Base
import time

View File

@ -1,4 +1,4 @@
from spider.base import Base
from base import Base
from urllib.parse import parse_qsl, urlsplit
import json

View File

@ -25,7 +25,7 @@ DROP TABLE IF EXISTS `buyin_authorStatData_authorOverviewV2`;
CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`data` mediumtext COMMENT '数据结果',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
`deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
`status` smallint(6) DEFAULT '0' COMMENT '状态',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
@ -50,6 +50,63 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `buyin_contact_info`
--
-- Raw spider output: one JSON payload (`data`) per (task_id, deduplication);
-- consumed by CleanBuyinContactInfo, which flattens it into
-- `clean_buyin_contact_info`.
DROP TABLE IF EXISTS `buyin_contact_info`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `buyin_contact_info` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`data` mediumtext COMMENT '数据结果',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
`status` smallint(6) DEFAULT '0' COMMENT '状态',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
-- NOTE(review): key is non-unique (plain KEY), unlike the sibling
-- seekAuthor table's UNIQUE KEY — confirm duplicates are acceptable here.
KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `clean_buyin_contact_info`
--
-- Destination of CleanBuyinContactInfo: one flattened contact record per
-- deduplication key.
DROP TABLE IF EXISTS `clean_buyin_contact_info`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `clean_buyin_contact_info` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
`times_left` varchar(50) DEFAULT '',
`contact_value` varchar(50) DEFAULT '' COMMENT '联系方式',
-- Fix: widened 100 -> 150. The cleaner stores the raw deduplication value
-- (itself a varchar(100) in buyin_contact_info) plus an appended
-- '&times_left=...' suffix, which could exceed 100 chars, truncate, and
-- corrupt the unique key below. Matches the 150 used elsewhere in this dump.
`deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
`spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
-- NOTE(review): the unique key is named `task_id` but covers only
-- `deduplication`; consider renaming the index for clarity.
UNIQUE KEY `task_id` (`deduplication`) USING BTREE,
KEY `uid` (`uid`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `project_buyin_authorStatData`
--
-- NOTE(review): the payload_get/payload_post columns plus weight/status
-- suggest this is the pending-request (task) table driving the
-- authorStatData spiders — confirm against the dispatcher code.
DROP TABLE IF EXISTS `project_buyin_authorStatData`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `project_buyin_authorStatData` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`payload_get` text COMMENT 'get请求参数',
`payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数',
`deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
`weight` tinyint(4) DEFAULT NULL COMMENT '权重',
`status` tinyint(1) DEFAULT '0',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
@ -60,4 +117,4 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2023-07-11 10:53:37
-- Dump completed on 2023-07-11 20:41:43