抖店精选联盟数据

This commit is contained in:
aiyingfeng 2023-07-11 21:00:24 +08:00
parent 30faf484dd
commit 5bff450e38
9 changed files with 190 additions and 18 deletions

View File

@ -1,3 +1,15 @@
# 文档
mitmdump -s ./spider/buyin_authorStatData_seekAuthor_mitm.py -p 9999 -q
## 如何运行
启动代理
mitmdump -s ./main_spider.py -q -p 9999
启动浏览器
google-chrome --remote-debugging-port=9222 --user-data-dir='/home/ayf/project/js_reverse/抖音js逆向学习/抖店精选联盟数据/cn'
启动脚本控制浏览器
browser_baiyin.py

View File

@ -1,4 +1,4 @@
from dispatch.base import Base from base import Base
import datetime import datetime
import json import json

View File

@ -0,0 +1,59 @@
from base import Base
import json
class CleanBuyinAuthorStatDataSeekAutho(Base):
    """Clean raw ``buyin_authorStatData_seekAuthor`` rows into the ``clean_`` table.

    ``self.table`` is the raw table name and ``self.clean_table`` is the same
    name prefixed with ``clean_``; cleaned rows are written through
    ``self.eb_supports.insert_many``.
    """

    name = 'buyin_authorStatData_seekAuthor'

    def __init__(self):
        super().__init__()
        self.table = self.name
        self.clean_table = "clean_" + self.table

    def process_item(self, resp):
        """Convert raw rows to cleaned rows and bulk-insert them.

        Args:
            resp: iterable of ``(task_id, data, deduplication, update_time)``
                tuples, where ``data`` is a JSON string.

        Returns:
            ``''`` when there is nothing to insert (empty ``resp`` or no row
            carried a ``contact_info`` object); otherwise a tuple
            ``(ok, table, db_res)`` where ``ok`` is ``db_res >= 0`` and
            ``db_res`` is whatever ``insert_many`` returned.
        """
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        list_res = []
        for task_id, data, deduplication, update_time in resp:
            contact_info = json.loads(data).get('contact_info')
            if not contact_info:
                # Without this guard a payload lacking ``contact_info`` raised
                # AttributeError on ``None.get`` and aborted the whole batch.
                self.log(f'清洗{self.table}数据-缺少contact_info: {deduplication}')
                continue
            times_left = contact_info.get('times_left')
            contact_value = contact_info.get('contact_value')
            # NOTE(review): every author_base_* field below is filled from
            # ``contact_info`` (times_left / contact_value). That looks like a
            # copy of the contact-info cleaner; the real source fields for
            # nickname/avatar/fans/gender/city are not visible here, so the
            # mapping is kept unchanged -- TODO confirm against the payload.
            list_res.append({
                "task_id": task_id,
                "author_base_uid": deduplication.replace("uid=", ""),
                "author_base_nickname": times_left,
                "author_base_avatar": contact_value,
                "author_base_fans_num": contact_value,
                "author_base_gender": contact_value,
                "author_base_city": contact_value,
                "deduplication": deduplication + '&times_left=' + str(times_left),
                "spider_time": update_time,
            })

        if not list_res:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        # The conflict columns now match the columns actually inserted. The
        # previous list named uid/times_left/contact_value, which are not keys
        # of these rows, and listed contact_value twice.
        # Presumably ``conflict`` is the set of columns refreshed on a
        # duplicate key -- verify against StoreMysqlPool.insert_many.
        db_res = self.eb_supports.insert_many(
            self.clean_table,
            list_res,
            conflict=[
                "task_id", "author_base_uid", "author_base_nickname",
                "author_base_avatar", "author_base_fans_num",
                "author_base_gender", "author_base_city",
                "deduplication", "spider_time",
            ],
        )
        return db_res >= 0, self.table, db_res
if __name__ == '__main__':
    # Page through the raw rows updated within the last two days and clean
    # each page. An explicit ORDER BY keeps LIMIT/OFFSET pages stable: without
    # it the server may return rows in a different order per query, so pages
    # could overlap or skip rows.
    page_size = 1000
    offset = 0
    qc = CleanBuyinAuthorStatDataSeekAutho()
    while True:
        sql = f"""
        select task_id, data, deduplication, update_time from buyin_authorStatData_seekAuthor where
        date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time)
        ORDER BY task_id, deduplication, update_time LIMIT {page_size} OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            # An empty page means every matching row has been processed.
            break
        qc.process_item(res)
        offset += page_size

View File

@ -0,0 +1,56 @@
from base import Base
import json
class CleanBuyinContactInfo(Base):
    """Clean raw ``buyin_contact_info`` rows into ``clean_buyin_contact_info``.

    ``self.table`` is the raw table name and ``self.clean_table`` is the same
    name prefixed with ``clean_``; cleaned rows are written through
    ``self.eb_supports.insert_many``.
    """

    name = 'buyin_contact_info'

    def __init__(self):
        super().__init__()
        self.table = self.name
        self.clean_table = "clean_" + self.table

    def process_item(self, resp):
        """Extract contact fields from raw rows and bulk-insert them.

        Args:
            resp: iterable of ``(task_id, data, deduplication, update_time)``
                tuples, where ``data`` is a JSON string expected to hold a
                ``contact_info`` object.

        Returns:
            ``''`` when there is nothing to insert (empty ``resp`` or no row
            carried a ``contact_info`` object); otherwise a tuple
            ``(ok, table, db_res)`` where ``ok`` is ``db_res >= 0`` and
            ``db_res`` is whatever ``insert_many`` returned.
        """
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        list_res = []
        for task_id, data, deduplication, update_time in resp:
            contact_info = json.loads(data).get('contact_info')
            if not contact_info:
                # Without this guard a payload lacking ``contact_info`` raised
                # AttributeError on ``None.get`` and aborted the whole batch.
                self.log(f'清洗{self.table}数据-缺少contact_info: {deduplication}')
                continue
            times_left = contact_info.get('times_left')
            list_res.append({
                "task_id": task_id,
                "uid": deduplication.replace("uid=", ""),
                "times_left": times_left,
                "contact_value": contact_info.get('contact_value'),
                # The clean-table key also carries times_left, so a changed
                # times_left value yields a distinct de-duplication string.
                "deduplication": deduplication + '&times_left=' + str(times_left),
                "spider_time": update_time,
            })

        if not list_res:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        # Each column is listed once; the previous list repeated
        # "contact_value".
        # Presumably ``conflict`` is the set of columns refreshed on a
        # duplicate key -- verify against StoreMysqlPool.insert_many.
        db_res = self.eb_supports.insert_many(
            self.clean_table,
            list_res,
            conflict=[
                "task_id", "uid", "times_left",
                "contact_value", "deduplication", "spider_time",
            ],
        )
        return db_res >= 0, self.table, db_res
if __name__ == '__main__':
    # Page through the raw rows updated within the last two days and clean
    # each page. An explicit ORDER BY keeps LIMIT/OFFSET pages stable: without
    # it the server may return rows in a different order per query, so pages
    # could overlap or skip rows.
    page_size = 1000
    offset = 0
    qc = CleanBuyinContactInfo()
    while True:
        sql = f"""
        select task_id, data, deduplication, update_time from buyin_contact_info where
        date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time)
        ORDER BY task_id, deduplication, update_time LIMIT {page_size} OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            # An empty page means every matching row has been processed.
            break
        qc.process_item(res)
        offset += page_size

View File

@ -1,12 +0,0 @@
from dao.mysql_dao import StoreMysqlPool
from datetime import datetime
import settings
class Base(object):
    """Common base that owns the MySQL pool handle and a print-based logger."""

    def __init__(self):
        # Pool is configured from the ``mysql_server_baiyin`` settings mapping.
        pool_kwargs = settings.mysql_server_baiyin
        self.eb_supports = StoreMysqlPool(**pool_kwargs)

    def log(self, s):
        """Print ``s`` prefixed with the current timestamp and flush at once."""
        stamp = datetime.now()
        line = str(stamp) + str(s)
        print(line, flush=True)

View File

@ -1,7 +1,7 @@
from multiprocessing import Queue from multiprocessing import Queue
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium import webdriver from selenium import webdriver
from spider.base import Base from base import Base
import time import time

View File

@ -1,4 +1,4 @@
from spider.base import Base from base import Base
from urllib.parse import parse_qsl, urlsplit from urllib.parse import parse_qsl, urlsplit
import json import json

View File

@ -25,7 +25,7 @@ DROP TABLE IF EXISTS `buyin_authorStatData_authorOverviewV2`;
CREATE TABLE `buyin_authorStatData_authorOverviewV2` ( CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id', `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`data` mediumtext COMMENT '数据结果', `data` mediumtext COMMENT '数据结果',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段', `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
`status` smallint(6) DEFAULT '0' COMMENT '状态', `status` smallint(6) DEFAULT '0' COMMENT '状态',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP, `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
@ -50,6 +50,63 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC; ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */; /*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `buyin_contact_info`
--
DROP TABLE IF EXISTS `buyin_contact_info`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
-- Raw capture table: one row stores the payload (`data`) for a project id
-- (`task_id`) together with a de-duplication string, a status flag and
-- create/update timestamps (update_time refreshes on every row update).
-- NOTE(review): the (task_id, deduplication) index is a plain KEY here, while
-- `project_buyin_authorStatData` declares the same pair as UNIQUE -- confirm
-- whether duplicate rows are intended in this table.
CREATE TABLE `buyin_contact_info` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`data` mediumtext COMMENT '数据结果',
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
`status` smallint(6) DEFAULT '0' COMMENT '状态',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `clean_buyin_contact_info`
--
DROP TABLE IF EXISTS `clean_buyin_contact_info`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
-- Cleaned contact-info rows, unique per `deduplication` string.
-- `deduplication` is widened from 100 to 150 characters: the cleaning script
-- stores the raw table's de-duplication value (itself up to 100 characters)
-- with a '&times_left=<value>' suffix appended, which could exceed the old
-- width and truncate or reject the unique key.
-- NOTE(review): the unique index is named `task_id` but covers only
-- `deduplication`; the name is kept so existing references stay valid.
CREATE TABLE `clean_buyin_contact_info` (
  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
  `uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
  `times_left` varchar(50) DEFAULT '',
  `contact_value` varchar(50) DEFAULT '' COMMENT '联系方式',
  `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
  `spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
  UNIQUE KEY `task_id` (`deduplication`) USING BTREE,
  KEY `uid` (`uid`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
--
-- Table structure for table `project_buyin_authorStatData`
--
DROP TABLE IF EXISTS `project_buyin_authorStatData`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
-- Task table: each row pairs a project id (`task_id`) with the GET and POST
-- request parameters to use, a de-duplication string, a weight and a status
-- flag, plus create/update timestamps.
-- Rows are unique per (task_id, deduplication).
CREATE TABLE `project_buyin_authorStatData` (
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
`payload_get` text COMMENT 'get请求参数',
`payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数',
`deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
`weight` tinyint(4) DEFAULT NULL COMMENT '权重',
`status` tinyint(1) DEFAULT '0',
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
@ -60,4 +117,4 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
-- Dump completed on 2023-07-11 10:53:37 -- Dump completed on 2023-07-11 20:41:43