mirror of https://github.com/luzhisheng/js_reverse.git (synced 2025-04-22 12:13:58 +08:00)
抖店精选联盟数据 (Doudian Selected Alliance data)
This commit is contained in: parent 30faf484dd, commit 5bff450e38
@@ -1,3 +1,15 @@
# Documentation

mitmdump -s ./spider/buyin_authorStatData_seekAuthor_mitm.py -p 9999 -q

## How to run

Start the proxy:

mitmdump -s ./main_spider.py -q -p 9999

Start the browser:

google-chrome --remote-debugging-port=9222 --user-data-dir='/home/ayf/project/js_reverse/抖音js逆向学习/抖店精选联盟数据/cn'

Start the script that drives the browser:

browser_baiyin.py
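
The controller script drives the browser with Selenium; presumably it attaches to the Chrome instance started above through the remote-debugging port, so the logged-in profile under --user-data-dir is reused while mitmdump records the traffic. A minimal sketch of that attach step, assuming Selenium 4; the target URL is a placeholder, not the spider's real endpoint:

# Sketch only: attach Selenium to the Chrome started with
# --remote-debugging-port=9222 (assumes Selenium 4; URL is a placeholder).
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.debugger_address = "127.0.0.1:9222"   # reuse the running, logged-in browser

driver = webdriver.Chrome(options=options)    # attaches instead of opening a new window
driver.get("https://example.com/")            # placeholder target page
print(driver.title)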
@@ -1,4 +1,4 @@
from dispatch.base import Base
from base import Base
import datetime
import json

@@ -0,0 +1,59 @@
from base import Base
import json


class CleanBuyinAuthorStatDataSeekAuthor(Base):
    name = 'buyin_authorStatData_seekAuthor'

    def __init__(self):
        super(CleanBuyinAuthorStatDataSeekAuthor, self).__init__()
        self.table = self.name
        self.clean_table = "clean_" + self.table

    def process_item(self, resp):
        list_res = []
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        # Flatten each raw record into one row for the clean_ table.
        for task_id, data, deduplication, update_time in resp:
            contact_info = json.loads(data).get('contact_info')
            item = {
                "task_id": task_id,
                "author_base_uid": deduplication.replace("uid=", ""),
                "author_base_nickname": contact_info.get('times_left'),
                "author_base_avatar": contact_info.get('contact_value'),
                "author_base_fans_num": contact_info.get('contact_value'),
                "author_base_gender": contact_info.get('contact_value'),
                "author_base_city": contact_info.get('contact_value'),
                "deduplication": deduplication + '&times_left=' + str(contact_info.get('times_left')),
                "spider_time": update_time
            }
            list_res.append(item)
        db_res = self.eb_supports.insert_many(self.clean_table,
                                              list_res,
                                              conflict=[
                                                  "task_id", "uid", "times_left",
                                                  "contact_value", "deduplication",
                                                  "spider_time"
                                              ]
                                              )
        if db_res >= 0:
            return True, self.table, db_res
        else:
            return False, self.table, db_res


if __name__ == '__main__':
    # Page through the last two days of raw rows in batches of 1000.
    offset = 0
    qc = CleanBuyinAuthorStatDataSeekAuthor()
    while True:
        sql = f"""
        select task_id, data, deduplication, update_time from buyin_authorStatData_seekAuthor where
        date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            break
        qc.process_item(res)
        offset += 1000
抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py (new file, 56 lines)
@@ -0,0 +1,56 @@
from base import Base
import json


class CleanBuyinContactInfo(Base):
    name = 'buyin_contact_info'

    def __init__(self):
        super(CleanBuyinContactInfo, self).__init__()
        self.table = self.name
        self.clean_table = "clean_" + self.table

    def process_item(self, resp):
        list_res = []
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        # Build one clean row per raw record; `deduplication` carries the uid
        # plus times_left so the UNIQUE KEY can deduplicate repeated inserts.
        for task_id, data, deduplication, update_time in resp:
            contact_info = json.loads(data).get('contact_info')
            item = {
                "task_id": task_id,
                "uid": deduplication.replace("uid=", ""),
                "times_left": contact_info.get('times_left'),
                "contact_value": contact_info.get('contact_value'),
                "deduplication": deduplication + '&times_left=' + str(contact_info.get('times_left')),
                "spider_time": update_time
            }
            list_res.append(item)
        db_res = self.eb_supports.insert_many(self.clean_table,
                                              list_res,
                                              conflict=[
                                                  "task_id", "uid", "times_left",
                                                  "contact_value", "deduplication",
                                                  "spider_time"
                                              ]
                                              )
        if db_res >= 0:
            return True, self.table, db_res
        else:
            return False, self.table, db_res


if __name__ == '__main__':
    # Page through the last two days of raw rows in batches of 1000.
    offset = 0
    qc = CleanBuyinContactInfo()
    while True:
        sql = f"""
        select task_id, data, deduplication, update_time from buyin_contact_info where
        date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time) LIMIT 1000 OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            break
        qc.process_item(res)
        offset += 1000
@@ -1,12 +0,0 @@
from dao.mysql_dao import StoreMysqlPool
from datetime import datetime
import settings


class Base(object):

    def __init__(self):
        self.eb_supports = StoreMysqlPool(**settings.mysql_server_baiyin)

    def log(self, s):
        print('【%s】 %s' % (datetime.now(), s), flush=True)
@@ -1,7 +1,7 @@
from multiprocessing import Queue
from selenium.webdriver.common.by import By
from selenium import webdriver
from spider.base import Base
from base import Base
import time

@@ -1,4 +1,4 @@
from spider.base import Base
from base import Base
from urllib.parse import parse_qsl, urlsplit
import json

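The *_mitm.py spiders are mitmproxy addons run through `mitmdump -s ...`; the imports above suggest they split the captured request URL with urlsplit/parse_qsl and persist the JSON response through Base. A rough sketch of that addon shape, with a placeholder URL filter (the real matching rules and MySQL storage are not shown in this hunk):

# Sketch only: the general shape of a mitmproxy addon run via `mitmdump -s`.
# The endpoint path below is a placeholder; the real script matches the
# target API and persists rows through Base/StoreMysqlPool instead of print.
from urllib.parse import parse_qsl, urlsplit
from mitmproxy import http
import json


class SeekAuthorCapture:
    def response(self, flow: http.HTTPFlow) -> None:
        if "/placeholder/seekAuthor" not in flow.request.pretty_url:
            return
        # Query parameters of the captured request (e.g. paging cursors).
        params = dict(parse_qsl(urlsplit(flow.request.pretty_url).query))
        body = json.loads(flow.response.get_text())
        print(params, list(body)[:3])


addons = [SeekAuthorCapture()]
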
@@ -25,7 +25,7 @@ DROP TABLE IF EXISTS `buyin_authorStatData_authorOverviewV2`;
CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
  `data` mediumtext COMMENT '数据结果',
  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
  `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
  `status` smallint(6) DEFAULT '0' COMMENT '状态',
  `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,

@@ -50,6 +50,63 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
  UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;

--
-- Table structure for table `buyin_contact_info`
--

DROP TABLE IF EXISTS `buyin_contact_info`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `buyin_contact_info` (
  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
  `data` mediumtext COMMENT '数据结果',
  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
  `status` smallint(6) DEFAULT '0' COMMENT '状态',
  `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;

--
-- Table structure for table `clean_buyin_contact_info`
--

DROP TABLE IF EXISTS `clean_buyin_contact_info`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `clean_buyin_contact_info` (
  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
  `uid` varchar(100) DEFAULT '' COMMENT '唯一标识符',
  `times_left` varchar(50) DEFAULT '',
  `contact_value` varchar(50) DEFAULT '' COMMENT '联系方式',
  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
  `spider_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
  UNIQUE KEY `task_id` (`deduplication`) USING BTREE,
  KEY `uid` (`uid`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;

--
-- Table structure for table `project_buyin_authorStatData`
--

DROP TABLE IF EXISTS `project_buyin_authorStatData`;
/*!40101 SET @saved_cs_client = @@character_set_client */;
/*!50503 SET character_set_client = utf8mb4 */;
CREATE TABLE `project_buyin_authorStatData` (
  `task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
  `payload_get` text COMMENT 'get请求参数',
  `payload_post` varchar(255) DEFAULT '' COMMENT 'post请求参数',
  `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
  `weight` tinyint(4) DEFAULT NULL COMMENT '权重',
  `status` tinyint(1) DEFAULT '0',
  `create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  `update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
/*!40101 SET character_set_client = @saved_cs_client */;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;

/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;

@@ -60,4 +117,4 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;

-- Dump completed on 2023-07-11 10:53:37
-- Dump completed on 2023-07-11 20:41:43
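
The clean_* tables carry a UNIQUE KEY on `deduplication`, so re-running the cleaner over the same two-day window should update rows rather than duplicate them; the DAO's insert_many(..., conflict=[...]) presumably relies on that. Purely as an illustration of the upsert semantics (StoreMysqlPool's actual interface is not part of this diff), a plain-pymysql equivalent with placeholder connection settings and sample data:

# Illustration only: an ON DUPLICATE KEY UPDATE upsert against
# clean_buyin_contact_info, which is UNIQUE on `deduplication`.
# Connection settings and the sample row are placeholders.
import pymysql

rows = [
    ("task-001", "10001", "3", "contact-placeholder", "uid=10001&times_left=3", "2023-07-11 20:00:00"),
]

sql = """
    INSERT INTO clean_buyin_contact_info
        (task_id, uid, times_left, contact_value, deduplication, spider_time)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        times_left = VALUES(times_left),
        contact_value = VALUES(contact_value),
        spider_time = VALUES(spider_time)
"""

conn = pymysql.connect(host="127.0.0.1", user="root", password="***", database="placeholder_db")
with conn.cursor() as cur:
    cur.executemany(sql, rows)
conn.commit()
conn.close()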