mirror of
https://github.com/luzhisheng/js_reverse.git
synced 2025-04-22 20:44:16 +08:00
抖店精选联盟数据
This commit is contained in:
parent
30faf484dd
commit
5bff450e38
@ -1,3 +1,15 @@
|
|||||||
# 文档
|
# 文档
|
||||||
|
|
||||||
mitmdump -s ./spider/buyin_authorStatData_seekAuthor_mitm.py -p 9999 -q
|
## 如何运行
|
||||||
|
|
||||||
|
启动代理
|
||||||
|
|
||||||
|
mitmdump -s ./main_spider.py -q -p 9999
|
||||||
|
|
||||||
|
启动浏览器
|
||||||
|
|
||||||
|
google-chrome --remote-debugging-port=9222 --user-data-dir='/home/ayf/project/js_reverse/抖音js逆向学习/抖店精选联盟数据/cn'
|
||||||
|
|
||||||
|
启动脚本控制浏览器
|
||||||
|
|
||||||
|
python browser_baiyin.py
|
@ -1,4 +1,4 @@
|
|||||||
from dispatch.base import Base
|
from base import Base
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
@ -0,0 +1,59 @@
|
|||||||
|
from base import Base
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class CleanBuyinAuthorStatDataSeekAutho(Base):
    """Clean raw ``buyin_authorStatData_seekAuthor`` rows into the ``clean_`` table.

    Each raw row carries a JSON document in ``data``; the cleaner flattens it
    into one dict per row and hands the batch to ``self.eb_supports.insert_many``.

    NOTE(review): the ``author_base_*`` fields below are all filled from
    ``contact_info`` (``times_left`` / ``contact_value``) and the ``conflict``
    column names do not match the item keys. This looks like a placeholder
    copied from the contact-info cleaner; confirm the real source fields and
    target columns before relying on the output.
    """

    name = 'buyin_authorStatData_seekAuthor'

    def __init__(self):
        super().__init__()
        self.table = self.name
        self.clean_table = "clean_" + self.table

    def process_item(self, resp):
        """Flatten a batch of raw rows and insert them into the clean table.

        Args:
            resp: iterable of ``(task_id, data, deduplication, update_time)``
                tuples, where ``data`` is a JSON string.

        Returns:
            ``''`` when ``resp`` is empty (kept for backward compatibility),
            otherwise ``(ok, table, db_res)`` where ``ok`` is ``db_res >= 0``
            and ``db_res`` is whatever ``insert_many`` returned (``0`` when no
            row in the batch was usable).
        """
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        list_res = []
        for task_id, data, deduplication, update_time in resp:
            # ``data`` is a nullable column; treat NULL/empty as "no payload".
            payload = json.loads(data) if data else {}
            contact_info = payload.get('contact_info') if isinstance(payload, dict) else None
            if not isinstance(contact_info, dict):
                # Previously this crashed with AttributeError on ``None.get``.
                self.log(f'清洗{self.table}数据-缺少contact_info task_id={task_id}')
                continue

            times_left = contact_info.get('times_left')
            contact_value = contact_info.get('contact_value')
            list_res.append({
                "task_id": task_id,
                "author_base_uid": deduplication.replace("uid=", ""),
                "author_base_nickname": times_left,
                "author_base_avatar": contact_value,
                "author_base_fans_num": contact_value,
                "author_base_gender": contact_value,
                "author_base_city": contact_value,
                # The key suffix is the query-string style ``&times_left=<n>``.
                "deduplication": deduplication + '&times_left=' + str(times_left),
                "spider_time": update_time,
            })

        if not list_res:
            # Nothing usable in this batch; do not send an empty insert.
            return True, self.table, 0

        db_res = self.eb_supports.insert_many(
            self.clean_table,
            list_res,
            conflict=[
                "task_id", "uid", "times_left",
                "contact_value", "contact_value", "deduplication",
                "spider_time",
            ],
        )
        return db_res >= 0, self.table, db_res
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Re-clean every raw row updated within the last two days, one page at a time.
    page_size = 1000
    offset = 0
    qc = CleanBuyinAuthorStatDataSeekAutho()
    while True:
        # ORDER BY makes LIMIT/OFFSET paging deterministic; without it MySQL may
        # return rows in a different order per query, skipping or repeating rows.
        sql = f"""
            select task_id, data, deduplication, update_time from buyin_authorStatData_seekAuthor where
            date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time)
            ORDER BY task_id, deduplication, update_time
            LIMIT {page_size} OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            break
        qc.process_item(res)
        offset += page_size
|
56
抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py
Normal file
56
抖音js逆向学习/抖店精选联盟数据/extractors/clean_buyin_contact_info.py
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
from base import Base
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class CleanBuyinContactInfo(Base):
    """Clean raw ``buyin_contact_info`` rows into ``clean_buyin_contact_info``.

    Each raw row carries a JSON document in ``data``; its ``contact_info``
    object is flattened into one dict per row and the batch is handed to
    ``self.eb_supports.insert_many``.
    """

    name = 'buyin_contact_info'

    def __init__(self):
        super().__init__()
        self.table = self.name
        self.clean_table = "clean_" + self.table

    def process_item(self, resp):
        """Flatten a batch of raw rows and insert them into the clean table.

        Args:
            resp: iterable of ``(task_id, data, deduplication, update_time)``
                tuples, where ``data`` is a JSON string.

        Returns:
            ``''`` when ``resp`` is empty (kept for backward compatibility),
            otherwise ``(ok, table, db_res)`` where ``ok`` is ``db_res >= 0``
            and ``db_res`` is whatever ``insert_many`` returned (``0`` when no
            row in the batch was usable).
        """
        if not resp:
            self.log(f'清洗{self.table}数据-不存在')
            return ''

        list_res = []
        for task_id, data, deduplication, update_time in resp:
            # ``data`` is a nullable column; treat NULL/empty as "no payload".
            payload = json.loads(data) if data else {}
            contact_info = payload.get('contact_info') if isinstance(payload, dict) else None
            if not isinstance(contact_info, dict):
                # Previously this crashed with AttributeError on ``None.get``.
                self.log(f'清洗{self.table}数据-缺少contact_info task_id={task_id}')
                continue

            times_left = contact_info.get('times_left')
            list_res.append({
                "task_id": task_id,
                "uid": deduplication.replace("uid=", ""),
                "times_left": times_left,
                "contact_value": contact_info.get('contact_value'),
                # The key suffix is the query-string style ``&times_left=<n>``.
                "deduplication": deduplication + '&times_left=' + str(times_left),
                "spider_time": update_time,
            })

        if not list_res:
            # Nothing usable in this batch; do not send an empty insert.
            return True, self.table, 0

        db_res = self.eb_supports.insert_many(
            self.clean_table,
            list_res,
            # One entry per item column ("contact_value" was listed twice).
            conflict=[
                "task_id", "uid", "times_left",
                "contact_value", "deduplication",
                "spider_time",
            ],
        )
        return db_res >= 0, self.table, db_res
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Re-clean every raw row updated within the last two days, one page at a time.
    page_size = 1000
    offset = 0
    qc = CleanBuyinContactInfo()
    while True:
        # ORDER BY makes LIMIT/OFFSET paging deterministic; without it MySQL may
        # return rows in a different order per query, skipping or repeating rows.
        sql = f"""
            select task_id, data, deduplication, update_time from buyin_contact_info where
            date_sub(CURDATE(),INTERVAL 2 DAY) <= DATE(update_time)
            ORDER BY task_id, deduplication, update_time
            LIMIT {page_size} OFFSET {offset};
        """
        res = qc.eb_supports.query(sql)
        if not res:
            break
        qc.process_item(res)
        offset += page_size
|
@ -1,12 +0,0 @@
|
|||||||
from dao.mysql_dao import StoreMysqlPool
|
|
||||||
from datetime import datetime
|
|
||||||
import settings
|
|
||||||
|
|
||||||
|
|
||||||
class Base:
    """Shared base class: a MySQL pool handle plus a timestamped log helper."""

    def __init__(self):
        # Build the pool from the keyword settings in settings.mysql_server_baiyin.
        pool_kwargs = settings.mysql_server_baiyin
        self.eb_supports = StoreMysqlPool(**pool_kwargs)

    def log(self, s):
        """Print ``s`` to stdout, prefixed with the current timestamp, and flush."""
        stamp = datetime.now()
        line = '【%s】 %s' % (stamp, s)
        print(line, flush=True)
|
|
@ -1,7 +1,7 @@
|
|||||||
from multiprocessing import Queue
|
from multiprocessing import Queue
|
||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from spider.base import Base
|
from base import Base
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from spider.base import Base
|
from base import Base
|
||||||
from urllib.parse import parse_qsl, urlsplit
|
from urllib.parse import parse_qsl, urlsplit
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ DROP TABLE IF EXISTS `buyin_authorStatData_authorOverviewV2`;
|
|||||||
CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
|
CREATE TABLE `buyin_authorStatData_authorOverviewV2` (
|
||||||
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
|
`task_id` varchar(100) DEFAULT NULL COMMENT '项目id',
|
||||||
`data` mediumtext COMMENT '数据结果',
|
`data` mediumtext COMMENT '数据结果',
|
||||||
`deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
|
`deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
|
||||||
`status` smallint(6) DEFAULT '0' COMMENT '状态',
|
`status` smallint(6) DEFAULT '0' COMMENT '状态',
|
||||||
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
|
`create_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP,
|
||||||
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
`update_time` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
|
||||||
@ -50,6 +50,63 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
|
|||||||
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||||
/*!40101 SET character_set_client = @saved_cs_client */;
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||||
|
|
||||||
|
--
|
||||||
|
-- Table structure for table `buyin_contact_info`
|
||||||
|
--
|
||||||
|
|
||||||
|
DROP TABLE IF EXISTS `buyin_contact_info`;
|
||||||
|
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||||
|
/*!50503 SET character_set_client = utf8mb4 */;
|
||||||
|
-- Raw contact-info responses, one JSON document per row.
CREATE TABLE `buyin_contact_info` (
  -- Project id and raw JSON payload.
  `task_id`       varchar(100) DEFAULT NULL COMMENT '项目id',
  `data`          mediumtext COMMENT '数据结果',
  -- De-duplication key and processing status.
  `deduplication` varchar(100) DEFAULT '' COMMENT '去重字段',
  `status`        smallint(6) DEFAULT '0' COMMENT '状态',
  -- Row timestamps; update_time is refreshed on every update.
  `create_time`   timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  `update_time`   timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- Non-unique lookup index on (task_id, deduplication).
  KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||||
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||||
|
|
||||||
|
--
|
||||||
|
-- Table structure for table `clean_buyin_contact_info`
|
||||||
|
--
|
||||||
|
|
||||||
|
DROP TABLE IF EXISTS `clean_buyin_contact_info`;
|
||||||
|
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||||
|
/*!50503 SET character_set_client = utf8mb4 */;
|
||||||
|
-- Flattened contact-info rows written by the contact-info cleaner.
CREATE TABLE `clean_buyin_contact_info` (
  `task_id`       varchar(100) DEFAULT NULL COMMENT '项目id',
  `uid`           varchar(100) DEFAULT '' COMMENT '唯一标识符',
  `times_left`    varchar(50) DEFAULT '',
  `contact_value` varchar(50) DEFAULT '' COMMENT '联系方式',
  -- Widened from 100 to 150: the cleaner stores the source deduplication value
  -- (itself up to 100 chars) plus a times_left suffix, which could overflow or
  -- be truncated in the column backing the unique key.
  `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
  `spider_time`   timestamp NULL DEFAULT CURRENT_TIMESTAMP COMMENT '爬虫抓取时间',
  -- The unique index is named `task_id` but covers only `deduplication`;
  -- the name is kept so existing references to the index stay valid.
  UNIQUE KEY `task_id` (`deduplication`) USING BTREE,
  KEY `uid` (`uid`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||||
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||||
|
|
||||||
|
--
|
||||||
|
-- Table structure for table `project_buyin_authorStatData`
|
||||||
|
--
|
||||||
|
|
||||||
|
DROP TABLE IF EXISTS `project_buyin_authorStatData`;
|
||||||
|
/*!40101 SET @saved_cs_client = @@character_set_client */;
|
||||||
|
/*!50503 SET character_set_client = utf8mb4 */;
|
||||||
|
-- Crawl task queue: one row per request to issue.
CREATE TABLE `project_buyin_authorStatData` (
  `task_id`       varchar(100) DEFAULT NULL COMMENT '项目id',
  -- Request parameters for the GET and POST variants.
  `payload_get`   text COMMENT 'get请求参数',
  `payload_post`  varchar(255) DEFAULT '' COMMENT 'post请求参数',
  -- De-duplication key, scheduling weight and task status.
  `deduplication` varchar(150) DEFAULT '' COMMENT '去重字段',
  `weight`        tinyint(4) DEFAULT NULL COMMENT '权重',
  `status`        tinyint(1) DEFAULT '0',
  -- Row timestamps; update_time is refreshed on every update.
  `create_time`   timestamp NULL DEFAULT CURRENT_TIMESTAMP,
  `update_time`   timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
  -- A task is unique per (task_id, deduplication).
  UNIQUE KEY `task_id` (`task_id`,`deduplication`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC;
|
||||||
|
/*!40101 SET character_set_client = @saved_cs_client */;
|
||||||
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
|
||||||
|
|
||||||
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
|
||||||
@ -60,4 +117,4 @@ CREATE TABLE `buyin_authorStatData_seekAuthor` (
|
|||||||
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
|
||||||
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
|
||||||
|
|
||||||
-- Dump completed on 2023-07-11 10:53:37
|
-- Dump completed on 2023-07-11 20:41:43
|
||||||
|
Loading…
x
Reference in New Issue
Block a user