2024-04-23 09:29:21 -07:00

491 lines
20 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# ==============================================================================
# Copyright (C) 2021 Evil0ctal
#
# This file is part of the Douyin_TikTok_Download_API project.
#
# This project is licensed under the Apache License 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#         __
#        />  フ
#       |  _  _ l
#       ` ミ_x
#      /      | Feed me Stars ⭐
#     /  ヽ   ノ
#     │  | | |
#  / ̄|   | | |
#  | ( ̄ヽ__ヽ_)__)
#  \二つ
# ==============================================================================
#
# Contributor Link:
# - https://github.com/Evil0ctal
# - https://github.com/Johnserf-Seed
#
# ==============================================================================
import asyncio # 异步I/O
import time # 时间操作
import yaml # 配置文件
import os # 系统操作
# 基础爬虫客户端和TikTokAPI端点
from crawlers.base_crawler import BaseCrawler
from crawlers.tiktok.web.endpoints import TikTokAPIEndpoints
from crawlers.utils.utils import extract_valid_urls
# TikTok加密参数生成器
from crawlers.tiktok.web.utils import (
AwemeIdFetcher,
BogusManager,
SecUserIdFetcher,
TokenManager
)
# TikTok接口数据请求模型
from crawlers.tiktok.web.models import (
UserProfile,
UserPost,
UserLike,
UserMix,
UserCollect,
PostDetail,
UserPlayList,
PostComment,
PostCommentReply,
UserFans,
UserFollow
)
# Directory containing this module; config.yaml is read from the same directory.
path = os.path.dirname(os.path.abspath(__file__))
# Parse the YAML configuration once, at import time, into the module-level
# ``config`` object that the crawler class reads its request headers from.
with open(f"{path}/config.yaml", mode="r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
class TikTokWebCrawler:
    """Async client for TikTok's web API endpoints.

    Each ``fetch_*`` coroutine instantiates a request model, turns it into a
    signed endpoint URL with ``BogusManager.model_2_endpoint`` and returns the
    result of ``BaseCrawler.fetch_get_json`` for that URL.  The remaining
    coroutines are thin wrappers around ``TokenManager``, ``BogusManager``,
    ``SecUserIdFetcher`` and ``AwemeIdFetcher``.
    """

    def __init__(self):
        # Not referenced anywhere else in this class.
        self.proxy_pool = None

    async def get_tiktok_headers(self) -> dict:
        """Build the request settings for TikTok from the loaded configuration.

        Returns:
            dict: ``{"headers": {...}, "proxies": {...}}``.  ``headers`` is a
            fresh dict holding the ``User-Agent``, ``Referer`` and ``Cookie``
            values found under ``config["TokenManager"]["tiktok"]["headers"]``;
            ``proxies`` maps both ``"http://"`` and ``"https://"`` to ``None``.

        Raises:
            KeyError: If one of those configuration keys is missing.
        """
        tiktok_config = config["TokenManager"]["tiktok"]
        configured_headers = tiktok_config["headers"]
        return {
            "headers": {
                "User-Agent": configured_headers["User-Agent"],
                "Referer": configured_headers["Referer"],
                "Cookie": configured_headers["Cookie"],
            },
            "proxies": {"http://": None, "https://": None},
        }

    async def _fetch_json(self, api_url, model_cls, fields: dict, *,
                          header_overrides=None, use_config_proxies: bool = True):
        """Run the request pipeline shared by every ``fetch_*`` coroutine.

        Args:
            api_url: Base endpoint, one of the ``TikTokAPIEndpoints`` constants.
            model_cls: Request model class to instantiate.
            fields: Keyword arguments passed to ``model_cls``.
            header_overrides: Optional mapping merged over the headers returned
                by ``get_tiktok_headers`` before the crawler is created.
            use_config_proxies: When true, the ``proxies`` entry returned by
                ``get_tiktok_headers`` is handed to ``BaseCrawler``; when false,
                ``proxies=None`` is passed instead.

        Returns:
            Whatever ``crawler.fetch_get_json`` returns for the signed endpoint.
        """
        kwargs = await self.get_tiktok_headers()
        headers = kwargs["headers"]
        if header_overrides is not None:
            headers.update(header_overrides)
        proxies = kwargs["proxies"] if use_config_proxies else None

        base_crawler = BaseCrawler(proxies=proxies, crawler_headers=headers)
        async with base_crawler as crawler:
            # The model is built inside the crawler context, as the original
            # per-method implementations did.
            params = model_cls(**fields)
            # The User-Agent is an input of the signed-endpoint generation.
            endpoint = BogusManager.model_2_endpoint(
                api_url, params.dict(), headers["User-Agent"]
            )
            response = await crawler.fetch_get_json(endpoint)
        return response

    # ------------------------------ handler endpoints ------------------------------

    async def fetch_one_video(self, itemId: str):
        """Fetch the data of a single post.

        ``itemId`` is forwarded unchanged to the ``PostDetail`` request model.
        Returns the JSON result of the ``POST_DETAIL`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.POST_DETAIL, PostDetail, {"itemId": itemId}
        )

    async def fetch_user_profile(self, secUid: str, uniqueId: str):
        """Fetch a user's profile information.

        Both arguments are forwarded unchanged to the ``UserProfile`` request
        model.  Returns the JSON result of the ``USER_DETAIL`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_DETAIL,
            UserProfile,
            {"secUid": secUid, "uniqueId": uniqueId},
        )

    async def fetch_user_post(self, secUid: str, cursor: int = 0, count: int = 35, coverFormat: int = 2):
        """Fetch a user's list of posts.

        All arguments are forwarded unchanged to the ``UserPost`` request model.
        Returns the JSON result of the ``USER_POST`` endpoint.
        """
        # NOTE(review): unlike most other fetchers this one passes proxies=None
        # to BaseCrawler instead of the configured proxies mapping - confirm
        # whether that difference is intentional.
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_POST,
            UserPost,
            {"secUid": secUid, "cursor": cursor, "count": count, "coverFormat": coverFormat},
            use_config_proxies=False,
        )

    async def fetch_user_like(self, secUid: str, cursor: int = 0, count: int = 30, coverFormat: int = 2):
        """Fetch the list of posts a user has liked.

        All arguments are forwarded unchanged to the ``UserLike`` request model.
        Returns the JSON result of the ``USER_LIKE`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_LIKE,
            UserLike,
            {"secUid": secUid, "cursor": cursor, "count": count, "coverFormat": coverFormat},
        )

    async def fetch_user_collect(self, cookie: str, secUid: str, cursor: int = 0, count: int = 30,
                                 coverFormat: int = 2):
        """Fetch a user's list of collected (favourited) posts.

        ``cookie`` replaces the configured ``Cookie`` request header for this
        call and is also passed, together with the other arguments, to the
        ``UserCollect`` request model.  Returns the JSON result of the
        ``USER_COLLECT`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_COLLECT,
            UserCollect,
            {
                "cookie": cookie,
                "secUid": secUid,
                "cursor": cursor,
                "count": count,
                "coverFormat": coverFormat,
            },
            header_overrides={"Cookie": cookie},
        )

    async def fetch_user_play_list(self, secUid: str, cursor: int = 0, count: int = 30):
        """Fetch a user's playlists.

        All arguments are forwarded unchanged to the ``UserPlayList`` request
        model.  Returns the JSON result of the ``USER_PLAY_LIST`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_PLAY_LIST,
            UserPlayList,
            {"secUid": secUid, "cursor": cursor, "count": count},
        )

    async def fetch_user_mix(self, mixId: str, cursor: int = 0, count: int = 30):
        """Fetch the contents of a mix (collection).

        All arguments are forwarded unchanged to the ``UserMix`` request model.
        Returns the JSON result of the ``USER_MIX`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_MIX,
            UserMix,
            {"mixId": mixId, "cursor": cursor, "count": count},
        )

    async def fetch_post_comment(self, aweme_id: str, cursor: int = 0, count: int = 20, current_region: str = ""):
        """Fetch the comments of a post.

        All arguments are forwarded unchanged to the ``PostComment`` request
        model.  Returns the JSON result of the ``POST_COMMENT`` endpoint.
        """
        # NOTE(review): like fetch_user_post, this passes proxies=None to
        # BaseCrawler rather than the configured proxies mapping - confirm
        # whether that difference is intentional.
        return await self._fetch_json(
            TikTokAPIEndpoints.POST_COMMENT,
            PostComment,
            {
                "aweme_id": aweme_id,
                "cursor": cursor,
                "count": count,
                "current_region": current_region,
            },
            use_config_proxies=False,
        )

    async def fetch_post_comment_reply(self, item_id: str, comment_id: str, cursor: int = 0, count: int = 20,
                                       current_region: str = ""):
        """Fetch the replies to a comment on a post.

        All arguments are forwarded unchanged to the ``PostCommentReply``
        request model.  Returns the JSON result of the ``POST_COMMENT_REPLY``
        endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.POST_COMMENT_REPLY,
            PostCommentReply,
            {
                "item_id": item_id,
                "comment_id": comment_id,
                "cursor": cursor,
                "count": count,
                "current_region": current_region,
            },
        )

    async def fetch_user_fans(self, secUid: str, count: int = 30, maxCursor: int = 0, minCursor: int = 0):
        """Fetch a user's list of fans (followers).

        All arguments are forwarded unchanged to the ``UserFans`` request model.
        Returns the JSON result of the ``USER_FANS`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_FANS,
            UserFans,
            {"secUid": secUid, "count": count, "maxCursor": maxCursor, "minCursor": minCursor},
        )

    async def fetch_user_follow(self, secUid: str, count: int = 30, maxCursor: int = 0, minCursor: int = 0):
        """Fetch the list of accounts a user follows.

        All arguments are forwarded unchanged to the ``UserFollow`` request
        model.  Returns the JSON result of the ``USER_FOLLOW`` endpoint.
        """
        return await self._fetch_json(
            TikTokAPIEndpoints.USER_FOLLOW,
            UserFollow,
            {"secUid": secUid, "count": count, "maxCursor": maxCursor, "minCursor": minCursor},
        )

    # ------------------------------- utility endpoints ------------------------------

    async def fetch_real_msToken(self) -> dict:
        """Return ``{"msToken": ...}`` with the value from ``TokenManager().gen_real_msToken()``."""
        return {"msToken": TokenManager().gen_real_msToken()}

    async def gen_ttwid(self, cookie: str) -> dict:
        """Return ``{"ttwid": ...}`` with the value from ``TokenManager().gen_ttwid(cookie)``."""
        return {"ttwid": TokenManager().gen_ttwid(cookie)}

    async def gen_xbogus(self, url: str, user_agent: str) -> dict:
        """Sign ``url`` with an X-Bogus parameter for the given ``user_agent``.

        Returns:
            dict: ``url`` (the value returned by
            ``BogusManager.xb_str_2_endpoint``), ``x_bogus`` (the text that
            follows the first ``"&X-Bogus="`` in that value) and ``user_agent``.

        Raises:
            IndexError: If the signed URL does not contain ``"&X-Bogus="``.
        """
        signed_url = BogusManager.xb_str_2_endpoint(user_agent, url)
        return {
            "url": signed_url,
            "x_bogus": signed_url.split("&X-Bogus=")[1],
            "user_agent": user_agent,
        }

    async def get_sec_user_id(self, url: str):
        """Extract the secUid of one user URL via ``SecUserIdFetcher.get_secuid``."""
        return await SecUserIdFetcher.get_secuid(url)

    async def get_all_sec_user_id(self, urls: list):
        """Extract the secUid for every entry that ``extract_valid_urls`` keeps from ``urls``."""
        valid_urls = extract_valid_urls(urls)
        return await SecUserIdFetcher.get_all_secuid(valid_urls)

    async def get_aweme_id(self, url: str):
        """Extract the post ID of one post URL via ``AwemeIdFetcher.get_aweme_id``."""
        return await AwemeIdFetcher.get_aweme_id(url)

    async def get_all_aweme_id(self, urls: list):
        """Extract the post ID for every entry that ``extract_valid_urls`` keeps from ``urls``."""
        valid_urls = extract_valid_urls(urls)
        return await AwemeIdFetcher.get_all_aweme_id(valid_urls)

    async def get_unique_id(self, url: str):
        """Extract the unique_id of one user URL via ``SecUserIdFetcher.get_uniqueid``."""
        return await SecUserIdFetcher.get_uniqueid(url)

    async def get_all_unique_id(self, urls: list):
        """Extract the unique_id for every entry that ``extract_valid_urls`` keeps from ``urls``."""
        valid_urls = extract_valid_urls(urls)
        return await SecUserIdFetcher.get_all_uniqueid(valid_urls)

    # ---------------------------------- main entry ----------------------------------

    async def main(self):
        """Manual smoke-test entry point.

        Does nothing as written; uncomment one of the examples below to
        exercise the corresponding coroutine.
        """
        # Fetch the data of a single post
        # item_id = "7339393672959757570"
        # response = await self.fetch_one_video(item_id)
        # print(response)

        # Fetch a user's profile information
        # secUid = "MS4wLjABAAAAfDPs6wbpBcMMb85xkvDGdyyyVAUS2YoVCT9P6WQ1bpuwEuPhL9eFtTmGvxw1lT2C"
        # uniqueId = "c4shjaz"
        # response = await self.fetch_user_profile(secUid, uniqueId)
        # print(response)

        # Fetch a user's list of posts
        # secUid = "MS4wLjABAAAAfDPs6wbpBcMMb85xkvDGdyyyVAUS2YoVCT9P6WQ1bpuwEuPhL9eFtTmGvxw1lT2C"
        # cursor = 0
        # count = 35
        # coverFormat = 2
        # response = await self.fetch_user_post(secUid, cursor, count, coverFormat)
        # print(response)

        # Fetch a user's list of liked posts
        # secUid = "MS4wLjABAAAAq1iRXNduFZpY301UkVpJ1eQT60_NiWS9QQSeNqmNQEDJp0pOF8cpleNEdiJx5_IU"
        # cursor = 0
        # count = 30
        # coverFormat = 2
        # response = await self.fetch_user_like(secUid, cursor, count, coverFormat)
        # print(response)

        # Fetch a user's list of collected posts
        # cookie = "put your cookie here"
        # secUid = "MS4wLjABAAAAq1iRXNduFZpY301UkVpJ1eQT60_NiWS9QQSeNqmNQEDJp0pOF8cpleNEdiJx5_IU"
        # cursor = 0
        # count = 30
        # coverFormat = 2
        # response = await self.fetch_user_collect(cookie, secUid, cursor, count, coverFormat)
        # print(response)

        # Fetch a user's playlists
        # secUid = "MS4wLjABAAAAtGboV-mJHSIQqh-SsG30QKweGhSqkr4xJLq1qqgAWDzu3vDO5LUhUcCP4UEY5LwC"
        # cursor = 0
        # count = 30
        # response = await self.fetch_user_play_list(secUid, cursor, count)
        # print(response)

        # Fetch the contents of a mix (collection)
        # mixId = "7101538765474106158"
        # cursor = 0
        # count = 30
        # response = await self.fetch_user_mix(mixId, cursor, count)
        # print(response)

        # Fetch the comments of a post
        # aweme_id = "7304809083817774382"
        # cursor = 0
        # count = 20
        # current_region = ""
        # response = await self.fetch_post_comment(aweme_id, cursor, count, current_region)
        # print(response)

        # Fetch the replies to a comment on a post
        # item_id = "7304809083817774382"
        # comment_id = "7304877760886588191"
        # cursor = 0
        # count = 20
        # current_region = ""
        # response = await self.fetch_post_comment_reply(item_id, comment_id, cursor, count, current_region)
        # print(response)

        # Fetch the list of accounts a user follows
        # secUid = "MS4wLjABAAAAtGboV-mJHSIQqh-SsG30QKweGhSqkr4xJLq1qqgAWDzu3vDO5LUhUcCP4UEY5LwC"
        # count = 30
        # maxCursor = 0
        # minCursor = 0
        # response = await self.fetch_user_follow(secUid, count, maxCursor, minCursor)
        # print(response)

        # Fetch a user's list of fans
        # secUid = "MS4wLjABAAAAtGboV-mJHSIQqh-SsG30QKweGhSqkr4xJLq1qqgAWDzu3vDO5LUhUcCP4UEY5LwC"
        # count = 30
        # maxCursor = 0
        # minCursor = 0
        # response = await self.fetch_user_fans(secUid, count, maxCursor, minCursor)
        # print(response)

        # ----------------------------- utility endpoints -----------------------------

        # Generate a real msToken
        # response = await self.fetch_real_msToken()
        # print(response)

        # Generate a ttwid
        # cookie = "put your cookie here"
        # response = await self.gen_ttwid(cookie)
        # print(response)

        # Generate an X-Bogus signature
        # url = "https://www.tiktok.com/api/item/detail/?WebIdLastTime=1712665533&aid=1988&app_language=en&app_name=tiktok_web&browser_language=en-US&browser_name=Mozilla&browser_online=true&browser_platform=Win32&browser_version=5.0%20%28Windows%29&channel=tiktok_web&cookie_enabled=true&device_id=7349090360347690538&device_platform=web_pc&focus_state=true&from_page=user&history_len=4&is_fullscreen=false&is_page_visible=true&language=en&os=windows&priority_region=US&referer=&region=US&root_referer=https%3A%2F%2Fwww.tiktok.com%2F&screen_height=1080&screen_width=1920&webcast_language=en&tz_name=America%2FTijuana&msToken=AYFCEapCLbMrS8uTLBoYdUMeeVLbCdFQ_QF_-OcjzJw1CPr4JQhWUtagy0k4a9IITAqi5Qxr2Vdh9mgCbyGxTnvWLa4ZVY6IiSf6lcST-tr0IXfl-r_ZTpzvWDoQfqOVsWCTlSNkhAwB-tap5g==&itemId=7339393672959757570"
        # user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
        # response = await self.gen_xbogus(url, user_agent)
        # print(response)

        # Extract the secUid of a single user
        # url = "https://www.tiktok.com/@tiktok"
        # response = await self.get_sec_user_id(url)
        # print(response)

        # Extract the secUid of several users
        # urls = ["https://www.tiktok.com/@tiktok", "https://www.tiktok.com/@taylorswift"]
        # response = await self.get_all_sec_user_id(urls)
        # print(response)

        # Extract the ID of a single post
        # url = "https://www.tiktok.com/@taylorswift/video/7162153915952352558"
        # response = await self.get_aweme_id(url)
        # print(response)

        # Extract the IDs of several posts
        # urls = ["https://www.tiktok.com/@taylorswift/video/7162153915952352558", "https://www.tiktok.com/@taylorswift/video/7137077445680745771"]
        # response = await self.get_all_aweme_id(urls)
        # print(response)

        # Extract the unique_id of a single user
        # url = "https://www.tiktok.com/@tiktok"
        # response = await self.get_unique_id(url)
        # print(response)

        # Extract the unique_id of several users
        # urls = ["https://www.tiktok.com/@tiktok", "https://www.tiktok.com/@taylorswift"]
        # response = await self.get_all_unique_id(urls)
        # print(response)
if __name__ == "__main__":
    # Keep the instance under its own name: assigning it to ``TikTokWebCrawler``
    # would rebind the class name to an instance for the rest of the module.
    crawler = TikTokWebCrawler()
    # perf_counter() is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments.
    start = time.perf_counter()
    asyncio.run(crawler.main())
    end = time.perf_counter()
    # The printed label means "elapsed time" (in seconds).
    print(f"耗时:{end - start}")