Douyin_TikTok_Download_API/crawlers/hybrid/hybrid_crawler.py
2024-04-22 21:02:42 -07:00

174 lines
7.5 KiB
Python

import asyncio
from crawlers.douyin.web.web_crawler import DouyinWebCrawler # 导入抖音Web爬虫
from crawlers.tiktok.web.web_crawler import TikTokWebCrawler # 导入TikTok Web爬虫
from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler # 导入TikTok App爬虫
class HybridCrawler:
    """Parse a single Douyin or TikTok post through the matching platform crawler.

    The platform is chosen by a substring test on the URL. The raw crawler
    payload can be returned unchanged, or reduced to a small, platform-neutral
    dictionary (see ``hybrid_parsing_single_video``).
    """

    # Maps the upstream ``aweme_type`` code to the media kind reported in the
    # minimal payload. Codes that are not listed fall back to 'video'.
    _AWEME_TYPE_TO_MEDIA = {
        # common
        0: 'video',
        # Douyin
        2: 'image',
        4: 'video',
        68: 'image',
        # TikTok
        51: 'video',
        55: 'video',
        58: 'video',
        61: 'video',
        150: 'image',
    }

    def __init__(self):
        """Instantiate one crawler per supported backend."""
        self.DouyinWebCrawler = DouyinWebCrawler()
        self.TikTokWebCrawler = TikTokWebCrawler()
        self.TikTokAPPCrawler = TikTokAPPCrawler()

    async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
        """Fetch one post and return either the raw or the minimal payload.

        Args:
            url: Share or page URL. It must contain the substring ``"douyin"``
                or ``"tiktok"``; ``"douyin"`` is tested first.
            minimal: When false (default) the crawler payload is returned
                as-is. When true a reduced dictionary is built with the keys
                ``type``, ``platform``, ``aweme_id``, ``desc``,
                ``create_time``, ``author``, ``music``, ``statistics``,
                ``cover_data``, ``hashtags`` plus either ``video_data`` or
                ``image_data``.

        Returns:
            The raw post dictionary, or the reduced dictionary described above.

        Raises:
            ValueError: If the platform cannot be determined from ``url`` or
                the crawler returned no post data.
            KeyError, IndexError: In minimal mode, if the payload lacks the
                media fields read by the platform helpers.
        """
        if "douyin" in url:
            platform = "douyin"
            aweme_id = await self.DouyinWebCrawler.get_aweme_id(url)
            response = await self.DouyinWebCrawler.fetch_one_video(aweme_id)
            # The post itself lives under $.aweme_detail.
            data = response.get("aweme_detail") if response is not None else None
        elif "tiktok" in url:
            platform = "tiktok"
            # The id is resolved by the web crawler, the post by the app crawler.
            aweme_id = await self.TikTokWebCrawler.get_aweme_id(url)
            data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
        else:
            raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")

        # Fail with a clear message instead of an AttributeError on None below.
        if data is None:
            raise ValueError(
                f"hybrid_parsing_single_video: No video data returned for aweme_id {aweme_id!r}."
            )

        if not minimal:
            return data

        # Minimal mode: classify the post, then keep only the common fields.
        url_type = self._AWEME_TYPE_TO_MEDIA.get(data.get("aweme_type"), 'video')

        # A post may come without a "video" object; report empty covers
        # rather than crashing on None.get(...).
        video = data.get("video") or {}
        result_data = {
            'type': url_type,
            'platform': platform,
            'aweme_id': aweme_id,
            'desc': data.get("desc"),
            'create_time': data.get("create_time"),
            'author': data.get("author"),
            'music': data.get("music"),
            'statistics': data.get("statistics"),
            'cover_data': {
                'cover': video.get("cover"),
                'origin_cover': video.get("origin_cover"),
                'dynamic_cover': video.get("dynamic_cover"),
            },
            'hashtags': data.get('text_extra'),
        }

        # The media section is platform specific. To customise the
        # (video | image) processing, edit the two helpers below.
        if platform == 'douyin':
            media = self._douyin_media(data, url_type)
        else:
            media = self._tiktok_media(data, url_type)
        result_data.update(media)
        return result_data

    @staticmethod
    def _douyin_media(data: dict, url_type: str) -> dict:
        """Build the ``video_data`` or ``image_data`` entry for a Douyin post."""
        if url_type == 'video':
            play_addr = data['video']['play_addr']
            uri = play_addr['uri']
            wm_video_url_HQ = play_addr['url_list'][0]
            return {
                'video_data': {
                    # The query parameter is "ratio"; it was misspelled
                    # "radio" before, so the 1080p request was never sent.
                    'wm_video_url': f"https://aweme.snssdk.com/aweme/v1/playwm/?video_id={uri}&ratio=1080p&line=0",
                    'wm_video_url_HQ': wm_video_url_HQ,
                    'nwm_video_url': f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=1080p&line=0",
                    # Swapping the "playwm" path segment for "play" yields
                    # the URL reported as watermark-free.
                    'nwm_video_url_HQ': wm_video_url_HQ.replace('playwm', 'play'),
                }
            }

        # Image post: first URL of each list, in post order.
        images = data['images']
        return {
            'image_data': {
                'no_watermark_image_list': [image['url_list'][0] for image in images],
                'watermark_image_list': [image['download_url_list'][0] for image in images],
            }
        }

    @staticmethod
    def _tiktok_media(data: dict, url_type: str) -> dict:
        """Build the ``video_data`` or ``image_data`` entry for a TikTok post."""
        if url_type == 'video':
            video = data['video']
            wm_video = video['download_addr']['url_list'][0]
            nwm_video = video['play_addr']['url_list'][0]
            # The first bit_rate entry supplies the HQ link (presumably the
            # best rendition; confirm against the API). Fall back to the
            # plain play address when the list is missing or empty.
            bit_rate = video.get('bit_rate') or []
            nwm_video_hq = bit_rate[0]['play_addr']['url_list'][0] if bit_rate else nwm_video
            return {
                'video_data': {
                    'wm_video_url': wm_video,
                    'wm_video_url_HQ': wm_video,
                    'nwm_video_url': nwm_video,
                    'nwm_video_url_HQ': nwm_video_hq,
                }
            }

        # Image post: first URL of each variant, in post order.
        images = data['image_post_info']['images']
        return {
            'image_data': {
                'no_watermark_image_list': [image['display_image']['url_list'][0] for image in images],
                'watermark_image_list': [image['owner_watermark_image']['url_list'][0] for image in images],
            }
        }

    async def main(self):
        """Manual smoke test: parse one hard-coded URL in minimal mode and print it."""
        # Douyin alternative: url = "https://v.douyin.com/L4FJNR3/"
        url = "https://www.tiktok.com/@evil0ctal/video/7156033831819037994"
        minimal = True
        result = await self.hybrid_parsing_single_video(url, minimal=minimal)
        print(result)
if __name__ == '__main__':
    # Script entry point: build the hybrid crawler and run its async
    # self-test coroutine to completion on a fresh event loop.
    crawler = HybridCrawler()
    asyncio.run(crawler.main())