# ==============================================================================
# Copyright (C) 2021 Evil0ctal
#
# This file is part of the Douyin_TikTok_Download_API project.
#
# This project is licensed under the Apache License 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#         __
#        />  フ
#       |  _  _ l
#       ` ミ_x
#      /      | Feed me Stars ⭐
#     /  ヽ   ノ
#     │  | | |
#  / ̄|   | | |
#  | ( ̄ヽ__ヽ_)__)
#  \二つ
# ==============================================================================
#
# Contributor Link:
# - https://github.com/Evil0ctal
#
# ==============================================================================
import asyncio

from crawlers.douyin.web.web_crawler import DouyinWebCrawler  # 导入抖音Web爬虫/Import the Douyin Web crawler
from crawlers.tiktok.web.web_crawler import TikTokWebCrawler  # 导入TikTok Web爬虫/Import the TikTok Web crawler
from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler  # 导入TikTok App爬虫/Import the TikTok App crawler


class HybridCrawler:
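    """
    混合爬虫/Hybrid crawler.

    根据分享链接自动选择抖音或TikTok爬虫并返回解析结果/
    Dispatches a share URL to the matching Douyin or TikTok crawler
    and returns the parsed video/image data.
    """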

    def __init__(self):
        self.DouyinWebCrawler = DouyinWebCrawler()
        self.TikTokWebCrawler = TikTokWebCrawler()
        self.TikTokAPPCrawler = TikTokAPPCrawler()

    async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
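        """
        混合解析单一视频/Hybrid parsing of a single video.

        :param url: 抖音或TikTok分享链接/Douyin or TikTok share URL
        :param minimal: False返回原始数据,True返回精简数据/return the raw crawler
                        response when False, or a trimmed-down dict when True
        :return: dict containing the parsed video/image data
        """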
        # 解析抖音视频/Parse Douyin video
        if "douyin" in url:
            platform = "douyin"
            aweme_id = await self.DouyinWebCrawler.get_aweme_id(url)
            data = await self.DouyinWebCrawler.fetch_one_video(aweme_id)
            data = data.get("aweme_detail")
            # $.aweme_detail.aweme_type
            aweme_type = data.get("aweme_type")
        # 解析TikTok视频/Parse TikTok video
        elif "tiktok" in url:
            platform = "tiktok"
            aweme_id = await self.TikTokWebCrawler.get_aweme_id(url)
            # 2024-09-14: Switch to TikTokAPPCrawler instead of TikTokWebCrawler
            # data = await self.TikTokWebCrawler.fetch_one_video(aweme_id)
            # data = data.get("itemInfo").get("itemStruct")
            data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
            # $.imagePost exists if aweme_type is photo
            aweme_type = data.get("aweme_type")
        else:
            raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")

        # 检查是否需要返回最小数据/Check if minimal data is required
        if not minimal:
            return data

        # 如果是最小数据,处理数据/If it is minimal data, process the data
        url_type_code_dict = {
            # common
            0: 'video',
            # Douyin
            2: 'image',
            4: 'video',
            68: 'image',
            # TikTok
            51: 'video',
            55: 'video',
            58: 'video',
            61: 'video',
            150: 'image'
        }
        # 判断链接类型/Judge link type
        url_type = url_type_code_dict.get(aweme_type, 'video')
        # print(f"url_type: {url_type}")

        """
        以下为(视频||图片)数据处理的四个方法,如果你需要自定义数据处理请在这里修改.
        The following are four methods of (video || image) data processing.
        If you need to customize data processing, please modify it here.
        """

        """
        创建已知数据字典(索引相同),稍后使用.update()方法更新数据
        Create a known data dictionary (index the same),
        and then use the .update() method to update the data
        """
        result_data = {
            'type': url_type,
            'platform': platform,
            'aweme_id': aweme_id,
            'desc': data.get("desc"),
            'create_time': data.get("create_time"),
            'author': data.get("author"),
            'music': data.get("music"),
            'statistics': data.get("statistics"),
            'cover_data': {
                'cover': data.get("video").get("cover"),
                'origin_cover': data.get("video").get("origin_cover"),
                'dynamic_cover': data.get("video").get("dynamic_cover")
            },
            'hashtags': data.get('text_extra'),
        }
        # 创建一个空变量,稍后使用.update()方法更新数据/Create an empty variable and use the .update() method to update the data
        api_data = None
        # 判断链接类型并处理数据/Judge link type and process data
        # 抖音数据处理/Douyin data processing
        if platform == 'douyin':
            # 抖音视频数据处理/Douyin video data processing
            if url_type == 'video':
                # 将信息储存在字典中/Store information in a dictionary
                uri = data['video']['play_addr']['uri']
                wm_video_url_HQ = data['video']['play_addr']['url_list'][0]
                wm_video_url = f"https://aweme.snssdk.com/aweme/v1/playwm/?video_id={uri}&radio=1080p&line=0"
                nwm_video_url_HQ = wm_video_url_HQ.replace('playwm', 'play')
                nwm_video_url = f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=1080p&line=0"
                api_data = {
                    'video_data': {
                        'wm_video_url': wm_video_url,
                        'wm_video_url_HQ': wm_video_url_HQ,
                        'nwm_video_url': nwm_video_url,
                        'nwm_video_url_HQ': nwm_video_url_HQ
                    }
                }
            # 抖音图片数据处理/Douyin image data processing
            elif url_type == 'image':
                # 无水印图片列表/No watermark image list
                no_watermark_image_list = []
                # 有水印图片列表/With watermark image list
                watermark_image_list = []
                # 遍历图片列表/Traverse image list
                for i in data['images']:
                    no_watermark_image_list.append(i['url_list'][0])
                    watermark_image_list.append(i['download_url_list'][0])
                api_data = {
                    'image_data': {
                        'no_watermark_image_list': no_watermark_image_list,
                        'watermark_image_list': watermark_image_list
                    }
                }
        # TikTok数据处理/TikTok data processing
        elif platform == 'tiktok':
            # TikTok视频数据处理/TikTok video data processing
            if url_type == 'video':
                # 将信息储存在字典中/Store information in a dictionary
                # wm_video = data['video']['downloadAddr']
                # wm_video = data['video']['download_addr']['url_list'][0]
                wm_video = (
                    data.get('video', {})
                    .get('download_addr', {})
                    .get('url_list', [None])[0]
                )
                api_data = {
                    'video_data': {
                        'wm_video_url': wm_video,
                        'wm_video_url_HQ': wm_video,
                        # 'nwm_video_url': data['video']['playAddr'],
                        'nwm_video_url': data['video']['play_addr']['url_list'][0],
                        # 'nwm_video_url_HQ': data['video']['bitrateInfo'][0]['PlayAddr']['UrlList'][0]
                        'nwm_video_url_HQ': data['video']['bit_rate'][0]['play_addr']['url_list'][0]
                    }
                }
            # TikTok图片数据处理/TikTok image data processing
            elif url_type == 'image':
                # 无水印图片列表/No watermark image list
                no_watermark_image_list = []
                # 有水印图片列表/With watermark image list
                watermark_image_list = []
                for i in data['image_post_info']['images']:
                    no_watermark_image_list.append(i['display_image']['url_list'][0])
                    watermark_image_list.append(i['owner_watermark_image']['url_list'][0])
                api_data = {
                    'image_data': {
                        'no_watermark_image_list': no_watermark_image_list,
                        'watermark_image_list': watermark_image_list
                    }
                }
        # 更新数据/Update data
        result_data.update(api_data)
        return result_data

    async def main(self):
        # 测试混合解析单一视频接口/Test the hybrid single-video parsing endpoint
        # url = "https://v.douyin.com/L4FJNR3/"
        # url = "https://www.tiktok.com/@taylorswift/video/7359655005701311786"
        url = "https://www.tiktok.com/@flukegk83/video/7360734489271700753"
        # url = "https://www.tiktok.com/@minecraft/photo/7369296852669205791"
        minimal = True
        result = await self.hybrid_parsing_single_video(url, minimal=minimal)
        print(result)
        # 占位/Placeholder
        pass


if __name__ == '__main__':
    # 实例化混合爬虫/Instantiate the hybrid crawler
    hybrid_crawler = HybridCrawler()
    # 运行测试代码/Run the test code
    asyncio.run(hybrid_crawler.main())
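

# 以下为假设的用法示例/A hypothetical usage sketch, kept commented out so it does not
# run on import: wiring hybrid_parsing_single_video into an async web endpoint.
# The FastAPI app, route path, and variable names below are assumptions for
# illustration only and are not part of this file's logic.
#
# from fastapi import FastAPI
#
# app = FastAPI()
# crawler = HybridCrawler()
#
# @app.get("/api/hybrid/video_data")
# async def video_data(url: str, minimal: bool = False):
#     # URL detection and parsing are delegated to HybridCrawler
#     return await crawler.hybrid_parsing_single_video(url, minimal=minimal)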