import asyncio

from crawlers.douyin.web.web_crawler import DouyinWebCrawler  # Douyin web crawler
from crawlers.tiktok.web.web_crawler import TikTokWebCrawler  # TikTok web crawler
from crawlers.tiktok.app.app_crawler import TikTokAPPCrawler  # TikTok app crawler


class HybridCrawler:
    """Parse a single Douyin or TikTok post through the matching crawler.

    The platform is chosen by looking for the substring ``"douyin"`` or
    ``"tiktok"`` in the given URL. The raw crawler payload can be returned
    as-is, or reduced to a small, platform-independent dictionary.
    """

    # Maps the numeric ``aweme_type`` reported by the crawlers to a media kind.
    # Codes that are not listed are treated as 'video' by the lookup below.
    _URL_TYPE_BY_AWEME_TYPE = {
        # common
        0: 'video',
        # Douyin
        2: 'image',
        4: 'video',
        68: 'image',
        # TikTok
        51: 'video',
        55: 'video',
        58: 'video',
        61: 'video',
        150: 'image',
    }

    def __init__(self):
        self.DouyinWebCrawler = DouyinWebCrawler()
        self.TikTokWebCrawler = TikTokWebCrawler()
        self.TikTokAPPCrawler = TikTokAPPCrawler()

    async def hybrid_parsing_single_video(self, url: str, minimal: bool = False):
        """Fetch one post and return either the raw or the minimal data.

        Args:
            url: Link to the post. It must contain ``"douyin"`` or
                ``"tiktok"``; ``"douyin"`` is checked first.
            minimal: When false (the default) the crawler payload is returned
                unchanged. When true, a reduced dictionary is built from it.

        Returns:
            The crawler payload, or (``minimal=True``) a dict with the keys
            ``type``, ``platform``, ``aweme_id``, ``desc``, ``create_time``,
            ``author``, ``music``, ``statistics``, ``cover_data`` and
            ``hashtags``, plus ``video_data`` or ``image_data`` depending on
            the detected media kind.

        Raises:
            ValueError: If the URL contains neither ``"douyin"`` nor
                ``"tiktok"``.
        """
        if "douyin" in url:
            # Douyin: the post details live under $.aweme_detail.
            platform = "douyin"
            aweme_id = await self.DouyinWebCrawler.get_aweme_id(url)
            data = await self.DouyinWebCrawler.fetch_one_video(aweme_id)
            data = data.get("aweme_detail")
            aweme_type = data.get("aweme_type")
        elif "tiktok" in url:
            # TikTok: the id is resolved by the web crawler, the post itself
            # is fetched by the app crawler; the type is at $.aweme_type.
            platform = "tiktok"
            aweme_id = await self.TikTokWebCrawler.get_aweme_id(url)
            data = await self.TikTokAPPCrawler.fetch_one_video(aweme_id)
            aweme_type = data.get("aweme_type")
        else:
            raise ValueError("hybrid_parsing_single_video: Cannot judge the video source from the URL.")

        # Full payload requested: hand the crawler data back untouched.
        if not minimal:
            return data

        # Unknown type codes default to 'video'.
        url_type = self._URL_TYPE_BY_AWEME_TYPE.get(aweme_type, 'video')

        # Resolve the video object once; a missing or null "video" entry
        # yields None cover fields instead of an AttributeError.
        video = data.get("video") or {}

        # Fields shared by both platforms. The media-specific part is merged
        # in below. If you need custom data processing, adjust the two
        # _*_media_data helpers.
        result_data = {
            'type': url_type,
            'platform': platform,
            'aweme_id': aweme_id,
            'desc': data.get("desc"),
            'create_time': data.get("create_time"),
            'author': data.get("author"),
            'music': data.get("music"),
            'statistics': data.get("statistics"),
            'cover_data': {
                'cover': video.get("cover"),
                'origin_cover': video.get("origin_cover"),
                'dynamic_cover': video.get("dynamic_cover"),
            },
            'hashtags': data.get('text_extra'),
        }

        if platform == 'douyin':
            api_data = self._douyin_media_data(data, url_type)
        else:
            api_data = self._tiktok_media_data(data, url_type)

        result_data.update(api_data)
        return result_data

    @staticmethod
    def _douyin_media_data(data, url_type):
        """Build the ``video_data`` / ``image_data`` entry for a Douyin post."""
        if url_type == 'video':
            play_addr = data['video']['play_addr']
            uri = play_addr['uri']
            wm_video_url_HQ = play_addr['url_list'][0]
            # The query parameter is "ratio" in both URLs (the watermarked
            # one used to be misspelled "radio").
            wm_video_url = f"https://aweme.snssdk.com/aweme/v1/playwm/?video_id={uri}&ratio=1080p&line=0"
            nwm_video_url = f"https://aweme.snssdk.com/aweme/v1/play/?video_id={uri}&ratio=1080p&line=0"
            return {
                'video_data': {
                    'wm_video_url': wm_video_url,
                    'wm_video_url_HQ': wm_video_url_HQ,
                    'nwm_video_url': nwm_video_url,
                    # Same address with the "playwm" path segment swapped for "play".
                    'nwm_video_url_HQ': wm_video_url_HQ.replace('playwm', 'play'),
                }
            }

        # url_type can only be 'video' or 'image'; this is the image case.
        images = data['images']
        return {
            'image_data': {
                'no_watermark_image_list': [img['url_list'][0] for img in images],
                'watermark_image_list': [img['download_url_list'][0] for img in images],
            }
        }

    @staticmethod
    def _tiktok_media_data(data, url_type):
        """Build the ``video_data`` / ``image_data`` entry for a TikTok post."""
        if url_type == 'video':
            video = data['video']
            wm_video = video['download_addr']['url_list'][0]
            nwm_video = video['play_addr']['url_list'][0]
            # Take the first bit-rate variant for the HQ link. When the list
            # is empty or absent, fall back to the standard no-watermark URL
            # rather than failing the whole request.
            bit_rate = video.get('bit_rate') or []
            nwm_video_HQ = bit_rate[0]['play_addr']['url_list'][0] if bit_rate else nwm_video
            return {
                'video_data': {
                    'wm_video_url': wm_video,
                    'wm_video_url_HQ': wm_video,
                    'nwm_video_url': nwm_video,
                    'nwm_video_url_HQ': nwm_video_HQ,
                }
            }

        # url_type can only be 'video' or 'image'; this is the image case.
        images = data['image_post_info']['images']
        return {
            'image_data': {
                'no_watermark_image_list': [img['display_image']['url_list'][0] for img in images],
                'watermark_image_list': [img['owner_watermark_image']['url_list'][0] for img in images],
            }
        }

    async def main(self):
        """Manual smoke test: parse one hard-coded URL and print the result."""
        # Alternative Douyin test link:
        # url = "https://v.douyin.com/L4FJNR3/"
        url = "https://www.tiktok.com/@evil0ctal/video/7156033831819037994"
        minimal = True
        result = await self.hybrid_parsing_single_video(url, minimal=minimal)
        print(result)


if __name__ == '__main__':
    # Instantiate the hybrid crawler and run the smoke test.
    hybrid_crawler = HybridCrawler()
    asyncio.run(hybrid_crawler.main())