diff --git a/scraper.py b/scraper.py index 2e90135..019e366 100644 --- a/scraper.py +++ b/scraper.py @@ -2,7 +2,7 @@ # -*- encoding: utf-8 -*- # @Author: https://github.com/Evil0ctal/ # @Time: 2021/11/06 -# @Update: 2022/09/04 +# @Update: 2022/09/16 # @Function: # 核心代码,估值1块(๑•̀ㅂ•́)و✧ # 用于爬取Douyin/TikTok数据并以字典形式返回。 @@ -20,7 +20,7 @@ class Scraper: """ Scraper.douyin(link): 输入参数为抖音视频/图集链接,完成解析后返回字典。 - + Scraper.tiktok(link): 输入参数为TikTok视频/图集链接,完成解析后返回字典。 """ @@ -78,7 +78,8 @@ class Scraper: long_url = r.headers['Location'] # 判断是否为个人主页链接 if 'user' in long_url: - return {'status': 'failed', 'reason': '暂不支持个人主页批量解析', 'function': 'Scraper.douyin()', + return {'status': 'failed', 'reason': '暂不支持个人主页批量解析', + 'function': 'Scraper.douyin()', 'value': original_url} except: # 报错后判断为长链接,直接截取视频id @@ -102,7 +103,8 @@ class Scraper: aweme_id = str(js['item_list'][0]['aweme_id']) share_url = re.sub("/\\?.*", "", js['item_list'][0]['share_url']) if share_url is None: - share_url = ("https://www.iesdouyin.com/share/video/" + aweme_id) if aweme_id is not None else original_url; + share_url = ( + "https://www.iesdouyin.com/share/video/" + aweme_id) if aweme_id is not None else original_url; try: music_share_url = "https://www.iesdouyin.com/share/music/" + str(js['item_list'][0]['music']['mid']) except: @@ -129,7 +131,8 @@ class Scraper: for key in js['item_list'][0]: if key == 'music': # 图集BGM链接 - album_music = str(js['item_list'][0]['music']['play_url']['url_list'][0] if len(js['item_list'][0]['music']['play_url']['url_list']) > 0 else 'No BGM found') + album_music = str(js['item_list'][0]['music']['play_url']['url_list'][0] if len( + js['item_list'][0]['music']['play_url']['url_list']) > 0 else 'No BGM found') # 图集BGM标题 album_music_title = str(js['item_list'][0]['music']['title']) # 图集BGM作者 @@ -351,50 +354,51 @@ class Scraper: except: video_info = None # 从TikTok官方API获取部分视频数据 - tiktok_api_link = 'https://api.tiktokv.com/aweme/v1/aweme/detail/?aweme_id={}'.format( + tiktok_api_link = 'https://api-h2.tiktokv.com/aweme/v1/feed/?version_code=2613&aweme_id={}&device_type=Pixel%204'.format( video_id) print('正在请求API链接:{}'.format(tiktok_api_link)) response = requests.get(url=tiktok_api_link, headers=headers, proxies=self.proxies).text # 将API获取到的内容格式化为JSON result = json.loads(response) + # print(result) if 'image_post_info' in response: # 判断链接是图集链接 url_type = 'album' print('类型为图集/type album') # 视频标题 - album_title = result["aweme_detail"]["desc"] + album_title = result["aweme_list"][0]["desc"] # 视频作者昵称 - album_author_nickname = result["aweme_detail"]['author']["nickname"] + album_author_nickname = result["aweme_list"][0]['author']["nickname"] # 视频作者ID - album_author_id = result["aweme_detail"]['author']["unique_id"] + album_author_id = result["aweme_list"][0]['author']["unique_id"] # 上传时间戳 - album_create_time = result["aweme_detail"]['create_time'] + album_create_time = result["aweme_list"][0]['create_time'] # 视频ID - album_aweme_id = result["aweme_detail"]['statistics']['aweme_id'] + album_aweme_id = result["aweme_list"][0]['statistics']['aweme_id'] try: # 视频BGM标题 - album_music_title = result["aweme_detail"]['music']['title'] + album_music_title = result["aweme_list"][0]['music']['title'] # 视频BGM作者 - album_music_author = result["aweme_detail"]['music']['author'] + album_music_author = result["aweme_list"][0]['music']['author'] # 视频BGM ID - album_music_id = result["aweme_detail"]['music']['id'] + album_music_id = result["aweme_list"][0]['music']['id'] # 视频BGM链接 - album_music_url = result["aweme_detail"]['music']['play_url']['url_list'][0] + album_music_url = result["aweme_list"][0]['music']['play_url']['url_list'][0] except: album_music_title, album_music_author, album_music_id, album_music_url = "None", "None", "None", "None" # 评论数量 - album_comment_count = result["aweme_detail"]['statistics']['comment_count'] + album_comment_count = result["aweme_list"][0]['statistics']['comment_count'] # 获赞数量 - album_digg_count = result["aweme_detail"]['statistics']['digg_count'] + album_digg_count = result["aweme_list"][0]['statistics']['digg_count'] # 播放次数 - album_play_count = result["aweme_detail"]['statistics']['play_count'] + album_play_count = result["aweme_list"][0]['statistics']['play_count'] # 下载次数 - album_download_count = result["aweme_detail"]['statistics']['download_count'] + album_download_count = result["aweme_list"][0]['statistics']['download_count'] # 分享次数 - album_share_count = result["aweme_detail"]['statistics']['share_count'] + album_share_count = result["aweme_list"][0]['statistics']['share_count'] # 无水印图集 album_list = [] - for i in result["aweme_detail"]['image_post_info']['images']: + for i in result["aweme_list"][0]['image_post_info']['images']: album_list.append(i['display_image']['url_list'][0]) # 结束时间 end = time.time() @@ -430,53 +434,53 @@ class Scraper: url_type = 'video' print('类型为视频/type video') # 无水印视频链接 - nwm_video_url = result["aweme_detail"]["video"]["play_addr"]["url_list"][0] + nwm_video_url = result["aweme_list"][0]["video"]["play_addr"]["url_list"][0] try: # 有水印视频链接 - wm_video_url = result["aweme_detail"]["video"]['download_addr']['url_list'][0] + wm_video_url = result["aweme_list"][0]["video"]['download_addr']['url_list'][0] except Exception: # 有水印视频链接 wm_video_url = 'None' # 视频标题 - video_title = result["aweme_detail"]["desc"] + video_title = result["aweme_list"][0]["desc"] # 视频作者昵称 - video_author_nickname = result["aweme_detail"]['author']["nickname"] + video_author_nickname = result["aweme_list"][0]['author']["nickname"] # 视频作者ID - video_author_id = result["aweme_detail"]['author']["unique_id"] + video_author_id = result["aweme_list"][0]['author']["unique_id"] # 上传时间戳 - video_create_time = result["aweme_detail"]['create_time'] + video_create_time = result["aweme_list"][0]['create_time'] # 视频ID - video_aweme_id = result["aweme_detail"]['statistics']['aweme_id'] + video_aweme_id = result["aweme_list"][0]['statistics']['aweme_id'] try: # 视频BGM标题 - video_music_title = result["aweme_detail"]['music']['title'] + video_music_title = result["aweme_list"][0]['music']['title'] # 视频BGM作者 - video_music_author = result["aweme_detail"]['music']['author'] + video_music_author = result["aweme_list"][0]['music']['author'] # 视频BGM ID - video_music_id = result["aweme_detail"]['music']['id'] + video_music_id = result["aweme_list"][0]['music']['id'] # 视频BGM链接 - video_music_url = result["aweme_detail"]['music']['play_url']['url_list'][0] + video_music_url = result["aweme_list"][0]['music']['play_url']['url_list'][0] except: video_music_title, video_music_author, video_music_id, video_music_url = "None", "None", "None", "None" # 评论数量 - video_comment_count = result["aweme_detail"]['statistics']['comment_count'] + video_comment_count = result["aweme_list"][0]['statistics']['comment_count'] # 获赞数量 - video_digg_count = result["aweme_detail"]['statistics']['digg_count'] + video_digg_count = result["aweme_list"][0]['statistics']['digg_count'] # 播放次数 - video_play_count = result["aweme_detail"]['statistics']['play_count'] + video_play_count = result["aweme_list"][0]['statistics']['play_count'] # 下载次数 - video_download_count = result["aweme_detail"]['statistics']['download_count'] + video_download_count = result["aweme_list"][0]['statistics']['download_count'] # 分享次数 - video_share_count = result["aweme_detail"]['statistics']['share_count'] + video_share_count = result["aweme_list"][0]['statistics']['share_count'] # 视频封面 - video_cover = result["aweme_detail"]['video']['cover']['url_list'][0] + video_cover = result["aweme_list"][0]['video']['cover']['url_list'][0] # 视频动态封面 - video_dynamic_cover = result["aweme_detail"]['video']['dynamic_cover']['url_list'][0] + video_dynamic_cover = result["aweme_list"][0]['video']['dynamic_cover']['url_list'][0] # 视频原始封面 - video_origin_cover = result["aweme_detail"]['video']['origin_cover']['url_list'][0] + video_origin_cover = result["aweme_list"][0]['video']['origin_cover']['url_list'][0] # 将话题保存在列表中 video_hashtags = [] - for tag in result["aweme_detail"]['text_extra']: + for tag in result["aweme_list"][0]['text_extra']: if 'hashtag_name' in tag: video_hashtags.append(tag['hashtag_name']) else: