🐛 Fix TikTok method bug

This commit is contained in:
Evil0ctal 2022-08-28 18:23:38 -07:00 committed by GitHub
parent e84d0e3a34
commit 55519d5f54
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -2,10 +2,11 @@
# -*- encoding: utf-8 -*- # -*- encoding: utf-8 -*-
# @Author: https://github.com/Evil0ctal/ # @Author: https://github.com/Evil0ctal/
# @Time: 2021/11/06 # @Time: 2021/11/06
# @Update: 2022/08/08 # @Update: 2022/08/28
# @Function: # @Function:
# 核心代码估值1块(๑•̀ㅂ•́)و✧ # 核心代码估值1块(๑•̀ㅂ•́)و✧
# 用于爬取Douyin/TikTok数据并以字典形式返回。 # 用于爬取Douyin/TikTok数据并以字典形式返回。
# input link, output dictionary.
import re import re
@@ -17,8 +18,11 @@ from tenacity import *
class Scraper: class Scraper:
""" """
Scraper.douyin():抖音视频/图集解析返回字典 Scraper.douyin(link):
Scraper.tiktok():TikTok视频解析返回字典 输入参数为抖音视频/图集链接完成解析后返回字典
Scraper.tiktok(link):
输入参数为TikTok视频/图集链接完成解析后返回字典
""" """
def __init__(self): def __init__(self):
@@ -305,7 +309,6 @@ class Scraper:
start = time.time() start = time.time()
# 校验TikTok链接 # 校验TikTok链接
if '@' in original_url: if '@' in original_url:
original_url = original_url
print("目标链接: ", original_url) print("目标链接: ", original_url)
else: else:
# 从请求头中获取原始链接 # 从请求头中获取原始链接
@@ -321,10 +324,10 @@ class Scraper:
# 获取视频ID # 获取视频ID
video_id = re.findall('/video/(\d+)?', original_url)[0] video_id = re.findall('/video/(\d+)?', original_url)[0]
print('获取到的TikTok视频ID是{}'.format(video_id)) print('获取到的TikTok视频ID是{}'.format(video_id))
# 尝试从TikTok网页获取部分视频数据,失败后判断为图集 # 尝试从TikTok网页获取部分视频数据
try: try:
tiktok_headers = self.tiktok_headers tiktok_headers = self.tiktok_headers
html = requests.get(url=original_url, headers=tiktok_headers, proxies=self.proxies) html = requests.get(url=original_url, headers=tiktok_headers, proxies=self.proxies, timeout=1)
# 正则检索网页中存在的JSON信息 # 正则检索网页中存在的JSON信息
resp = re.search('"ItemModule":{(.*)},"UserModule":', html.text).group(1) resp = re.search('"ItemModule":{(.*)},"UserModule":', html.text).group(1)
resp_info = ('{"ItemModule":{' + resp + '}}') resp_info = ('{"ItemModule":{' + resp + '}}')
@@ -343,7 +346,7 @@ class Scraper:
if 'image_post_info' in response: if 'image_post_info' in response:
# 判断链接是图集链接 # 判断链接是图集链接
url_type = 'album' url_type = 'album'
print('类型为图集') print('类型为图集/type album')
# 视频标题 # 视频标题
album_title = result["aweme_detail"]["desc"] album_title = result["aweme_detail"]["desc"]
# 视频作者昵称 # 视频作者昵称
@@ -411,7 +414,7 @@ class Scraper:
else: else:
# 类型为视频 # 类型为视频
url_type = 'video' url_type = 'video'
print('类型为视频') print('类型为视频/type video')
# 无水印视频链接 # 无水印视频链接
nwm_video_url = result["aweme_detail"]["video"]["play_addr"]["url_list"][0] nwm_video_url = result["aweme_detail"]["video"]["play_addr"]["url_list"][0]
try: try: