import asyncio  # asynchronous I/O
import os  # filesystem operations
import time  # timing
import yaml  # config file parsing

# Base crawler client and Bilibili API endpoints
from crawlers.base_crawler import BaseCrawler
from crawlers.bilibili.web.endpoints import BilibiliAPIEndpoints
# Bilibili helper utilities
from crawlers.bilibili.web.utils import EndpointModels, bv2av, ResponseAnalyzer

# Directory of this file (config.yaml lives next to it)
path = os.path.abspath(os.path.dirname(__file__))
# Load the config file
with open(f"{path}/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
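
# Expected config.yaml layout, inferred from the key accesses in get_bilibili_headers()
# below. The values shown are placeholders, not real defaults:
#
# TokenManager:
#   bilibili:
#     headers:
#       accept-language: "zh-CN,zh;q=0.9"
#       origin: "https://www.bilibili.com"
#       referer: "https://www.bilibili.com"
#       user-agent: "<browser UA string>"
#       cookie: "<logged-in cookie; some endpoints work without it>"
#     proxies:
#       http: null
#       https: null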


class BilibiliWebCrawler:

    # Build Bilibili request headers and proxies from the config file
    async def get_bilibili_headers(self):
        bili_config = config['TokenManager']['bilibili']
        kwargs = {
            "headers": {
                "accept-language": bili_config["headers"]["accept-language"],
                "origin": bili_config["headers"]["origin"],
                "referer": bili_config["headers"]["referer"],
                "user-agent": bili_config["headers"]["user-agent"],
                "cookie": bili_config["headers"]["cookie"],
            },
            "proxies": {"http://": bili_config["proxies"]["http"], "https://": bili_config["proxies"]["https"]},
        }
        return kwargs
"-------------------------------------------------------handler接口列表-------------------------------------------------------"
# 获取单个视频详情信息
async def fetch_one_video(self, bv_id: str) -> dict:
# 获取请求头信息
kwargs = await self.get_bilibili_headers()
# 创建基础爬虫对象
base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
async with base_crawler as crawler:
# 创建请求endpoint
endpoint = f"{BilibiliAPIEndpoints.POST_DETAIL}?bvid={bv_id}"
# 发送请求,获取请求响应结果
response = await crawler.fetch_get_json(endpoint)
return response

    # Fetch videos posted by a user
    async def fetch_user_post_videos(self, uid: str, pn: int) -> dict:
        """
        :param uid: user uid
        :param pn: page number
        :return:
        """
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = await EndpointModels().user_post_videos_endpoint(uid=uid, pn=pn)
            # Send the request and get the JSON response
            response = await crawler.fetch_get_json(endpoint)
            return response

    # Fetch all of a user's favorite folders
    async def fetch_collect_folders(self, uid: str) -> dict:
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = f"{BilibiliAPIEndpoints.COLLECT_FOLDERS}?up_mid={uid}"
            # Send the request and get the JSON response
            response = await crawler.fetch_get_json(endpoint)
            # Parse the response
            result_dict = await ResponseAnalyzer.collect_folders_analyze(response=response)
            return result_dict

    # Fetch videos inside a specific favorite folder
    async def fetch_folder_videos(self, folder_id: str, pn: int) -> dict:
        """
        :param folder_id: folder id -- can be obtained from fetch_collect_folders
        :param pn: page number
        :return:
        """
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        # Send the request and get the JSON response
        async with base_crawler as crawler:
            endpoint = f"{BilibiliAPIEndpoints.COLLECT_VIDEOS}?media_id={folder_id}&pn={pn}&ps=20&keyword=&order=mtime&type=0&tid=0&platform=web"
            response = await crawler.fetch_get_json(endpoint)
            return response

    # Fetch a user's profile info
    async def fetch_user_profile(self, uid: str) -> dict:
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = await EndpointModels().user_profile_endpoint(uid=uid)
            response = await crawler.fetch_get_json(endpoint=endpoint)
            return response

    # Fetch trending (popular) videos
    async def fetch_com_popular(self, pn: int) -> dict:
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = await EndpointModels().com_popular_endpoint(pn=pn)
            response = await crawler.fetch_get_json(endpoint=endpoint)
            return response

    # Fetch comments for a specific video
    async def fetch_video_comments(self, bv_id: str, pn: int) -> dict:
        # Comment sort order -- 1: by like count, 0: by time
        sort = 1
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = f"{BilibiliAPIEndpoints.VIDEO_COMMENTS}?type=1&oid={bv_id}&sort={sort}&nohot=0&ps=20&pn={pn}"
            # Send the request and get the JSON response
            response = await crawler.fetch_get_json(endpoint)
            return response

    # Fetch replies to a specific comment under a video
    async def fetch_comment_reply(self, bv_id: str, pn: int, rpid: str) -> dict:
        """
        :param bv_id: BV id of the target video
        :param pn: page number
        :param rpid: target comment id, can be obtained from fetch_video_comments
        :return:
        """
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = f"{BilibiliAPIEndpoints.COMMENT_REPLY}?type=1&oid={bv_id}&root={rpid}&ps=20&pn={pn}"
            # Send the request and get the JSON response
            response = await crawler.fetch_get_json(endpoint)
            return response

    # Fetch a user's dynamic (feed) posts
    async def fetch_user_dynamic(self, uid: str, offset: str) -> dict:
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = await EndpointModels().user_dynamic_endpoint(uid=uid, offset=offset)
            # Send the request and get the JSON response
            response = await crawler.fetch_get_json(endpoint)
            return response

    # Fetch info about a specific live room
    async def fetch_live_room_detail(self, room_id: str) -> dict:
        # Get request headers
        kwargs = await self.get_bilibili_headers()
        # Create the base crawler
        base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
        async with base_crawler as crawler:
            # Build the request endpoint
            endpoint = f"{BilibiliAPIEndpoints.LIVEROOM_DETAIL}?room_id={room_id}"
            # Send the request and get the JSON response
            response = await crawler.fetch_get_json(endpoint)
            return response

    # Fetch the video stream of a specific live room
    # async def fetch_live_videos(self, room_id: str) -> dict:
    #     # Get request headers
    #     kwargs = await self.get_bilibili_headers()
    #     # Create the base crawler
    #     base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
    #     async with base_crawler as crawler:
    #         # Build the request endpoint
    #         endpoint = f"{BilibiliAPIEndpoints.LIVE_VIDEOS}?cid={room_id}&quality=4"
    #         # Send the request and get the JSON response
    #         response = await crawler.fetch_get_json(endpoint)
    #         return response
"-------------------------------------------------------utils接口列表-------------------------------------------------------"
# 通过bv号获得视频aid号
async def get_aid(self, bv_id: str) -> int:
aid = await bv2av(bv_id=bv_id)
return aid
# 获取所有直播分区列表
async def fetch_all_live_areas(self) -> dict:
# 获取请求头信息
kwargs = await self.get_bilibili_headers()
# 创建基础爬虫对象
base_crawler = BaseCrawler(proxies=kwargs["proxies"], crawler_headers=kwargs["headers"])
async with base_crawler as crawler:
# 创建请求endpoint
endpoint = BilibiliAPIEndpoints.LIVE_AREAS
# 发送请求,获取请求响应结果
response = await crawler.fetch_get_json(endpoint)
return response
# 根据uid生成wts及其对应w_rid参数(包含dm_img_inter参数)
# (仅示例 不同接口所需要传进去的参数不同)(待改进)
async def uid_to_wrid(self, uid: str) -> dict:
result = await EndpointModels().get_wrid_wts_by_uid(uid=uid)
return result
"-------------------------------------------------------main-------------------------------------------------------"
async def main(self):
"-------------------------------------------------------handler接口列表-------------------------------------------------------"
# 获取单个作品数据
# bv_id = 'BV1M1421t7hT'
# result = await self.fetch_one_video(bv_id=bv_id)
# print(result)
# 获取用户发布作品数据
# uid = '178360345'
# pn = 1
# result = await self.fetch_user_post_videos(uid=uid, pn=pn)
# print(result)
# 获取用户所有收藏夹信息
# uid = '178360345'
# reslut = await self.fetch_collect_folders(uid=uid)
# print(reslut)
# 获取用户指定收藏夹内视频数据
# folder_id = '1756059545' # 收藏夹id可从<获取用户所有收藏夹信息>获得
# pn = 1
# result = await self.fetch_folder_videos(folder_id=folder_id, pn=pn)
# print(result)
# 获取指定用户的信息
# uid = '178360345'
# result = await self.fetch_user_profile(uid=uid)
# print(result)
# 获取综合热门信息
# pn = 1 # 页码
# result = await self.fetch_com_popular(pn=pn)
# print(result)
# 获取指定视频的评论(不登录只能获取一页的评论)
# bv_id = "BV1M1421t7hT"
# pn = 1
# result = await self.fetch_video_comments(bv_id=bv_id, pn=pn)
# print(result)
# 获取视频下指定评论的回复(不登录只能获取一页的评论)
# bv_id = "BV1M1421t7hT"
# rpid = "237109455120"
# pn = 1
# result = await self.fetch_comment_reply(bv_id=bv_id, pn=pn, rpid=rpid)
# print(result)
# 获取指定用户动态
# uid = "16015678"
# offset = "953154282154098691" # 翻页索引,为空即从最新动态开始,可从获得到的动态数据里面获得
# result = await self.fetch_user_dynamic(uid=uid, offset=offset)
# print(result)
# 获取指定直播间信息
# room_id = "22816111"
# result = await self.fetch_live_room_detail(room_id=room_id)
# print(result)
# 获取直播间视频流
# room_id = "22816111"
# result = await self.fetch_user_live_videos_by_room_id(room_id=room_id)
# print(result)
"-------------------------------------------------------utils接口列表-------------------------------------------------------"
# 通过bv号获得视频aid号
# bv_id = 'BV1M1421t7hT'
# aid = await self.get_aid(bv_id=bv_id)
# print(aid)
# 获取所有直播分区列表
# result = await self.fetch_all_live_areas()
# print(result)
# 根据uid生成wts及其对应w_rid参数(包含dm_img_inter参数)
# (仅示例 不同接口所需要传进去的参数不同)(待改进)
# uid = '178360345'
# w_rid = await self.uid_to_wrid(uid=uid)
# print(w_rid)


if __name__ == '__main__':
    # Initialize the crawler (use a distinct variable name so the class is not shadowed)
    bilibili_web_crawler = BilibiliWebCrawler()
    # Start time
    start = time.time()
    asyncio.run(bilibili_web_crawler.main())
    # End time
    end = time.time()
    print(f"Elapsed time: {end - start}")