直接开始
1.随便找个录播点开,可以看到访问的url里面包含有视频id, 后面会用到
2.F12打开调试模式 ,切换到XHR,视频点击播放,可以看到是一段一段的ts文件,观察下访问的链接,中间有数值是在增加的
3.把链接复制下来看看规律,可以看到.ts前面(_之后)是一个递增的数字,在_之前应该是10位时间戳,每次增加6秒
但是递增的起始值和结束值是未知的,不可能每次还要打开浏览器去拉进度条找起始值和结束值,而且时间戳应该也不是每次都是递增6秒,不太可靠,到这里想到应该是有接口请求获取这些数据的
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167042_174688.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167048_174689.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167054_174690.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167060_174691.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167066_174692.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167072_174693.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167078_174694.ts
1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167084_174695.ts
F5刷新,搜索下这几个变动的数值,找到这个接口请求返回结果,就是对应的数据了
拿到的对应数据请求的url
https://v-replay.cdn.huya.com/record/huyalive/1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/2022-01-14-21:30:08_2022-01-15-01:07:07.m3u8?bitrate=0&client=81&definition=yuanhua&pid=1199536199401&scene=vodreplay&vid=641112835
但这个url自己构建还是比较麻烦,刚好在找这个接口的时候,看到别的接口好像有返回.m3u8这样的链接,于是搜索了下链接中间的“2022-01-14-21:30:08_2022-01-15-01:07:07.m3u8”
4.拿到这个url请求获取到“请求视频链接数据的url”
https://v-api-player-ssl.huya.com/?callback=jQuery112406694988512505824_1642474121474&r=vhuyaplay/video&vid=641112835&format=mp4,m3u8&_=1642474121488
# 这个链接可以简化下(去掉callback参数)
https://v-api-player-ssl.huya.com/?r=vhuyaplay/video&vid=641112835&format=mp4,m3u8&_=1642474121488
# vid=641112835 是视频id,最上面直接访问网页的url中就有包含
# _=1642474121488 是毫秒级的时间戳, round(time.time() * 1000) 就可以获取的到
def get_video_param_requests_url(vid):
    """Return the first transcode URL reported by the player API for a video.

    Args:
        vid: Video id, as it appears in the URL of the replay page.

    Returns:
        The value at ``result -> items[0] -> transcode -> urls[0]`` of the
        JSON reply. Example of a returned playlist URL:
        https://v-replay.cdn.huya.com/record/huyalive/1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/2022-01-14-21:30:08_2022-01-15-01:07:07.m3u8?bitrate=0&client=81&definition=yuanhua&pid=1199536199401&scene=vodreplay&vid=641112835
    """
    # The trailing "_" parameter is the current time in milliseconds.
    url = f'https://v-api-player-ssl.huya.com/?r=vhuyaplay%2Fvideo&vid={vid}&format=mp4%2Cm3u8&_={round(time.time() * 1000)}'
    r = session.get(url).json()
    return r['result']['items'][0]['transcode']['urls'][0]
5.接下来就用获取到的链接去获取视频链接的数据, 正则匹配出来
def get_video_url_param(param_url):
    """Fetch ``param_url`` and extract the .ts segment file names from it.

    Args:
        param_url: URL whose response text lists the segment files.

    Returns:
        Every ``<10 digits>_<digits>.ts`` name found in the response text, in
        order of appearance, e.g.
        ['1642167012_174683.ts', '1642167018_174684.ts', '1642167024_174685.ts', ...].
    """
    r = session.get(param_url)
    # Raw string, and the dot is escaped so only a literal ".ts" suffix matches.
    return re.findall(r'\d{10}_\d+\.ts', r.text)
6.接下来把获取到的数据拼接成一个一个的url
# 这里就要用到我们前面找到的请求ts文件的地址了
# https://v-replay.cdn.huya.com/record/huyalive/1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167018_174684.ts?bitrate=0&client=81&definition=yuanhua&pid=1199536199401&scene=vodreplay&vid=641112835
# 1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus
# 观察发现这段内容会随视频ID变化,不过也好办:前面获取到的“请求数据”的url里面就包含这一段,同一个视频id下它是不会变的
video_param_url = 'https://v-replay.cdn.huya.com/record/huyalive/1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/2022-01-14-21:30:08_2022-01-15-01:07:07.m3u8?bitrate=0&client=81&definition=yuanhua&pid=1199536199401&scene=vodreplay&vid=641112835'
# Match on the path only, so the scheme (http/https) and the CDN host do not
# matter; the captured group is the single path segment after "huyalive/".
video_info = re.findall(r'/record/huyalive/([^/]+)/', video_param_url)
# bitrate=0&client=81&definition=yuanhua&pid=1199536199401&scene=vodreplay&vid=641112835
# 这段是可以精简掉的
def get_video_url(info, param_list):
    """Build the download URL of every segment.

    Args:
        info: The path segment that follows ``/record/huyalive/``.
        param_list: Segment file names such as '1642167012_174683.ts'.

    Returns:
        One URL per entry of ``param_list``, in the same order, e.g.
        ['http://v-replay-hw.cdn.huya.com/record/huyalive/1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167012_174683.ts',
         'http://v-replay-hw.cdn.huya.com/record/huyalive/1199536199401-1199536199401-0-2399072522258-10057-A-1586788055-1-imgplus/1642167018_174684.ts',
         ...]
    """
    return [
        f'http://v-replay-hw.cdn.huya.com/record/huyalive/{info}/{name}'
        for name in param_list
    ]
7.链接获取到了,可以进行下载了,这里用了下异步协程,不然太慢了
async def download_video(video_url, save_path):
    """Download one segment and store it under ``<save_path>/ts/``.

    The blocking ``session.get`` call is handed to the running loop's default
    executor so that several downloads can be awaited concurrently.

    Args:
        video_url: Full URL of the segment; the text after its last "/" is
            used as the local file name.
        save_path: Base directory; the ``ts`` sub-directory is created when
            it does not exist yet.
    """
    print(f'开始下载:{video_url}')
    # Inside a coroutine the running loop is the right one to use.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, session.get, video_url)
    if not response.ok:
        # Do not store an HTTP error body as if it were video data.
        print(f'下载失败({response.status_code}):{video_url}')
        return
    file_name = video_url.rsplit('/', 1)[-1]
    ts_dir = f'{save_path}/ts'
    os.makedirs(ts_dir, exist_ok=True)
    with open(f'{ts_dir}/{file_name}', mode='wb') as f:
        f.write(response.content)
8.接下来就是把ts文件合并成mp4
def ts_to_mp4(vid, file_path):
    """Concatenate the downloaded segments into ``<file_path>/mp4/<vid>.mp4``.

    The ``.ts`` files found in ``<file_path>/ts/`` are appended byte for byte,
    ordered by the numbers in their names (timestamp, then sequence number).
    An existing output file is overwritten.

    Args:
        vid: Video id, used as the output file name.
        file_path: Base directory containing the ``ts`` sub-directory.
    """
    ts_path = f'{file_path}/ts/'
    mp4_path = f'{file_path}/mp4/'
    os.makedirs(mp4_path, exist_ok=True)

    def numeric_key(name):
        # "1642167012_174683.ts" -> [1642167012, 174683]
        return [int(part) for part in re.findall(r'\d+', name)]

    # os.listdir() returns entries in arbitrary order, so sort explicitly.
    files = sorted(
        (name for name in os.listdir(ts_path) if name.endswith('.ts')),
        key=numeric_key,
    )
    # Open the target once, in 'wb', so a second run does not append to the
    # result of a previous one.
    with open(mp4_path + f"{vid}.mp4", 'wb') as merged:
        for file in tqdm(files, desc="正在合并视频:"):
            if os.path.exists(ts_path + file):
                with open(ts_path + file, 'rb') as segment:
                    merged.write(segment.read())
            else:
                print("合并视频失败")
完整代码
# encoding:utf-8
"""
Created on 2022-01-16
@author: zhuo
"""
import asyncio
import os
import random
import re
import time
from requests_html import HTMLSession
from tqdm import tqdm
# Module-level HTTP session used by every request function below.
session = HTMLSession()
def get_video_param_requests_url(vid):
    """Return the first transcode URL reported by the player API for a video.

    Args:
        vid: Video id, as it appears in the URL of the replay page.

    Returns:
        The value at ``result -> items[0] -> transcode -> urls[0]`` of the
        JSON reply.
    """
    # The trailing "_" parameter is the current time in milliseconds.
    url = f'https://v-api-player-ssl.huya.com/?r=vhuyaplay%2Fvideo&vid={vid}&format=mp4%2Cm3u8&_={round(time.time() * 1000)}'
    r = session.get(url).json()
    return r['result']['items'][0]['transcode']['urls'][0]
def get_video_url_param(param_url):
    """Fetch ``param_url`` and extract the .ts segment file names from it.

    Args:
        param_url: URL whose response text lists the segment files.

    Returns:
        Every ``<10 digits>_<digits>.ts`` name found in the response text, in
        order of appearance.
    """
    r = session.get(param_url)
    # Raw string, and the dot is escaped so only a literal ".ts" suffix matches.
    return re.findall(r'\d{10}_\d+\.ts', r.text)
def get_video_url(info, param_list):
    """Build the download URL of every segment.

    Args:
        info: The path segment that follows ``/record/huyalive/``.
        param_list: Segment file names such as '1642167012_174683.ts'.

    Returns:
        One URL per entry of ``param_list``, in the same order.
    """
    return [
        f'http://v-replay-hw.cdn.huya.com/record/huyalive/{info}/{name}'
        for name in param_list
    ]
def ts_to_mp4(vid, file_path):
    """Concatenate the downloaded segments into ``<file_path>/mp4/<vid>.mp4``.

    The ``.ts`` files found in ``<file_path>/ts/`` are appended byte for byte,
    ordered by the numbers in their names (timestamp, then sequence number).
    An existing output file is overwritten.

    Args:
        vid: Video id, used as the output file name.
        file_path: Base directory containing the ``ts`` sub-directory.
    """
    ts_path = f'{file_path}/ts/'
    mp4_path = f'{file_path}/mp4/'
    os.makedirs(mp4_path, exist_ok=True)

    def numeric_key(name):
        # "1642167012_174683.ts" -> [1642167012, 174683]
        return [int(part) for part in re.findall(r'\d+', name)]

    # os.listdir() returns entries in arbitrary order, so sort explicitly.
    files = sorted(
        (name for name in os.listdir(ts_path) if name.endswith('.ts')),
        key=numeric_key,
    )
    # Open the target once, in 'wb', so a second run does not append to the
    # result of a previous one.
    with open(mp4_path + f"{vid}.mp4", 'wb') as merged:
        for file in tqdm(files, desc="正在合并视频:"):
            if os.path.exists(ts_path + file):
                with open(ts_path + file, 'rb') as segment:
                    merged.write(segment.read())
            else:
                print("合并视频失败")
async def download_video(video_url, save_path):
    """Download one segment and store it under ``<save_path>/ts/``.

    The blocking ``session.get`` call is handed to the running loop's default
    executor so that several downloads can be awaited concurrently.

    Args:
        video_url: Full URL of the segment; the text after its last "/" is
            used as the local file name.
        save_path: Base directory; the ``ts`` sub-directory is created when
            it does not exist yet.
    """
    print(f'开始下载:{video_url}')
    # Inside a coroutine the running loop is the right one to use.
    loop = asyncio.get_running_loop()
    response = await loop.run_in_executor(None, session.get, video_url)
    if not response.ok:
        # Do not store an HTTP error body as if it were video data.
        print(f'下载失败({response.status_code}):{video_url}')
        return
    file_name = video_url.rsplit('/', 1)[-1]
    ts_dir = f'{save_path}/ts'
    os.makedirs(ts_dir, exist_ok=True)
    with open(f'{ts_dir}/{file_name}', mode='wb') as f:
        f.write(response.content)
async def _download_all(url_list, save_path):
    """Download every URL in ``url_list`` concurrently and wait for all."""
    await asyncio.gather(*(download_video(url, save_path) for url in url_list))


def main(video_id, path):
    """Download all segments of a replay and merge them into one file.

    Up to three attempts are made to obtain the playlist URL and its segment
    list, with a random 2-5 second pause between attempts.

    Args:
        video_id: Video id, as it appears in the URL of the replay page.
        path: Base directory that receives the ``ts`` and ``mp4`` folders.
    """
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        video_param_url = get_video_param_requests_url(video_id)
        # Match on the path only, so the scheme and CDN host do not matter.
        video_info = re.findall(r'/record/huyalive/([^/]+)/', video_param_url)
        video_url_param_list = get_video_url_param(video_param_url) if video_info else []
        if video_url_param_list:
            url_list = get_video_url(video_info[0], video_url_param_list)
            # asyncio.wait() no longer accepts bare coroutines (Python 3.11+)
            # and rejects an empty collection, so gather inside one coroutine.
            asyncio.run(_download_all(url_list, path))
            ts_to_mp4(video_id, path)
            print('执行完成')
            return
        if attempt < max_attempts:
            # Only announce and wait when another attempt actually follows.
            print(f'链接ts参数获取失败 正在执行第{attempt}次重试...')
            time.sleep(random.randint(2, 5))
    print('执行失败')
if __name__ == '__main__':
    # Example run: the video id is taken from the URL of the replay page.
    video_id = 641112835
    path = f'D:/video/{video_id}'
    main(video_id, path)