教你如何下载斗罗大陆
仅供学习使用
F12获取m3u8文件链接
(有时这个文件会找不到,不知道为啥)
推荐用这个url,py代码会提取vurl
Python,给爷爬
import requests
import re
import os
import urllib.parse
import time
# Download every .ts segment listed in an m3u8 playlist into ./ts/ .
start_time = time.time()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
m3u8_url = input("m3u8_url:")
# URL-decode: the pasted link is usually percent-encoded.
m3u8_url = urllib.parse.unquote(m3u8_url)
# If a player URL was pasted, extract the real playlist from its vurl= parameter.
r_url = r'.*&vurl=(.*ver=4)'
url_list = re.findall(r_url, m3u8_url)
if url_list:
    m3u8_url = url_list[0]
# One Session reuses the TCP connection for all segment requests; an explicit
# timeout keeps a dead server from hanging the script forever.
session = requests.Session()
session.headers.update(headers)
m3u8 = session.get(m3u8_url, timeout=30).text
# Each segment name sits between the ",\n" ending its duration line and the
# next "#" directive (re.S lets "." cross newlines).
r_m3u8 = ',\n(.*?)#'
ts_all = re.findall(r_m3u8, m3u8, re.S)
print('共 %d 个ts文件' % len(ts_all))
# Base URL = playlist URL with the trailing file name stripped.
r_ts = r'(.*)/.*'
ts_1 = re.findall(r_ts, m3u8_url)[0]
os.makedirs("./ts", exist_ok=True)  # no race between exists() check and mkdir
for num, ts_2 in enumerate(ts_all):
    ts_url = ts_1 + '/' + ts_2
    ts = session.get(ts_url, timeout=30).content
    with open('./ts/%d.ts' % num, 'wb') as fp:
        fp.write(ts)
    print('%d.ts save' % num)
end_time = time.time()
print('下载完成,总耗时:', end_time - start_time)
用 bat 批处理脚本把下载好的 ts 文件按编号合并成一个文件
rem Concatenate numbered ts segments (start..end, inclusive) into one mp4.
cd ts
set /p n=起始数字:
set /p end=结束数字:
rem Remember the original range for the output name before n is mutated.
set num=%n%-%end%
copy %n%.ts out.ts
:home
rem Check BEFORE incrementing: the original looped once even when start==end
rem and tried to append a nonexistent (end+1).ts.
if %n%==%end% goto done
set /a n+=1
echo Y | copy /b out.ts+%n%.ts temp.ts && move /y temp.ts out.ts
goto home
:done
move /y out.ts ../out_%num%.mp4
pause
默认720p,更高画质可能需要会员的cookie
更新
加入协程函数,但是不太稳定,有的ts会卡死
等我以后再优化吧
问题可能和缓存的tasks有关,可以尝试在代码结束后取消任务并关闭事件循环
for task in asyncio.Task.all_tasks():
task.cancel()
loop.close()
关闭也不行,有时还是会卡
完整协程代码(已废弃,最新代码在下面)
import requests
import re
import os
import urllib.parse
import asyncio
import aiohttp
import time
# Coroutine-based downloader (deprecated version): fetch all segments
# concurrently, then retry any failures synchronously.
start_time = time.time()
# Segments that failed inside the coroutine; retried with requests at the end.
error = []

async def get(num, url):
    """Fetch one ts segment and save it as ./ts/<num>.ts; log and record failures."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    }
    print('%d.ts save start' % num)
    try:
        async with aiohttp.ClientSession() as session:
            # session.get(...) is itself an async context manager;
            # the original's extra "await" before it was redundant.
            async with session.get(url, headers=headers) as response:
                ts = await response.read()
        with open('./ts/%d.ts' % num, 'wb') as fp:
            fp.write(ts)
        print('%d.ts save finish' % num)
    except Exception:
        # Narrowed from a bare "except:" (which also swallowed
        # KeyboardInterrupt/CancelledError); failures are retried below.
        print('%d.ts save error' % num)
        error.append([num, url])

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
m3u8_url = input("m3u8_url:")
# URL-decode the pasted link.
m3u8_url = urllib.parse.unquote(m3u8_url)
# Extract the real playlist from a player URL's vurl= parameter, if present.
r_url = r'.*&vurl=(.*ver=4)'
url_list = re.findall(r_url, m3u8_url)
if url_list:
    m3u8_url = url_list[0]
m3u8 = requests.get(m3u8_url, headers=headers, timeout=30).text
# Segment name lies between the ",\n" after its duration line and the next "#".
r_m3u8 = ',\n(.*?)#'
ts_all = re.findall(r_m3u8, m3u8, re.S)
print('共 %d 个ts文件' % len(ts_all))
# Base URL = playlist URL minus the trailing file name.
r_ts = r'(.*)/.*'
ts_1 = re.findall(r_ts, m3u8_url)[0]
os.makedirs("./ts", exist_ok=True)
tasks = [asyncio.ensure_future(get(num, ts_1 + '/' + seg))
         for num, seg in enumerate(ts_all)]
loop = asyncio.get_event_loop()
if tasks:  # asyncio.wait raises ValueError on an empty set
    loop.run_until_complete(asyncio.wait(tasks))
# NOTE: the original also called asyncio.Task.all_tasks() here to cancel
# leftovers, but that API was removed in Python 3.9 and every task above is
# already finished, so cancelling was a no-op.
loop.close()
# Synchronous fallback for the (rare) failed segments.
if not error:
    print('全部下载成功')
else:
    # BUG FIX: the original referenced the undefined name "errror" here,
    # raising NameError on the very path meant to handle failures.
    print('失败 %d 次\n处理下载失败的内容' % len(error))
    for num, url in error:
        ts = requests.get(url, headers=headers, timeout=30).content
        with open('./ts/%d.ts' % num, 'wb') as fp:
            fp.write(ts)
        print('%d.ts save' % num)
end_time = time.time()
print('下载完成,总耗时:', end_time - start_time)
协程的速度还是很快的(左普通下载,右协程下载)
也许还存在问题,以后再说吧
想了个折中的办法
设置了每次开始的任务数量,循环调用协程函数,防止等待超时
暂时情况稳定,不会卡
最新协程代码
import requests
import re
import os
import urllib.parse
import asyncio
import aiohttp
import time
# Coroutine-based downloader (batched version): start at most max_link
# downloads at a time so no single task waits long enough to stall, then
# retry any failures synchronously.
start_time = time.time()
# Segments that failed inside the coroutine; retried with requests at the end.
error = []

async def get(num, url):
    """Fetch one ts segment and save it as ./ts/<num>.ts; log and record failures."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    }
    print('%d.ts save start' % num)
    try:
        async with aiohttp.ClientSession() as session:
            # session.get(...) is itself an async context manager;
            # the original's extra "await" before it was redundant.
            async with session.get(url, headers=headers) as response:
                ts = await response.read()
        with open('./ts/%d.ts' % num, 'wb') as fp:
            fp.write(ts)
        print('%d.ts save finish' % num)
    except Exception:
        # Narrowed from a bare "except:" (which also swallowed
        # KeyboardInterrupt/CancelledError); failures are retried below.
        print('%d.ts save error' % num)
        error.append([num, url])

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
m3u8_url = input("m3u8_url:")
# URL-decode the pasted link.
m3u8_url = urllib.parse.unquote(m3u8_url)
# Extract the real playlist from a player URL's vurl= parameter, if present.
r_url = r'.*&vurl=(.*ver=4)'
url_list = re.findall(r_url, m3u8_url)
if url_list:
    m3u8_url = url_list[0]
m3u8 = requests.get(m3u8_url, headers=headers, timeout=30).text
# Segment name lies between the ",\n" after its duration line and the next "#".
r_m3u8 = ',\n(.*?)#'
ts_all = re.findall(r_m3u8, m3u8, re.S)
print('共 %d 个ts文件' % len(ts_all))
# Base URL = playlist URL minus the trailing file name.
r_ts = r'(.*)/.*'
ts_1 = re.findall(r_ts, m3u8_url)[0]
os.makedirs("./ts", exist_ok=True)
# Cap concurrent downloads per batch; the suspected stall came from
# re-waiting the ever-growing accumulated task list, so each batch now
# waits only on its own tasks.
max_link = 20
loop = asyncio.get_event_loop()
for batch_start in range(0, len(ts_all), max_link):
    print('第 %d 个任务' % (batch_start // max_link + 1))
    tasks = [asyncio.ensure_future(get(num, ts_1 + '/' + ts_all[num]))
             for num in range(batch_start, min(batch_start + max_link, len(ts_all)))]
    loop.run_until_complete(asyncio.wait(tasks))
# NOTE: the original also called asyncio.Task.all_tasks() here to cancel
# leftovers, but that API was removed in Python 3.9 and every batch has
# already completed, so cancelling was a no-op.
loop.close()
# Synchronous fallback for the (rare) failed segments.
if not error:
    print('全部下载成功')
else:
    # BUG FIX: the original referenced the undefined name "errror" here,
    # raising NameError on the very path meant to handle failures.
    print('失败 %d 次\n处理下载失败的内容' % len(error))
    for num, url in error:
        ts = requests.get(url, headers=headers, timeout=30).content
        with open('./ts/%d.ts' % num, 'wb') as fp:
            fp.write(ts)
        print('%d.ts save' % num)
end_time = time.time()
print('下载完成,总耗时:', end_time - start_time)