m3u8就不多介绍了,对一个m3u8链接发起请求,得到的结果大概就是下面这样,得到一串下载视频链接。m3u8视频文件有加密的,也有没加密的,下面这个就是加密的,图中右侧第6行显示通过AES加密,大多都采用这种加密方法,后面跟着的是密钥链接,可以获取密钥key。解密的另一个重要参数是iv值,也是从m3u8文件中获取,如果没有,比如下面的这张图,就没有iv字段,就用EXT-X-MEDIA-SEQUENCE的值作为iv值,如下图中其值为‘0’,使用大端字节序,如果长度不足,就往左填充 0 直到序列号满足128位,一般是全0。Windows下对AES解密,需要安装pycryptodome、pycrypto两个模块。
如果安装了Anaconda,建议通过下面的方式安装
conda install pycryptodome
conda install pycrypto
没安装的话,就把conda替换为pip
因为请求的链接很多,都是费时的IO操作,因此采用协程来提高速度,我这里采用的是基于C实现的gevent高性能并发网络库,真的比Python自带的async好用太多!!async用起来总是别扭
主要步骤:1、根据m3u8链接获取ts视频下载链接,构建下载列表;2、创建协程任务池,并发请求;3、合并下载的大量ts视频片段(少则几百个,多则几千个);4、删除ts视频片段。
下面提供的程序,加密的、没加密的都能处理,还可以去广告,有些视频会在视频中插入广告,广告部分一般不加密,因此会出现分割行,如下面所示,特征是加密方法为NONE,即METHOD=NONE,如第一个红箭头所指,下面的几个链接就是广告,广告之后就是正常的视频,这时候又开始加密了,如第二个箭头所指,此时的加密方法就变成了METHOD=AES-128,而且可以看到广告的请求链接不同,可以根据这两个特点将其剔除
单进程版
from gevent import monkey; monkey.patch_all()  # must stay first: patches the stdlib before anything else imports it

import os
import random
import time  # needed by m3u8_downloader.main (time.sleep) — missing in the original

import requests
from Crypto.Cipher import AES
from gevent.pool import Pool
class m3u8_downloader(object):
    """Download all ts segments of an m3u8 playlist, decrypt them when the
    stream is AES-128/CBC encrypted, then merge them into one video file.

    Relies on the module-level globals ``m3u8_url``, ``ts_path``,
    ``save_path``, ``video_name`` and ``mcn`` defined in the __main__ section.
    """

    def __init__(self):
        self.headers = {
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
        }
        # Small UA pool; one entry is picked at random for every request.
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3875.400 QQBrowser/10.8.4492.400'
        ]
        # BUG FIX: the original wrote `self. Key = None`, which creates an
        # attribute named `Key`; every later `self.key` read then raised
        # AttributeError.
        self.key = None                  # AES key text; None => stream not encrypted
        self.iv = b'0000000000000000'    # default IV: 16 ASCII zeros
        self.cipher = None               # kept for compatibility; download() builds a fresh cipher per segment
        self.failed_urls = []            # urls that failed; retried by main()
        self.num = 0                     # total number of segments
        self.cur = 0                     # number of segments downloaded so far

    def download(self, url):
        """Fetch (and if needed decrypt) one segment.

        ``url`` is '<index>.ts@<absolute url>'; failed requests are queued in
        ``self.failed_urls`` so main() can retry them.
        """
        name, video_url = url.split('@')  # split off the man-made '<n>.ts' marker
        try:
            requests.packages.urllib3.disable_warnings()  # we request with verify=False
            self.headers['User-Agent'] = random.choice(self.user_agents)
            content = requests.get(video_url, stream=True, headers=self.headers, verify=False).content
        except Exception:
            self.failed_urls.append(url)
            return
        if self.key:
            try:
                # BUG FIX: a CBC cipher object keeps chaining state between
                # decrypt() calls, so reusing one shared cipher corrupts every
                # segment after the first.  Build a fresh cipher per segment.
                cipher = AES.new(self.key.encode('utf-8'), AES.MODE_CBC, self.iv)
                content = cipher.decrypt(content)
            except Exception:
                return
        with open(os.path.join(ts_path, name), 'wb') as f:
            f.write(content)
        self.cur += 1
        print('\r', '下载进度:%d/%d' % (self.cur, self.num), end='', flush=True)

    def get_download_list(self):
        """Parse the m3u8 playlist and return ['<n>.ts@<url>', ...] jobs."""
        response = requests.get(m3u8_url, headers=self.headers)
        playlist = response.text.split('\n')
        url_list = []
        # count renumbers the segments so they can be merged back in order;
        # flag (url prefix of the key link) was meant to filter ad segments.
        count, flag = 1, 'NONE'
        split = m3u8_url.split("/")
        prefix = split[0] + "//" + split[2]  # scheme://host
        for each in playlist:
            each = each.strip()  # ROBUSTNESS: drop '\r' from CRLF playlists
            if 'EXT-X-MEDIA-SEQUENCE' in each:
                # Playlist carries no IV attribute: use the media sequence
                # number, left-padded with '0' to 16 bytes (big-endian).
                sequence = each.split(':')[1]
                self.iv = bytes('0' * (16 - len(sequence)) + sequence, encoding='utf-8')
                print('iv值:', self.iv)
            if 'AES' in each and "key" in each and not self.key:
                key_url = each.split('"')[1]
                if len(key_url):
                    key_url = key_url if "https" in key_url else prefix + key_url
                    self.headers['User-Agent'] = random.choice(self.user_agents)
                    self.key = requests.get(key_url, headers=self.headers).text
                    flag = key_url.split('key')[0]
                    print('AES Key链接:', key_url)
                    print('该视频文件被加密,AES加密密钥:', self.key)
            # BUG FIX: only rewrite a *trailing* "jpg" (segments disguised as
            # images) — str.replace() would also mangle "jpg" inside the path.
            url = each[:-3] + "ts" if each.endswith("jpg") else each
            if url.endswith('ts'):
                # NOTE(review): the original "ad filter" had byte-identical
                # then/else branches (`flag in url` or not), so every segment
                # was kept.  That behavior is preserved here.
                url_list.append(str(count) + '.ts@' + (url if "https" in url else prefix + url))
                count += 1
        self.num = len(url_list)
        print('共获取视频文件数:', self.num)
        return url_list

    def merge_video(self):
        """Concatenate the numbered fragments into save_path/video_name."""
        video_list = os.listdir(ts_path)
        total = len(video_list)
        # '12.ts'[:-3] -> '12'; numeric sort restores playlist order.
        video_list = sorted(video_list, key=lambda x: int(x[:-3]))
        count = 0
        with open(os.path.join(save_path, video_name), 'wb+') as out:
            for video in video_list:
                count += 1
                print('\r', '合并进度:%d/%d' % (count, total), end='', flush=True)
                # ROBUSTNESS: the original leaked one file handle per fragment.
                with open(os.path.join(ts_path, video), 'rb') as fragment:
                    out.write(fragment.read())
        print('\n视频合并完成!')

    def delete_ts(self):
        """Remove every cached ts fragment from ts_path."""
        for each in os.listdir(ts_path):
            os.remove(os.path.join(ts_path, each))
        print('缓存ts视频碎片文件删除完成!')

    def creat_pool(self, url_list):
        """Download url_list with a gevent pool, mcn greenlets per batch."""
        pool = Pool(mcn)  # one pool suffices; map() blocks until a batch is done
        for i in range(0, len(url_list), mcn):
            pool.map(self.download, url_list[i:i + mcn])

    def main(self):
        """Driver: build the job list, download with retries, merge, clean up."""
        url_list = self.get_download_list()
        while url_list:
            self.creat_pool(url_list)
            url_list, self.failed_urls = self.failed_urls, []
            if url_list:  # retry everything that failed, after a short pause
                print('\n失败链接:', url_list)
                time.sleep(1.5)  # needs `import time` (missing in the original file header)
        print('\n所有视频文件下载完成!')
        print('开始合并视频......')
        self.merge_video()
        self.delete_ts()

    def check_path_is_exist(self, path):
        """Create *path* (including missing parents) if it does not exist."""
        if not os.path.exists(path):
            # ROBUSTNESS: makedirs also creates absent parent directories
            # (os.mkdir raises when e.g. D:\python itself is missing).
            os.makedirs(path)
if __name__ == '__main__':
    # --- configuration -----------------------------------------------------
    m3u8_url = 'https://v10.dious.cc/20210726/S5EqrJub/1000kb/hls/index.m3u8'
    ts_path = 'D:\\python\\ts'        # directory for the raw ts fragments
    save_path = 'D:\\python\\电影'     # directory for the merged video
    video_name = '你是我的荣耀.ts'      # file name of the merged video
    mcn = 100                          # max concurrency, tune as needed

    # --- run ---------------------------------------------------------------
    downloader = m3u8_downloader()
    downloader.check_path_is_exist(ts_path)
    downloader.check_path_is_exist(save_path)
    downloader.main()
上面的分析基于这个视频网站:旋风视频
以电视剧“你是我的荣耀”某一集为例,ts视频流合并之前,如下图所示,有几百个视频片段
通过合并就可以得到没有广告的完整一集了
多进程版
import gevent
from gevent import monkey;monkey.patch_socket();monkey.patch_ssl()
from multiprocessing import Pool
from multiprocessing import Manager
import requests
import time
from collections import deque
from functools import wraps
from multiprocessing import cpu_count
from Crypto.Cipher import AES
import os
import random
class m3u8_downloader(object):
    """Multiprocess + gevent m3u8 downloader.

    Splits the segment list across a process pool; each worker process drives
    its share with gevent greenlets.  AES-128/CBC streams are decrypted on the
    fly.  Uses the module-level global ``m3u8_url`` from the __main__ section.

    :param ts_path:    directory where the raw ts fragments are cached
    :param save_path:  directory where the merged video is written
    :param video_name: file name of the merged video
    """

    def __init__(self, ts_path, save_path, video_name):
        self.headers = {
            'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36',
        }
        # UA pool; one entry is picked at random for every request.
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.84',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3875.400 QQBrowser/10.8.4492.400',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
            'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
            'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
            'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'
        ]
        self.key = None                  # AES key text; None => stream not encrypted
        self.iv = b'0000000000000000'    # default IV: 16 ASCII zeros
        # BUG FIX (see download/get_download_list): no cipher object is stored
        # on self any more — pycryptodome ciphers are not picklable, which made
        # Pool.apply_async(self.creat_pool, ...) fail for encrypted streams.
        self.cipher = None
        self.failed_urls = Manager().list()   # cross-process shared retry list
        self.num = 0                          # total number of segments
        self.cur = Manager().Value('i', 0)    # shared download counter (note: += is not atomic across processes)
        self.ts_path = ts_path
        self.save_path = save_path
        self.video_name = video_name

    def download(self, url):
        """Fetch (and if needed decrypt) one segment.

        ``url`` is '<index>.ts@<absolute url>'; failed requests are queued in
        ``self.failed_urls`` so main() can retry them.
        """
        name, video_url = url.split('@')
        try:
            requests.packages.urllib3.disable_warnings()  # we request with verify=False
            self.headers['User-Agent'] = random.choice(self.user_agents)
            content = requests.get(video_url, stream=True, headers=self.headers, verify=False, timeout=5).content
        except Exception:
            self.failed_urls.append(url)
            return
        if self.key:
            try:
                # BUG FIX: a CBC cipher keeps chaining state between decrypt()
                # calls, so a shared cipher corrupts every segment after the
                # first (and could not be shared across processes anyway).
                cipher = AES.new(self.key.encode('utf-8'), AES.MODE_CBC, self.iv)
                content = cipher.decrypt(content)
            except Exception:
                return
        with open(os.path.join(self.ts_path, name), 'wb') as f:
            f.write(content)
        self.cur.value += 1
        print('\r', '下载进度:%d/%d' % (self.cur.value, self.num), end='', flush=True)

    def get_download_list(self):
        """Parse the m3u8 playlist and return ['<n>.ts@<url>', ...] jobs."""
        self.headers['User-Agent'] = random.choice(self.user_agents)
        response = requests.get(m3u8_url, headers=self.headers)
        playlist = response.text.split('\n')
        url_list = []
        # count renumbers the segments so they can be merged back in order;
        # flag (url prefix of the key link) was meant to filter ad segments.
        count, flag = 1, None
        split = m3u8_url.split("/")
        prefix = split[0] + "//" + split[2]  # scheme://host
        for each in playlist:
            each = each.strip()  # ROBUSTNESS: drop '\r' from CRLF playlists
            if 'EXT-X-MEDIA-SEQUENCE' in each:
                # No IV attribute in the playlist: use the media sequence
                # number, left-padded with '0' to 16 bytes (big-endian).
                sequence = each.split(':')[1]
                self.iv = bytes('0' * (16 - len(sequence)) + sequence, encoding='utf-8')
                print('iv值:', self.iv)
            if 'AES' in each and "key" in each and not self.key:
                key_url = each.split('"')[1]
                if len(key_url):
                    key_url = key_url if "https" in key_url else prefix + key_url
                    self.headers['User-Agent'] = random.choice(self.user_agents)
                    self.key = requests.get(key_url, headers=self.headers).text
                    flag = key_url.split('key')[0]
                    print('AES Key链接:', key_url)
                    print('该视频文件被加密,AES加密密钥:', self.key)
            # BUG FIX: only rewrite a *trailing* "jpg" (segments disguised as
            # images) — str.replace() would also mangle "jpg" inside the path.
            url = each[:-3] + "ts" if each.endswith("jpg") else each
            if url.endswith('ts'):
                # NOTE(review): the original "ad filter" had byte-identical
                # then/else branches (`flag in url` or not), so every segment
                # was kept.  That behavior is preserved here.
                url_list.append(str(count) + '.ts@' + (url if "https" in url else prefix + url))
                count += 1
        self.num = len(url_list)
        print('共获取视频文件数:', self.num)
        return url_list

    def merge_video(self):
        """Concatenate the numbered fragments into save_path/video_name."""
        video_list = os.listdir(self.ts_path)
        total = len(video_list)
        # '12.ts'[:-3] -> '12'; numeric sort restores playlist order.
        video_list = sorted(video_list, key=lambda x: int(x[:-3]))
        count = 0
        with open(os.path.join(self.save_path, self.video_name), 'wb+') as out:
            for video in video_list:
                count += 1
                print('\r', '合并进度:%d/%d' % (count, total), end='', flush=True)
                # ROBUSTNESS: the original leaked one file handle per fragment.
                with open(os.path.join(self.ts_path, video), 'rb') as fragment:
                    out.write(fragment.read())
        print('\n视频合并完成!')

    def delete_ts(self):
        """Remove every cached ts fragment from ts_path."""
        for each in os.listdir(self.ts_path):
            os.remove(os.path.join(self.ts_path, each))
        print('缓存ts视频碎片文件删除完成!')

    def creat_pool(self, *url_list):
        """Run inside a worker process: one greenlet per url."""
        spawns = [gevent.spawn(self.download, url) for url in url_list]
        gevent.joinall(spawns)

    def main(self):
        """Driver: fan segments out to a process pool, retry failures, merge."""
        url_list = self.get_download_list()
        # BUG FIX: cpu_count() // 2 is 0 on a single-core machine, which made
        # `num_urls // num_process_pools` raise ZeroDivisionError.
        num_process_pools = max(1, cpu_count() // 2)
        while url_list:
            num_urls = len(url_list)
            step = num_urls // num_process_pools
            process_pool = Pool(num_process_pools) if step > 1 else Pool()
            if num_urls <= num_process_pools:
                process_pool.apply_async(self.creat_pool, url_list)
            else:
                for i in range(0, num_urls, step):  # one slice per worker
                    process_pool.apply_async(self.creat_pool, url_list[i:i + step])
            process_pool.close()
            process_pool.join()
            # BUG FIX: the original built a brand-new Manager() every retry
            # round, leaking one manager server process per round.  Reuse the
            # existing shared list instead.
            url_list = list(self.failed_urls)
            self.failed_urls[:] = []
            if url_list:
                time.sleep(random.choice([1, 1.5, 1.8, 1.3]))
        print('\n所有视频文件下载完成!')
        print('开始合并视频......')
        self.merge_video()
        self.delete_ts()

    def check_path_is_exist(self):
        """Create ts_path and save_path (including parents) if missing."""
        if not os.path.exists(self.ts_path):
            # ROBUSTNESS: makedirs also creates absent parent directories.
            os.makedirs(self.ts_path)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
if __name__ == '__main__':
    # --- configuration -----------------------------------------------------
    ts_path = 'D:\\moives\\ts'
    save_path = 'D:\\moives\\电影'
    video_name = '星汉灿烂2.mp4'
    m3u8_url = "https://new.qqaku.com/20220705/0Vv86Cxv/1100kb/hls/index.m3u8"
    mcn = 100  # maximum concurrency

    # --- run ---------------------------------------------------------------
    downloader = m3u8_downloader(ts_path, save_path, video_name)
    downloader.check_path_is_exist()
    downloader.main()