通过网址用python抓取某些网站的m3u8视频

fsyysf

已于 2024-09-27 14:42:05 修改

阅读量133

点赞数

文章标签：音视频 python

于 2024-09-27 14:40:58 首次发布

本文链接：https://blog.csdn.net/fsyysf/article/details/142591790

版权

之前我是通过抓流的方式获取某些网站播放视频的 m3u8 地址，再把其中的视频分片下载下来合并，总觉得不方便。于是尝试直接从网页源码中解析 m3u8 的地址；后来发现 m3u8 还存在“套娃”（第一层 m3u8 里指向的是另一个 m3u8），需要对 m3u8 进行二次解析，最后成功把视频下载了下来。

#m3u8.py
import os
import re
import subprocess
import sys
from urllib.parse import urljoin, urlparse

import requests

def get_m3u8_url(url):
    """Fetch a web page and return the first m3u8 URL found in its source.

    The page text, with JSON-style escaped slashes (a backslash followed by
    a slash) turned back into plain slashes, is also written to ``web.txt``
    in the current working directory.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        The first ``http(s)://...m3u8`` link found in the page, or ``None``
        when the server does not answer with HTTP 200.

    Raises:
        IndexError: If the page was fetched but contains no m3u8 link.
    """
    response = requests.get(url)
    web_file = 'web.txt'
    # The context manager guarantees web.txt is flushed and closed on every
    # path, including the early return below.
    with open(web_file, 'w', encoding='utf-8') as file:
        if response.status_code != 200:
            return None
        # Undo "\/" escaping so the URL regex can match across the slashes.
        content = response.text.replace('\\/', '/')
        file.write(content)
    # The dot before "m3u8" is escaped so only a real ".m3u8" suffix matches.
    pattern = r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\.m3u8)'
    m3u8_links = re.findall(pattern, content)
    if not m3u8_links:
        raise IndexError(f'no m3u8 link found in {url}')
    return m3u8_links[0]

def get_m3u8_url2(url):
    """Resolve a nested m3u8 playlist to the playlist it points at.

    Downloads ``url`` and looks for the first non-comment entry whose path
    has the ``m3u8`` extension.

    Args:
        url: Address of an m3u8 playlist.

    Returns:
        The nested playlist URL resolved against ``url``, or ``url`` itself
        when the playlist contains no nested m3u8 entry.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
        'Referer': 'https://www.baidu.com/'
    }
    url_file = requests.get(url, headers=headers).text
    # splitlines() + strip() copes with CRLF playlists; a trailing "\r" would
    # otherwise defeat the extension check below.
    for line in url_file.splitlines():
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        # Inspect the path only, so "index.m3u8?token=..." is still recognised.
        if urlparse(line).path.split('.')[-1] == 'm3u8':
            # urljoin handles relative, root-relative and absolute entries.
            return urljoin(url, line)
    return url

# Script body: resolve the playlist URL from the command line, download every
# segment into ./tslib, write the ffmpeg concat list to ts.txt and merge the
# segments into output.mp4.
# NOTE: segments are saved exactly as served; nothing here decrypts them.
if len(sys.argv) <= 1:
    print('请提供url参数。')
    sys.exit(1)

url = sys.argv[1]
# Classify by the extension of the URL path so a query string cannot confuse
# the check.
url_type = urlparse(url).path.split('.')[-1]
if url_type in ('html', 'htm'):
    url = get_m3u8_url(url)
    if not url:
        # get_m3u8_url returns None when the page request is not HTTP 200.
        print('未能从网页中获取m3u8地址')
        sys.exit(1)
    url = get_m3u8_url2(url)
    print (url)
elif url_type == 'm3u8':
    # A playlist passed directly may itself be nested; this returns the URL
    # unchanged when it is not.
    url = get_m3u8_url2(url)
else:
    print('url有误，非m3u8:', url_type)
    sys.exit(1)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0',
    'Referer': 'https://www.baidu.com/'
}

m3u8_file = requests.get(url, headers=headers).text

# Base address that relative segment names are resolved against.
ts_preurl = urljoin(url, '.')
print (f'pre:{ts_preurl}')

# Every non-empty, non-comment playlist line is a segment address. urljoin
# also accepts absolute and root-relative entries, and strip() removes the
# "\r" left over from CRLF playlists.
ts_url_list = [
    urljoin(url, entry.strip())
    for entry in m3u8_file.splitlines()
    if entry.strip() and not entry.strip().startswith('#')
]

# Create the segment directory up front so the first write cannot fail.
os.makedirs('tslib', exist_ok=True)

input_file = 'ts.txt'
total = len(ts_url_list)
# ts.txt must be closed (and therefore flushed) before ffmpeg reads it.
with open(input_file, 'w', encoding='utf-8') as file:
    for i, ts_url in enumerate(ts_url_list, start=1):
        # Local file name: last path component, without any query string.
        ts_name = urlparse(ts_url).path.split('/')[-1]
        ts_path = f'./tslib/{ts_name}'

        # Segments already on disk are skipped, so an interrupted run resumes.
        if not os.path.exists(ts_path):
            ts_response = requests.get(url=ts_url, headers=headers)
            # Stop on HTTP errors rather than saving an error page as video.
            ts_response.raise_for_status()
            with open(ts_path, 'wb') as ts_out:
                ts_out.write(ts_response.content)

        # One entry per segment in ffmpeg's concat-list format.
        file.write(f"file '{ts_path}'\n")
        print(f'视频下载成功 {i}/{total}：{ts_name}')

# Merge the segments. Equivalent manual command:
#   ffmpeg -f concat -safe 0 -i ts.txt -c copy out.mp4
output = 'output.mp4'
ffmpeg_path = r"C:\ffmpeg\ffmpeg.exe"
# An argument list (no shell) avoids quoting problems in paths.
cmd = [ffmpeg_path, '-f', 'concat', '-safe', '0', '-i', input_file, '-c', 'copy', output]
print(' '.join(cmd))
if os.path.exists(output):
    os.remove(output)
try:
    # run() blocks until ffmpeg exits, so the result below reflects reality.
    result = subprocess.run(cmd)
except FileNotFoundError:
    print('未找到ffmpeg，请检查ffmpeg_path:', ffmpeg_path)
    sys.exit(1)
if result.returncode == 0:
    print('合并ts文件成功')
else:
    print('合并ts文件失败，ffmpeg返回码:', result.returncode)
    sys.exit(result.returncode)

运行命令形如

python m3u8.py http://.....html

代码中的 ffmpeg_path 请根据本机 ffmpeg 的实际安装路径自行设置；运行目录下需要有一个名为 tslib 的子文件夹，用于存放临时的 ts 文件。视频合并成功后，这些 ts 文件可自行删除。