Python对流媒体数据的爬取
废话不多说,先上代码:
import requests
import time
import os
from tkinter import *
# Build the main window with one labelled input row per download parameter.
root = Tk()
root.title(".m3u8视频的爬取与合成!")
root.geometry("400x200+500+300")

# Prompt texts in grid-row order; every row is a Label (column 0) plus an Entry (column 1).
_prompts = (
    "请输入.m3u8文件的路径:",
    "请输入base_url:",
    "请输入.ts文件的存储路径:",
    "请输入组合好的文件的存放位置:",
    "请输入组合好的视频文件的文件名:",
    "请输入.ts文件开始下载的序号:",
)
_fields = []
for _row, _prompt in enumerate(_prompts):
    lb = Label(root, text=_prompt)
    lb.grid(row=_row, column=0)
    _field = Entry(root)
    _field.grid(row=_row, column=1)
    _fields.append(_field)

# Expose the entries under the names the rest of the script reads.
entry, entry1, entry2, entry3, entry4, entry5 = _fields
# HTTP request headers; a browser-like User-Agent guards against anti-crawler checks.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36'
    ),
}
# Extract the segment names from the .m3u8 file and build the .ts download links.
def get_ts_urls():
    """Return the download URLs of the .ts segments listed in the playlist.

    The playlist path is read from ``entry`` and the URL prefix from
    ``entry1``. Every playlist line that starts with ``seg`` is taken as a
    segment name and appended to the prefix. The number of segments found is
    printed.

    Returns:
        list[str]: segment URLs in playlist order.
    """
    base_url = entry1.get()
    # HLS playlists are UTF-8, so do not rely on the platform default encoding.
    # errors="replace" keeps an odd byte in an unrelated line from aborting
    # the whole run.
    with open(entry.get(), "r", encoding="utf-8", errors="replace") as file:
        # strip() also drops trailing blanks or a stray "\r" that would
        # otherwise end up inside the URL.
        urls = [base_url + line.strip() for line in file if line.startswith('seg')]
    print(".ts文件总共有" + str(len(urls)) + "个")
    print("\n")
    print("开始下载.ts文件…………")
    return urls
# Download the .ts file behind every link.
def get_ts_files(urls, timeout=30):
    """Download the segments in ``urls`` into the directory given by ``entry2``.

    Downloading starts at the zero-based index typed into ``entry5`` (an
    empty field means 0). The segment at index ``i`` is stored as ``<i>.ts``,
    while progress messages count from 1. A segment whose response status is
    not 200 is reported and skipped; the remaining segments are still
    downloaded.

    Args:
        urls: sequence of segment URLs.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang forever.
    """
    start_text = entry5.get().strip()
    first = int(start_text) if start_text else 0
    target_dir = entry2.get()
    chunk_size = 1024
    failed = []
    for index in range(first, len(urls)):
        print('开始下载第' + str(index + 1) + '个.ts文件')
        start = time.time()
        size = 0
        # The context manager releases the streamed connection even on errors.
        with requests.get(urls[index], stream=True, headers=headers,
                          timeout=timeout) as response:
            if response.status_code == 200:
                # The header is missing for chunked responses; 0 means unknown.
                content_size = int(response.headers.get('content-length', 0))
                if content_size:
                    print('[文件大小]:%0.2f MB' % (content_size / chunk_size / 1024))
                with open(os.path.join(target_dir, str(index) + '.ts'), 'wb') as f:
                    for data in response.iter_content(chunk_size=chunk_size):
                        f.write(data)
                        size = size + len(data)
                        # The progress bar needs a known total size.
                        if content_size:
                            print('\r' + '[下载进度]:%s%.2f%%' % (
                                '■' * int(size * 50 / content_size),
                                float(size / content_size * 100)), end='')
                end = time.time()
                print('\n' + '第' + str(index + 1) + '个.ts文件下载完成! 用时%.2f秒' % (end - start))
            else:
                failed.append(index + 1)
                print('第' + str(index + 1) + '个.ts文件下载失败! 状态码:%d' % response.status_code)
        # Pause between requests to stay gentle on the server.
        time.sleep(1)
    print('\n')
    if failed:
        print("有%d个.ts文件下载失败: %s" % (len(failed), failed))
    else:
        print("所有.ts文件都已下载完成!")
# Sort the .ts files numerically so the merged video keeps the correct order.
def get_sort_ts(path):
    """Return the numbered .ts files under ``path`` in ascending numeric order.

    Only files named ``<integer>.ts`` are returned; anything else in the
    directory is ignored instead of raising ``ValueError``. The tree is walked
    with ``os.walk``, and the files of each directory are sorted on their own.

    Args:
        path: directory that holds the downloaded segments.

    Returns:
        list[str]: full paths of the segment files.
    """
    file_list = []
    for folder, _dirs, files in os.walk(path):
        numbered = []
        for fn in files:
            if not fn.lower().endswith('.ts'):
                continue
            try:
                numbered.append((int(fn[:-3]), fn))
            except ValueError:
                # Not a numbered segment (e.g. "notes.ts"); leave it alone.
                continue
        # Sort on the number only: plain string order would put "10" before "2".
        numbered.sort(key=lambda pair: pair[0])
        file_list.extend(os.path.join(folder, fn) for _, fn in numbered)
    return file_list
# Concatenate all sorted .ts files into one video file.
def combine():
    """Merge the downloaded segments into ``<entry3>/<entry4>.mp4``.

    The segments come from ``get_sort_ts(entry2.get())`` and are appended in
    that order. The segment files and the .m3u8 file named in ``entry`` are
    deleted only after the merged file has been written completely, so a
    failure during the merge does not lose any downloaded data.
    """
    import shutil  # local import keeps this block self-contained

    file_list = get_sort_ts(entry2.get())
    file_path = os.path.join(entry3.get(), entry4.get() + '.mp4')
    with open(file_path, 'wb') as fw:
        for ts_path in file_list:
            # Stream each segment in chunks and close its handle promptly.
            with open(ts_path, 'rb') as fr:
                shutil.copyfileobj(fr, fw)
    for ts_path in file_list:
        os.remove(ts_path)  # remove the .ts files once the merge succeeded
    os.remove(entry.get())  # remove the .m3u8 file
    print("\n")
    print("合成视频完成!")
def wancheng():
    """Run the whole pipeline: collect the segment URLs, download them, merge them."""
    get_ts_files(get_ts_urls())
    combine()
# Confirm button: starts the download-and-merge pipeline when clicked.
btn = Button(
    root,
    text="确认下载",
    command=wancheng,
    fg='black',
    relief="raised",
    bd="9",
)
btn.grid(column=1, row=6)

# Hand control to the Tk event loop.
root.mainloop()
运行效果图:
详细分析:
.m3u8文件可以通过浏览器(360浏览器看不了)的Web开发者工具获取,如图:
base_url的获取:
运行代码后输入相关内容:
爬取的过程截图: