某教育网站视频抓取
直接上代码
import requests
import json
import time
import urllib3
from Crypto.Cipher import AES
import os
import re
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import binascii
import threadpool
# HTTP headers sent with every request this script makes.
headers = {
    # Fill in manually before running (the original note says "obtain by hand").
    "Cookie": "",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36",
}

# Root directory under which merge() writes the downloaded files.
path = "./source"
def videoList():
    """POST the hard-coded course id to findVideoList and return the decoded JSON.

    The request carries the module-level ``headers`` and skips TLS
    certificate verification (``verify=False``).
    """
    endpoint = "https://uc.tmooc.cn/video/findVideoList"
    form = {"courseId": "2d92cb7a25264928b1f1408dc3842663"}
    reply = requests.post(endpoint, data=form, headers=headers, verify=False)
    return json.loads(reply.text)
def checkVideo(item):
    """Query checkVideo for one list entry and return the decoded JSON reply.

    ``item`` must provide the keys ``"stageId"`` and ``"id"``; both are
    forwarded as query parameters together with the hard-coded course id.
    """
    query = {
        "courseId": "2d92cb7a25264928b1f1408dc3842663",
        "stageId": item["stageId"],
        "videoId": item["id"],
        # Time-derived value sent as "_".
        # NOTE(review): presumably a cache-buster; the multiplier is 3, not
        # the usual 1000 for milliseconds -- confirm whether that is intended.
        "_": str(int(time.time() * 3)),
    }
    reply = requests.get(
        "https://uc.tmooc.cn/video/checkVideo",
        params=query,
        headers=headers,
        verify=False,
    )
    return json.loads(reply.text)
def getvideofile(item):
    """Fetch the getvideofile JSONP reply for one video and return its JSON payload.

    ``item`` is expected to carry an ``"obj"`` mapping (with ``"guid"`` and
    optionally ``"lookBackGuid"``) and a ``"name"`` entry, as read below.

    Returns the object decoded from the JSON wrapped inside the JSONP
    callback. Raises ``ValueError`` (``json.JSONDecodeError`` is a subclass)
    when the reply is not a parenthesised JSON document.
    """
    url = "https://p.bokecc.com/servlet/getvideofile"
    callback = "cc_jsonp_callback_453816"

    obj = item["obj"]
    # Prefer "lookBackGuid" when the reply carries one. The original looked
    # up the empty key "", so this branch could never be taken.
    guid = obj.get("lookBackGuid")
    if guid is None:
        guid = obj["guid"]

    params = {
        "vid": guid,
        "siteid": "0DD1F081022C163E",
        "width": "100%",
        "useragent": "other",
        "version": "20140214",
        "hlssupport": "1",
        "vc": item["name"],
        "mediatype": "undefined",
        "divid": "cc_video_" + guid + "_3692846",
        "callback": callback,
        "r": "7263265.342129019",
    }
    response = requests.get(url, params=params, headers=headers, verify=False)

    # Unwrap `callback( ... )` by locating the outermost parentheses instead
    # of slicing by fixed offsets, so a trailing ";" or newline is tolerated.
    text = response.text
    payload = text[text.index("(") + 1:text.rindex(")")]
    return json.loads(payload)
def get_key(uri):
    """Download ``uri`` and return the raw response body as bytes.

    merge() passes the result to ``AES.new`` as the decryption key.
    """
    return requests.get(uri, headers=headers, verify=False).content
def get_mp4(uri):
    """Download one media segment from ``uri`` and return its raw bytes."""
    segment = requests.get(uri, headers=headers, verify=False)
    return segment.content
def merge(file, playurl, des):
    """Download every segment listed in an m3u8 playlist into one output file.

    Args:
        file: Text of the m3u8 playlist.
        playurl: URL the playlist was fetched from; segment names are
            resolved against its directory (query string dropped).
        des: Output file path relative to the module-level ``path``.

    Segments that follow an ``#EXT-X-KEY`` line carrying a ``URI`` are
    decrypted with AES-CBC using the downloaded key and the line's ``IV``;
    segments with no key in effect are written unchanged.

    Raises:
        ValueError: an ``#EXT-X-KEY`` line has a ``URI`` but no
            ``IV=0x...`` attribute.
    """
    des_dir = os.path.join(path, des)
    base_dir = os.path.dirname(des_dir)
    # exist_ok avoids the check-then-create race when several worker threads
    # write into the same directory at once.
    os.makedirs(base_dir, exist_ok=True)

    # Directory part of the playlist URL, without its query string.
    base_url = playurl.split('?')[0]
    base_url = base_url[:base_url.rfind("/")]

    key = None
    iv = None
    # `with` guarantees the file is closed even if a download or decrypt fails.
    with open(des_dir, "wb") as f:
        for raw_line in file.split('\n'):
            # Strip so CRLF playlists do not leak "\r" into URIs or the IV.
            item = raw_line.strip()
            if not item:
                continue
            if item.startswith("#EXT-X-KEY"):
                uri_match = re.search(r'URI="(.*?)"', item)
                if uri_match is None:
                    # No key URI (e.g. METHOD=NONE): following segments are
                    # treated as unencrypted.
                    key = None
                    iv = None
                    continue
                iv_match = re.search(r"IV=0[xX]([0-9A-Fa-f]+)", item)
                if iv_match is None:
                    raise ValueError("EXT-X-KEY line has no IV attribute: " + item)
                iv = binascii.unhexlify(iv_match.group(1))
                key = get_key(uri_match.group(1))
            elif item.startswith("#"):
                # Any other tag or comment line carries no media.
                continue
            else:
                uri = base_url + '/' + item
                print(uri)
                content = get_mp4(uri)
                if key is not None:
                    # HLS restarts CBC at every segment boundary, so each
                    # segment needs a fresh cipher; reusing one object would
                    # chain state and corrupt each later segment's first block.
                    content = AES.new(key, AES.MODE_CBC, iv).decrypt(content)
                f.write(content)
def get_m3u8(item, des):
    """Fetch the playlist of the first entry in ``item["copies"]`` and merge it.

    The playlist text, its URL and ``des`` are handed to merge(), which
    writes the output file.
    """
    first_copy = item["copies"][0]
    playurl = first_copy["playurl"]
    playlist_text = requests.get(playurl, headers=headers, verify=False).text
    merge(playlist_text, playurl, des)
def downloads():
    """Build the list of download jobs for every video in the course listing.

    Walks the two-level ``"list"`` structure returned by videoList(), prints
    each destination path, and resolves each entry through checkVideo() and
    getvideofile().

    Returns a list of ``(None, {"item": ..., "des": ...})`` tuples, which
    run() hands to ``threadpool.makeRequests``.
    """
    jobs = []
    listing = videoList()
    for group_no, group in enumerate(listing["list"], start=1):
        group_dir = str(group_no) + "." + group["name"]
        for entry_no, entry in enumerate(group["list"], start=1):
            file_name = str(entry_no) + "." + entry["name"] + ".mp4"
            des = os.path.join(group_dir, file_name)
            print(des)
            videofile = getvideofile(checkVideo(entry))
            jobs.append((None, {"item": videofile, "des": des}))
    return jobs
def run(target_list):
    """Run get_m3u8 for every job in ``target_list`` on a pool of 10 threads.

    Prints "start:" before queuing and "end!" once the pool has finished
    waiting on the queued jobs.
    """
    print("start:")
    workers = threadpool.ThreadPool(10)
    # Local is named `jobs` so it does not shadow the `requests` module.
    jobs = threadpool.makeRequests(get_m3u8, target_list)
    for job in jobs:
        workers.putRequest(job)
    workers.wait()
    print("end!")
# Script entry: collect every download job first, then run them on the pool.
items = downloads()
run(target_list=items)
运行中
运行结果
有问题+v zp953362984