万门大学视频爬取

import requests,sys,json,os,argparse
import time
requests.packages.urllib3.disable_warnings()
'''
###########################################################################
v1.1:
1、增加课件下载功能(需安装配置aria2c)
2、增加校验文件完整性功能
3、参数化

v1.0:
初步完成!
###########################################################################
'''
def getCoursesJson(coursesID):
	"""Load the course description and cache it as JSON inside the course folder.

	In resume mode (``args.cour_lect`` is set, format ``courseName_lectureId``)
	the description is read back from the JSON file cached by a previous run;
	otherwise it is fetched from the wanmen API.

	Args:
		coursesID: course id used in the API URL (ignored in resume mode).

	Returns:
		``(savePath, result)``: the course folder path and the parsed course JSON.

	Side effects:
		Sets ``args.coursesName`` (and ``args.lectureID`` in resume mode), creates
		the course folder, and (re)writes ``<course folder>/<course name>.json``.
		Exits the process when the API answer is not JSON.
	"""
	if args.cour_lect:
		# Split on the LAST underscore so course names that themselves contain
		# "_" are still parsed correctly.
		args.coursesName,args.lectureID=args.cour_lect.rsplit('_',1)
		# The cache is written to <savePath>/<course>/<course>.json (see the end
		# of this function), so that is where it has to be read from.
		cachedJson=os.path.join(args.savePath,args.coursesName,f'{args.coursesName}.json')
		if not os.path.exists(cachedJson):
			# Fall back to the location this function used to read from.
			cachedJson=os.path.join(args.savePath,f'{args.coursesName}.json')
		with open(cachedJson,'r',encoding='UTF-8') as f:
			result=json.load(f)
	else:
		resp=requests.get(f'https://api.wanmen.org/4.0/content/courses/{coursesID}',headers=args.head,verify=False)
		try:
			result=resp.json()
		except ValueError:
			# Response.json() signals a non-JSON body with a ValueError subclass.
			print('getCoursesJson 获取CoursesJson失败!,返回的不是json数据:',resp.text)
			sys.exit(1)
	args.coursesName=result["name"]
	savePath=os.path.join(args.savePath,args.coursesName)
	myMkdir(savePath)
	with open(f'{savePath}/{result["name"]}.json','w',encoding='UTF-8') as f:
		json.dump(result,f,ensure_ascii=False)
	return savePath,result

def myMkdir(name):
	"""Create directory ``name`` (including missing parents) if it does not exist.

	Failures are reported on stdout instead of being raised, so a download run
	is not aborted by a single directory problem.

	Args:
		name: path of the directory to create.
	"""
	try:
		# makedirs + exist_ok handles nested paths and avoids the
		# check-then-create race of "if not exists: mkdir".
		os.makedirs(name,exist_ok=True)
	except OSError as e:
		print(f'创建目录 {name} 失败:',e)

def getM3u8UrlDic(lectureID):
	"""Fetch one section's name and its HLS playlist URLs from the wanmen API.

	Args:
		lectureID: the ``id`` of an entry in a lecture's ``children`` list.

	Returns:
		``(name, hls)`` where ``hls`` is ``result['video']['hls']``; callers index
		it by quality key (mobileLow, mobileMid, pcLow, pcMid, pcHigh).

	Exits the process (status 1) when the answer is not JSON or lacks the
	expected fields.
	"""
	resp=requests.get(f'https://api.wanmen.org/4.0/content/lectures/{lectureID}',headers=args.head,verify=False)
	try:
		result=resp.json()
	except ValueError:
		# Response.json() signals a non-JSON body with a ValueError subclass.
		print('getM3u8UrlDic 获取M3u8Url失败!,返回的不是json数据:',resp.text)
		sys.exit(1)
	try:
		return result['name'],result['video']['hls']
	except (KeyError,TypeError):
		# TypeError covers a JSON answer whose 'video' is null or that is not
		# an object at all.
		print('getM3u8UrlDic 返回json结果有误:',result)
		sys.exit(1)

def check(savePath,coursesJson=0):
	"""Report every expected video or document that is missing under ``savePath``.

	File names found in the sub-directories of ``savePath`` are compared with
	the names derived from the course JSON; each missing file is printed
	together with its download URL.

	Args:
		savePath: the course folder to verify.
		coursesJson: parsed course JSON; when falsy it is loaded from
			``<savePath>/<args.coursesName>.json``.
	"""
	from itertools import islice
	print('正在校验文件完整性……')
	# A set gives O(1) membership tests. islice skips the first os.walk entry,
	# i.e. the files located directly in savePath itself.
	existing=set()
	for _path,_folders,files in islice(os.walk(savePath),1,None):
		existing.update(files)
	if not coursesJson:
		with open(f'{savePath}/{args.coursesName}.json','r',encoding='UTF-8') as f:
			coursesJson=json.load(f)
	for i,lecture in enumerate(coursesJson['lectures'],1):
		for j,children in enumerate(lecture['children'],1):
			videoName=f"{i}.{j} {children['name']}.mp4"
			if videoName not in existing:
				# getM3u8UrlDic returns (name, urls-by-quality); the URL mapping
				# is the second element, so unpack before indexing by quality.
				_name,urlDic=getM3u8UrlDic(children['id'])
				print('没有:',videoName,urlDic.get(args.type))
	for document in coursesJson['documents']:
		fileName=f"{document['name']}.{document['ext']}"
		if fileName not in existing:
			print('没有:',fileName,document['url'])

def _runCommand(argv):
	"""Run an external program given as an argument list; return its exit status.

	Passing a list (no shell) means names and URLs containing spaces, quotes or
	shell metacharacters cannot break or alter the command. Returns None when
	the program could not be started.
	"""
	import subprocess
	try:
		return subprocess.run(argv).returncode
	except OSError as e:
		print(f'无法执行 {argv[0]}:',e)
		return None

def main(coursesID):
	"""Download every video and document of a course, then verify the result.

	Videos are fetched with ffmpeg into ``<course>/第N讲 <lecture name>/`` and
	documents with aria2c into ``<course>/课件/``. When ``args.cour_lect``
	(``courseName_lectureId``) is set, all sections before the given section id
	are skipped and downloading continues from that section onward.

	Args:
		coursesID: course id passed on to ``getCoursesJson``.
	"""
	savePath,coursesJson=getCoursesJson(coursesID)
	# Parse the resume marker once; it is cleared when the section is reached
	# so that every later section is downloaded as well.
	resumeID=None
	if args.cour_lect:
		resumeID=args.cour_lect.rsplit('_',1)[-1]
	for i,lecture in enumerate(coursesJson['lectures'],1):
		print(i,lecture['name'])
		videoPath=f"{savePath}/第{i}讲 {lecture['name']}"
		myMkdir(videoPath)
		for j,children in enumerate(lecture['children'],1):
			if resumeID:
				if children['id']!=resumeID:
					continue
				resumeID=None
				args.lectureID=False
			videoName,urlDic=getM3u8UrlDic(children['id'])
			print('\n','-'*20,f'{i}-{j}',children['name'],'-'*20)
			_runCommand([
				'ffmpeg','-threads',str(args.threads),
				'-i',urlDic[args.type],
				'-c','copy','-y','-bsf:a','aac_adtstoasc',
				f'{videoPath}/{i}.{j} {videoName}.mp4',
				'-v','repeat+level+info',
			])
	for document in coursesJson['documents']:
		fileName=f"{document['name']}.{document['ext']}"
		print('-'*20,'下载第',document['order'],'个课件:',fileName,'-'*20)
		_runCommand([
			'aria2c','-s',str(args.threads),
			document['url'],
			'-d',f'{savePath}/课件',
			'-o',fileName,
		])
	check(savePath,coursesJson)

def getParserWM():
	"""Build the command-line parser and return the parsed arguments.

	Options: ``-i`` course id (required), ``-t`` quality, ``-r`` thread count,
	``-s`` save path, ``-c`` resume marker. Optional values are left as None
	when not given; no defaults are applied here.
	"""
	cli=argparse.ArgumentParser(
		description='程序功能:\n    1、爬取万门大学课程;',
		formatter_class=argparse.RawTextHelpFormatter,
	)
	# (flag, destination attribute, help text, required)
	optionTable=(
		('-i','coursesID',"课程id,从课程详情中的URL中获取",True),
		('-t','type',"选择清晰度(默认 pcHigh ):\n    mobileLow : 手机版低清\n    mobileMid : 手机版中清\n    pcLow : 电脑版低清\n    pcMid : 电脑版中清\n    pcHigh : 电脑版高清",False),
		('-r','threads',"指定线程(默认 16)",False),
		('-s','savePath',"指定保存路径(默认保存在脚本所在文件夹)",False),
		('-c','cour_lect',"从指定课程的小节开始,小节ID为json文件中children中的ID,格式:课程名称_小节id",False),
	)
	for flag,target,helpText,isRequired in optionTable:
		cli.add_argument(flag,dest=target,help=helpText,required=isRequired)
	return cli.parse_args()

# ffmpeg -threads 10 -i "https://media.wanmen.org/a81d11d0-f1bb-4469-b630-f1dd674081bb_pc_high.m3u8?sign=bd93952471b6413f19874fc805532bff&t=5cb14ffb&r=e8d52086889c1d207f2ce8e000f913ea" -c copy -y -bsf:a aac_adtstoasc "人工智能、大数据与复杂系统一月特训班/第1讲 复杂系统/1.4 生活实例与本章答疑.mp4" -v repeat+info
# aria2c https://docs.wanmen.org/1ce13214b7c54dc4a7e07b7f15205de0.pdf -d 教育心理学特训班/课件 -o 教育心理学14讲.pdf
if __name__ == '__main__':
	# Script entry point: parse the CLI, fill in defaults, start the download.
	args=getParserWM()
	# HTTP headers sent with every API request.
	args.head={
		'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
		'Connection':'close',
	}
	# Defaults for options that were not given on the command line.
	args.type=args.type or 'pcHigh'
	args.threads=int(args.threads or 16)
	args.savePath=args.savePath or os.getcwd()
	args.cour_lect=args.cour_lect or False
	# Example course ids: 5c7742fc7f59616cea0ec672, 593e086f206e46163b6dd5c8
	main(args.coursesID)

效果图参考

  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值