小学网课,想把所有视频都下载下来留着以后学习。可以人工下载,下了几个之后发现了一些规律,于是用F12开发人员工具分析了页面代码,果断改用python批量下载。由于后台文件命名不规范,有几个还是得人工查询id后再单独下载。因为python刚入门,再加上没必要做成全自动,所以只能算是半自动下载,需要自己先把各年级的html文件下载下来。
# -*- coding: utf-8 -*-
"""
Created on Wed May 13 14:39:55 2020
@author: xxiew
"""
import os
import re

import requests
from bs4 import BeautifulSoup
# Collect the id and title of every matching lesson and download each one.
def GetIdAndName():
    """Download every lesson in ``grade3.html`` whose markup mentions '湘少'.

    The saved page is parsed for ``<div class="sub-item">`` elements. For each
    one whose serialized HTML contains '湘少', the div's ``id`` attribute and
    the string of its first ``<span>`` are printed and handed to
    ``getPlayerWeb``, which performs the actual download.

    Entries lacking an ``id`` attribute or a ``<span>`` are reported and
    skipped so that one malformed entry does not abort the whole batch.
    """
    with open('grade3.html', encoding='UTF-8') as read_file:
        soup = BeautifulSoup(read_file.read(), 'html.parser')

    for item in soup.find_all('div', class_='sub-item'):
        if '湘少' not in str(item):
            continue
        idd = item.get('id')
        # Read the title from the matched element itself; searching the whole
        # document again by id is redundant and could land on another element
        # if an id happens to be duplicated.
        spans = item.select('span')
        if idd is None or not spans:
            print('skip sub-item without id or span:', idd)
            continue
        print(idd)
        # .string is None when the span does not hold exactly one string.
        name = spans[0].string
        print(name)
        getPlayerWeb(idd, name)
# Download one video. The vid used by the download endpoint is not the parent
# page's id; it is a new value found in the player page's JavaScript, so the
# player page has to be fetched and scraped first.
def getPlayerWeb(idd, name, out_dir='./grade3/', timeout=30):
    """Fetch the player page for ``idd`` and save its video as an .mp4 file.

    Args:
        idd: Lesson id; it is appended to the player-page URL.
        name: Lesson title used as the file name (surrounding whitespace is
            stripped). If it is None or blank, ``idd`` is used instead.
        out_dir: Directory the file is written to; created when missing.
        timeout: Timeout in seconds passed to both HTTP requests.

    Raises:
        requests.HTTPError: If either request returns an error status.
        ValueError: If no ``<script>`` in the player page contains a
            ``var url = '...';`` assignment.
    """
    url = 'http://yun.***jiao.com/Home/Wk/zbxq/id/'
    ret = requests.get(url + str(idd), timeout=timeout)
    ret.raise_for_status()

    soup = BeautifulSoup(ret.text, "html.parser")
    pattern = re.compile(r"var url = '(.*?)';", re.MULTILINE | re.DOTALL)
    # `string=` is the current name of the deprecated `text=` filter; it is
    # matched against the tag's .string, so .string is set on any hit.
    script = soup.find("script", string=pattern)
    if script is None:
        raise ValueError(
            "no script with \"var url = '...'\" found for id %s" % idd)
    fullurl = pattern.search(script.string).group(1)
    # Everything after the first '=' is the vid (the whole string if no '=').
    vid = fullurl.split('=', 1)[-1]

    urld = 'http://218.***.***.23:8181/index.php?a=downF&vid=' + vid

    # Build a usable file name: fall back to the id when there is no title and
    # replace characters that are not allowed in file names on common systems.
    base = (name or '').strip() or str(idd)
    filename = re.sub(r'[\\/:*?"<>|]', '_', base) + '.mp4'
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, filename)

    # Stream the body to disk in chunks instead of holding the whole video in
    # memory, and check the status first so an error page is never saved as
    # a video.
    with requests.get(urld, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        with open(target, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    print('done')
# Download a single lesson by id, for entries whose uploaded file names are
# too irregular for the batch download.
def getById(idd):
    """Look up lesson ``idd`` in ``grade3.html`` and download its video.

    The element whose ``id`` attribute equals ``idd`` is located in the saved
    page, the string of its first ``<span>`` is taken as the title, and both
    are printed and passed to ``getPlayerWeb``.

    Args:
        idd: Lesson id (int or str) as it appears in the page's ``id``
            attributes.

    Raises:
        ValueError: If no element has that id, or the element has no
            ``<span>`` to take the title from.
    """
    with open('grade3.html', encoding='UTF-8') as read_file:
        soup = BeautifulSoup(read_file.read(), 'html.parser')

    print(idd)
    # Attribute values are strings, so compare against the string form.
    item = soup.find(id=str(idd))
    if item is None:
        raise ValueError('id %s not found in grade3.html' % idd)
    spans = item.select('span')
    if not spans:
        raise ValueError('element with id %s has no <span> title' % idd)
    # .string is None when the span does not hold exactly one string.
    name = spans[0].string
    print(name)
    getPlayerWeb(idd, name)
# Script driver: fetch individually the lessons that need a manual id lookup.
# Ids 478 and 623 are left disabled here, as are the two alternatives:
#   GetIdAndName()                                   - batch download
#   getPlayerWeb(1794, "湘少版Unit7I'mnotafraid!(第")  - direct download by id and title
for lesson_id in (283, 143):
    getById(lesson_id)