在慕课上打算一边记笔记,一边看视频,突然发现网站自带字幕(并不是所有课都带字幕),就想着怎么把字幕爬下来,作为笔记的底稿。
本着拿来主义的想法,去网上随便找了几个代码,发现都没有办法用
于是自己写了一个,与网上的那几个代码相比更加简短,而且有效。
# Example course page URL (the course number is the `tid` query parameter):
# https://www.icourse163.org/learn/BIT-1001870002?tid=1472922453#/learn/content?type=detail&id=1259485818
course_id = 1472922453  # put the course number `tid` from the site URL here

# Paste the browser cookie string here.
cookies = 'XXX'

# Paste the value of the "NTESSTUDYSI" field of the cookies here.
csrfKey = "XXXX"

# HTTP headers for the requests: a desktop browser user agent plus the cookie.
header = {}
header['user-agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
header['cookie'] = cookies
import requests
def get_Units(term_id, timeout=30):
    """Collect the ids and names of the type-1 units of one course term.

    Posts ``termId`` to the ``courseBean.getLastLearnedMocTermDto`` endpoint,
    using the module-level ``csrfKey`` and ``header``, then walks
    chapters -> lessons -> units of the returned ``mocTermDto``.

    Args:
        term_id: The course number (``tid`` in the course page URL), given as
            an ``int`` or as a numeric ``str``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(lessonsIds, contentIds, names)`` of three parallel lists
        holding, for every unit whose ``contentType`` is ``1``, the unit's
        ``id``, ``contentId`` and ``name``.
        NOTE(review): ``contentType == 1`` presumably marks video units,
        since the ids are later sent to the video-subtitle endpoint - confirm.

    Raises:
        ValueError: If ``term_id`` is a string that is not a valid integer.
        requests.RequestException: If the request fails or times out.
    """
    if isinstance(term_id, str):
        term_id = int(term_id)

    url = f'https://www.icourse163.org/web/j/courseBean.getLastLearnedMocTermDto.rpc?csrfKey={csrfKey}'
    data = {
        'termId': term_id,
    }
    # An explicit timeout keeps the script from hanging on a stalled connection.
    response = requests.post(url, headers=header, data=data, timeout=timeout)
    chapters = response.json()['result']['mocTermDto']['chapters']

    lessonsIds = []
    contentIds = []
    names = []
    for chapter in chapters:
        # A missing or null lesson/unit list is treated as empty.
        for lesson in chapter.get('lessons') or []:
            for unit in lesson.get('units') or []:
                if unit['contentType'] != 1:
                    continue
                contentIds.append(unit['contentId'])
                lessonsIds.append(unit['id'])
                names.append(unit['name'])
    return lessonsIds, contentIds, names
def get_Subtitle(lessonsId, contentId, timeout=30):
    """Download the subtitles of one unit and return them as a single string.

    Posts the two ids to the ``mocCourseBean.getVideoSubtitle`` endpoint,
    using the module-level ``csrfKey`` and ``header``, and concatenates the
    ``text`` of every sentence in ``result['mergedSentences']``.

    Args:
        lessonsId: The unit ``id``, sent as ``lessonUnitId``.
        contentId: The unit ``contentId``, sent as ``videoId``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        All sentence texts in response order, each followed by the
        full-width full stop "。". An empty string when the response carries
        no ``result`` or no ``mergedSentences``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://www.icourse163.org/mm-course/web/j/mocCourseBean.getVideoSubtitle.rpc?csrfKey={csrfKey}"
    # Send the ids as plain scalar form fields.
    data = {
        "lessonUnitId": lessonsId,
        "videoId": contentId,
    }
    # An explicit timeout keeps the script from hanging on a stalled connection.
    response = requests.post(url, headers=header, data=data, timeout=timeout)

    # Treat a null or missing result as "no subtitles" so one such unit does
    # not abort the whole download.
    result = response.json().get('result') or {}
    segs = result.get('mergedSentences') or []

    parts = []
    for seg in segs:
        for sub in seg['sentences']:
            parts.append(sub['text'] + "。")
    # Join once instead of growing a string with repeated concatenation.
    return "".join(parts)
if __name__ == '__main__':
    # Look up the units of the configured course.
    unit_ids, video_ids, titles = get_Units(course_id)

    # Fetch the subtitle text of each unit, in the order they were listed.
    transcripts = [
        get_Subtitle(unit_id, video_id)
        for unit_id, video_id in zip(unit_ids, video_ids)
    ]

    # One line with the unit name, then one line with its subtitle text.
    notes = ''.join(
        title + "\n" + transcript + "\n"
        for title, transcript in zip(titles, transcripts)
    )

    # Opened in append mode: an existing "<course_id>.txt" is extended.
    with open(f"{course_id}.txt", 'a', encoding="utf-8") as out:
        out.write(notes)