# Crawls the site one chapter at a time; just run this script directly.
from urllib import request
from bs4 import BeautifulSoup
import os
import shutil
import requests
# Site root; the href found on each lesson link is appended to it.
host = "https://www.jove.com"

# Chapter pages to crawl -- add further chapter links to this list.
chapters = [
    "https://www.jove.com/science-education-library/47/cell-structure-and-function",
    "https://www.jove.com/science-education-library/48/membranes-and-cellular-transport",
]

# Scan every chapter page and collect its lessons as {lesson title: lesson URL}.
lessons = {}
for i, chapter_url in enumerate(chapters):
    print(i)
    # Parse inside the `with` so the HTTP response is closed as soon as the
    # page has been read, instead of leaking one connection per chapter.
    with request.urlopen(chapter_url) as response:
        soup = BeautifulSoup(response, "html.parser")
    for les in soup.find_all("div", class_="lesson-title"):
        link = les.find("a")
        href = link.get("href") if link is not None else None
        if href is None:
            # A title block without a usable link cannot be crawled; skip it
            # rather than crash the whole scan.
            continue
        title = link.getText().strip()
        # Only titles that contain a colon and start with a digit are kept;
        # everything else is reported and ignored.
        if ":" in title and title[0].isdigit():
            lessons[title] = host + href
        else:
            print("不存在" + title)
# Save each lesson's transcript as a UTF-8 .txt file named after the lesson.
# Characters that are illegal in file names on common filesystems are
# replaced with "_": a title used verbatim may contain ":" or "/", which
# breaks (or silently misdirects) the write.
unsafe_chars = str.maketrans(dict.fromkeys('<>:"/\\|?*', "_"))
for key in lessons:
    print("title:" + key)
    # Close the HTTP response as soon as the page has been parsed.
    with request.urlopen(lessons[key]) as response:
        soup_lesson = BeautifulSoup(response, "html.parser")
    transcript = soup_lesson.find(id="transcript")
    if transcript is None:
        # No element with id="transcript" on this page: report it and move
        # on instead of aborting the remaining lessons.
        print("未找到字幕:" + key)
        continue
    # Every <p> of the transcript becomes one paragraph followed by a blank line.
    content = "".join(p.getText() + "\n\n" for p in transcript.find_all("p"))
    # Explicit encoding keeps the output independent of the platform locale;
    # `with` guarantees the file is closed even if the write fails.
    with open(key.translate(unsafe_chars) + ".txt", "w", encoding="utf-8") as file:
        file.write(content)
print("爬取完成")