Python爬取EF每日英语资源
作为程序猿,英文的重要程度不用多说,大家都是知道的。那么今天为大家分享一个如何用Python获取英语学习资源的一个案例。目标网站是英孚教育。(学习目的,请勿商用。如果侵犯到了您的权益,请联系我,我会删除此文章。)
网站:http://center.ef.com.cn/blog/
需要的库:
- python3.x
- urllib
- requests
- bs4
- json
目标网站分析
-
1:http://center.ef.com.cn/blog/lesson?lesson_id=457&view=video
2:http://center.ef.com.cn/blog/lesson?lesson_id=458&view=video
3:http://center.ef.com.cn/blog/lesson?lesson_id=459&view=video
。。。
32:http://center.ef.com.cn/blog/lesson?lesson_id=488&view=video
-
MP4 根目录:https://www.englishtown.cn
具体代码实现
from bs4 import BeautifulSoup
import urllib.request
import requests
import json
import os
import time
class EF:
    """Scraper for EF daily-lesson resources.

    For each lesson it downloads the lesson video (MP4), per-sentence
    audio clips (MP3) and a bilingual EN/ZH transcript into a folder
    named after the lesson.
    """

    def __init__(self):
        # JSON endpoint with the lesson slides/sentences (caller appends
        # "lesson_id=...&transculturecode=..." to the trailing "&").
        self.baseUrl = "https://www.englishtown.cn/community/dailylesson/lessonhandler.ashx?operate=preloaddata&teachculturecode=en&ss=EE&v=4&"
        # NOTE: the original used a backslash line-continuation inside the
        # string literal, which baked a run of spaces into the UA header.
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
        # JSON endpoint with lesson metadata (name), same query suffix.
        self.name_url = "https://www.englishtown.cn/community/dailylesson/lessonhandler.ashx?operate=getlessonbyid&v=4&"

    @staticmethod
    def _sanitize_name(name):
        """Replace characters that are illegal in file names with '_'."""
        illegal = ['/', '\\', ':', '*', "'", '"', '<', '>', '|', '?', '\r', '\n']
        return name.translate(str.maketrans({c: '_' for c in illegal}))

    def getRequsetContent(self, url):
        """Fetch *url* and return its body decoded as UTF-8.

        Returns None (after logging) when the request fails, instead of
        silently swallowing every exception with a bare ``except:``.
        """
        try:
            req = urllib.request.Request(url, headers=self.header)
            response = urllib.request.urlopen(req, timeout=10)
        except (urllib.error.URLError, OSError) as err:
            print("页面加载失败: {}".format(err))
            return None
        return response.read().decode('UTF-8')

    def spyder(self, url, name_url):
        """Scrape one lesson: fetch its JSON, save transcript, MP3s, MP4.

        *url* points at the slide/sentence JSON, *name_url* at the lesson
        metadata JSON (the two live on different endpoints).
        """
        html = self.getRequsetContent(url)
        html4name = self.getRequsetContent(name_url)
        if html is None or html4name is None:
            # A failed fetch used to propagate None into json.loads and
            # crash with an unrelated TypeError; skip the lesson instead.
            return
        data_dict = json.loads(html)
        data_dict_name = json.loads(html4name)

        lesson = data_dict_name['Lesson']['LessonNameWithPerfix']
        # Name looks like "<level> - <title>"; keep the title part, but
        # fall back to the whole string when no dash is present.
        parts = lesson.split("-")
        lesson_name = self._sanitize_name(parts[1] if len(parts) > 1 else parts[0])
        if not os.path.exists(lesson_name):
            os.mkdir(lesson_name)

        # Dig down to the English slide; the JSON nesting is deep but fixed.
        en = data_dict['Slides'][0]['LocalizedSlides']['en']
        mp4_url = "https://www.englishtown.cn" + en['MediaSource']

        en_list = []   # English sentence texts
        cn_list = []   # Chinese translations
        mp3_list = []  # per-sentence audio file names
        for sentence in en['Dialogue']['Sentences']:
            en_list.append(sentence['Sentence']['Text'])
            mp3_list.append(sentence['Sentence']['SentenceAudioSrc'])
            cn_list.append(sentence['Trans']['zh-cn']['Text'])

        transcript_path = os.path.join(lesson_name, "sentences.txt")
        for en_text, cn, mp3 in zip(en_list, cn_list, mp3_list):
            print("英文:{}, 中文:{}, MP3:{}".format(en_text, cn, mp3))
            # Explicit UTF-8: the transcript mixes Chinese and English, and
            # the platform default encoding may not handle it.
            with open(transcript_path, 'a', encoding='utf-8') as f:
                # BUG FIX: the original wrote `cn + "mp3"` into the MP3
                # column; record the actual audio file name.
                f.write("英文:{}, 中文:{}, MP3:{}".format(en_text, cn, mp3))
                f.write("\n")
            mp3_url = "https://cns.ef-cdn.com/_snds/wiktionary/sentences/" + mp3
            self.DL(mp3_url, lesson_name, cn)
            time.sleep(0.5)  # be polite to the CDN
        self.DL(mp4_url, lesson_name, "")

    def DL(self, url, fd_name, mp3_name):
        """Download *url* into folder *fd_name*.

        An empty *mp3_name* means "this is the lesson video": the file is
        saved as <folder>.mp4; otherwise as <mp3_name>.mp3.
        """
        res = requests.get(url, headers=self.header, timeout=30)
        fn = fd_name + ".mp4" if mp3_name == "" else mp3_name + ".mp3"
        with open(os.path.join(fd_name, fn), 'wb') as f:
            f.write(res.content)
if __name__ == "__main__":
ef = EF()
for i in range(457, 489):
url = ef.baseUrl + "lesson_id={}&transculturecode=zh-cn".format(i)
name_url = ef.name_url + "lesson_id={}&transculturecode=zh-cn".format(i)
ef.spyder(url, name_url)
time.sleep(1)
注意点
- 获取课程名称的地址和获取句子的url不一样,需要爬取两次
- 保存文件时,注意文件名中不能有特殊符号
- 本项目的难点在于对JSON的解析,层级比较深,但整体还是比较好理解的。
- 待优化的地方:这个项目没有加线程,所以下载速度较慢,大家可以自行修改一下。
- 另外,URL有待优化,我这里只是爬取了等级4下面的内容。