英语听力精听资料批量获取
kk英语是个非常不错的网站,可以精听,但是无法做到自动暂停回放,对于精听党就有点难受了。所以把资料爬取下来,用Media Study Player来断句就很nice
python抓取代码如下:
# -*- coding: utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import os
import ssl
def DownloadFile(mp3_url,mp3_title):
    """Download ``mp3_url`` into the current working directory.

    The file is saved as ``<mp3_title>.mp3``; characters that are not allowed
    in file names (``\\ / : * ? " < > |`` and control whitespace) are replaced
    with ``_`` so that a scraped page title cannot break ``open()`` or point
    outside the working directory.

    Args:
        mp3_url: Absolute URL of the MP3 resource.
        mp3_title: Title used to build the local file name (without extension).

    Returns:
        None. Progress and failures are reported with ``print``; network
        errors raised by ``requests`` propagate to the caller.
    """
    # Validate before building the file name: concatenating None with a str
    # would raise TypeError and the error message would never be shown.
    if mp3_url is None or mp3_title is None:
        print('参数错误')
        return None

    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', mp3_title).strip()
    mp3_name = safe_title + r".mp3"
    # The working directory always exists, so no directory creation is needed.
    file_path = os.path.join(os.getcwd(), mp3_name)

    # NOTE(review): the next two lines and ``verify=False`` turn off TLS
    # certificate verification (process-wide for the ssl default context) and
    # silence the resulting warnings. Kept for backward compatibility; confirm
    # whether the audio host still needs it before relying on this elsewhere.
    ssl._create_default_https_context = ssl._create_unverified_context
    requests.packages.urllib3.disable_warnings()

    # ``with`` releases the streamed connection even if writing fails.
    with requests.get(mp3_url,stream=True,verify = False) as res:
        if not res.ok:
            # Do not store an HTML error page under an .mp3 name.
            print('下载失败:', mp3_url, res.status_code)
            return None
        print('开始写入MP3文件:', file_path)
        with open(file_path, 'wb') as fd:
            # iter_content() defaults to 1-byte chunks; use a real buffer size.
            for chunk in res.iter_content(chunk_size=8192):
                if chunk:
                    fd.write(chunk)
    print(mp3_name+' 成功下载' + mp3_title)
def writetext(data,txtname):
    """Write ``data`` as UTF-8 text to ``<txtname>.txt`` in the working directory.

    Characters that are not allowed in file names (``\\ / : * ? " < > |`` and
    control whitespace) are replaced with ``_``, so a title such as
    ``"Unit 1/2"`` is stored as ``Unit 1_2.txt`` instead of failing or being
    written into another directory.

    Args:
        data: Text content to store.
        txtname: File name without the ``.txt`` extension.
    """
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]+', '_', txtname).strip()
    target = os.path.join(os.getcwd(), safe_name + '.txt')
    with open(target, 'w', encoding='utf-8') as file:
        file.write(data)
# Index pages of the CET-6 listening section that are scanned for lesson links.
LIST_PAGE_URLS = [
    'http://www.kekenet.com/cet6/ljtl/',
    'https://www.kekenet.com/cet6/ljtl/List_105.shtml',
    'https://www.kekenet.com/cet6/ljtl/List_104.shtml',
    'https://www.kekenet.com/cet6/ljtl/List_103.shtml',
    'https://www.kekenet.com/cet6/ljtl/List_102.shtml',
    'https://www.kekenet.com/cet6/ljtl/List_101.shtml',
]

# Patterns are compiled once, with the dots escaped so they match literally.
# They depend on the site's current markup and need updating if it changes.
LESSON_LINK_RE = re.compile(
    r'<a href="(http://www\.kekenet\.com/cet6/\d+/\d+\.shtml)" title=.*? target="_blank"'
)
AUDIO_SRC_RE = re.compile(
    r'audio src="(http://k6\.kekenet\.com/Sound/.*?\.mp3)" id="myaudio"'
)
LESSON_TITLE_RE = re.compile(r'<h1 id="nrtitle">(.*?)</h1>')

# Seconds to wait for a page before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def _fetch_html(page_url):
    """Return the body of ``page_url`` decoded as UTF-8."""
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    return response.text


def _save_lesson(lesson_url):
    """Save the MP3 and the transcript text of a single lesson page.

    Pages that lack the audio tag, the title or the transcript container are
    reported and skipped rather than aborting the whole batch.
    """
    html_text = _fetch_html(lesson_url)
    audio_match = AUDIO_SRC_RE.search(html_text)
    title_match = LESSON_TITLE_RE.search(html_text)
    if audio_match is None or title_match is None:
        print('跳过(未找到音频或标题):', lesson_url)
        return
    title = title_match.group(1)

    # Audio first, then the transcript, both named after the lesson title.
    DownloadFile(audio_match.group(1), title)

    soup = BeautifulSoup(html_text, 'html.parser')
    transcript_div = soup.find('div', {'id': 'contentText'})
    if transcript_div is None:
        print('跳过文本(未找到正文):', lesson_url)
        return
    writetext(transcript_div.get_text(separator='\n', strip=True), title)


if __name__ == '__main__':
    # A lesson linked more than once is only fetched the first time.
    seen_lessons = set()
    for list_url in LIST_PAGE_URLS:
        try:
            list_html = _fetch_html(list_url)
        except requests.RequestException as exc:
            print('列表页请求失败:', list_url, exc)
            continue
        for lesson_url in LESSON_LINK_RE.findall(list_html):
            if lesson_url in seen_lessons:
                continue
            seen_lessons.add(lesson_url)
            try:
                _save_lesson(lesson_url)
            except (requests.RequestException, OSError) as exc:
                # One broken lesson should not stop the remaining downloads.
                print('下载失败:', lesson_url, exc)
Reference:
1. Hoyou, 2022-06-21, [python学习]使用Python下载MP3