英语六级听力精听资料爬取教程_mp.bookin.cn 听力爬取-CSDN博客

本文链接：https://blog.csdn.net/qq_28778001/article/details/135278819

文章讲述了如何使用Python编写代码从kk英语网站抓取并下载英语听力资料，包括音频文件和文本内容；通过requests和BeautifulSoup库实现批量下载，以便在本地播放器中按句自动暂停进行精听。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

英语听力精听资料批量获取

kk英语是个非常不错的网站，可以精听，但是无法做到自动暂停回放，对于精听党就有点难受了。所以爬取下来，用Media Study Player来断句就很nice

python抓取代码如下：

# -*- coding: utf-8 -*-

import requests
import re
from bs4 import BeautifulSoup
import os
import ssl


def DownloadFile(mp3_url,mp3_title):
    """Download the MP3 at ``mp3_url`` into the current working directory.

    The file is saved as ``<mp3_title>.mp3``.

    Args:
        mp3_url: URL of the MP3 resource to fetch.
        mp3_title: Base name (without extension) used for the saved file and
            echoed in the progress messages.

    Returns:
        None in every case. Invalid arguments and non-success HTTP responses
        are reported on stdout and nothing is written to disk.
    """
    # Validate before building the file name: concatenating onto a None title
    # would raise TypeError before the check could run.
    if mp3_url is None or mp3_title is None:
        print('参数错误')
        return None

    mp3_name = mp3_title + ".mp3"
    save_path = os.getcwd()
    # Create the target directory if needed (no separate existence check).
    os.makedirs(save_path, exist_ok=True)

    # NOTE(review): this replaces the ssl module's default HTTPS context
    # factory with the unverified one for the whole process. Kept for
    # backward compatibility; the request below already passes verify=False.
    ssl._create_default_https_context = ssl._create_unverified_context
    requests.packages.urllib3.disable_warnings()

    file_path = os.path.join(save_path, mp3_name)
    # The context manager releases the streamed connection even if writing fails.
    with requests.get(mp3_url, stream=True, verify=False) as res:
        # Do not save an HTTP error page under an .mp3 name.
        if not res.ok:
            print('下载失败，HTTP状态码：', res.status_code)
            return None
        print('开始写入MP3文件：', file_path)
        with open(file_path, 'wb') as fd:
            # iter_content() defaults to 1-byte chunks; use a real buffer size.
            for chunk in res.iter_content(chunk_size=64 * 1024):
                fd.write(chunk)
    print(mp3_name+' 成功下载' + mp3_title)
    
def writetext(data,txtname):
    """Write ``data`` as UTF-8 text to ``<txtname>.txt`` in the working directory.

    Args:
        data: Text to store.
        txtname: Base name (without extension) of the output file.
    """
    filename = txtname + '.txt'
    destination = os.path.join(os.getcwd(), filename)
    with open(destination, mode='w', encoding='utf-8') as out:
        out.write(data)

if __name__ == '__main__':
    # Crawl each listing page, then for every linked article download its MP3
    # and save the transcript text next to it under the same base name.
    url = ['http://www.kekenet.com/cet6/ljtl/','https://www.kekenet.com/cet6/ljtl/List_105.shtml','https://www.kekenet.com/cet6/ljtl/List_104.shtml','https://www.kekenet.com/cet6/ljtl/List_103.shtml','https://www.kekenet.com/cet6/ljtl/List_102.shtml','https://www.kekenet.com/cet6/ljtl/List_101.shtml']

    # Patterns are compiled once, outside the loops. Literal dots are escaped
    # and both http and https links are accepted.
    article_link = re.compile(r'<a href="(https?://www\.kekenet\.com/cet6/\d+/\d+\.shtml)" title=(.*?) target="_blank"')
    audio_src = re.compile(r'audio src="(https?://k6\.kekenet\.com/Sound/.*?\.mp3)" id="myaudio"')
    heading = re.compile(r'<h1 id="nrtitle">(.*?)</h1>')
    # Characters that are path separators or are rejected in Windows file names.
    unsafe_chars = re.compile(r'[\\/:*?"<>|]')

    for list_url in url:
        r = requests.get(list_url)
        r.encoding = 'utf-8'
        for article_url, _link_title in article_link.findall(r.text):
            # Open the article page to locate its MP3 and transcript.
            response = requests.get(article_url)
            response.encoding = 'utf-8'

            audio_match = audio_src.search(response.text)
            title_match = heading.search(response.text)
            # Skip pages without an audio tag or heading instead of crashing
            # the whole crawl with an IndexError.
            if audio_match is None or title_match is None:
                print('跳过（未找到音频或标题）：', article_url)
                continue

            mp3_url = audio_match.group(1)
            # The heading is used as a file name for both outputs, so clean it
            # once here to keep the .mp3 and .txt names identical.
            title = unsafe_chars.sub('_', title_match.group(1)).strip()

            # save mp3
            DownloadFile(mp3_url, title)

            # save mp3_text
            soup = BeautifulSoup(response.text, 'html.parser')
            content_text_div = soup.find('div', {'id': 'contentText'})
            # find() returns None when the transcript container is missing.
            if content_text_div is None:
                print('跳过文本（未找到正文）：', article_url)
                continue
            mp3_text = content_text_div.get_text(separator='\n', strip=True)
            writetext(mp3_text, title)

Reference:

1. Hoyou, 2022-06-21, [python学习]使用Python下载MP3