指北&晓年鉴(音频下载)

import sys
from lxml import etree
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import re
import os

# Script purpose: download Xiaosong audio (WeChat official account).
# Browser-like User-Agent passed as `headers=` to requests.get calls in this file.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36'
    ),
}


def get_data_list(url_path):
    """Collect the audio article links listed on an index page.

    Fetches ``url_path``, looks inside the ``<div id="js_content">`` element
    and, for every ``<li>`` containing an ``<a data-linktype="2">`` link,
    records that link's title and target URL.

    Args:
        url_path: URL of the index article to scan.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts in page order. The
        list is empty when the page has no ``js_content`` container or no
        matching links.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    # Send the same browser-like headers as the other fetches in this file,
    # and bound the wait so a stalled connection cannot hang the script.
    response = requests.get(url_path, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")

    js_content_div = soup.find('div', {'id': 'js_content'})
    if js_content_div is None:
        # Unexpected page layout (e.g. an error or verification page).
        return []

    audio_list = []
    for li in js_content_div.find_all('li'):
        a_element = li.find('a', {'data-linktype': '2'})
        if a_element is None:
            continue

        url = a_element.get('href')
        if not url:
            # A link without a target cannot be downloaded; skip it.
            continue

        # Prefer the 'textvalue' attribute; fall back to the visible link
        # text when the attribute is absent instead of raising KeyError.
        title = a_element.get('textvalue') or a_element.get_text(strip=True)
        audio_list.append({'title': title, 'url': url})

    return audio_list


# Extract the voice id embedded in an article page
def get_mediaid_info(url):
    """Return the first ``voice_id`` found in the page at ``url``.

    The page text is searched for a ``"voice_id":"<word chars>"`` fragment.

    Args:
        url: URL of the article page to inspect.

    Returns:
        The captured id string, or ``None`` when the page contains no
        ``voice_id`` entry.

    Raises:
        requests.RequestException: if the page cannot be fetched (including
            a timeout).
    """
    # A timeout keeps one unresponsive page from blocking the whole batch.
    resp = requests.get(url, headers=headers, timeout=30)
    match = re.search(r'"voice_id":"(\w+)"', resp.text)
    if match is None:
        return None
    return match.group(1)


# Download one audio file
def download_audio(mediaid, file_path):
    """Stream the audio identified by ``mediaid`` into ``file_path``.

    The body is written in 1 KiB chunks while a tqdm progress bar is
    advanced by the number of bytes written.

    Args:
        mediaid: Voice/media id appended to the ``getvoice`` endpoint URL.
        file_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (nothing is written in that case).
        requests.RequestException: on other network failures or a timeout.
        OSError: if ``file_path`` cannot be opened for writing.
    """
    url = 'https://res.wx.qq.com/voice/getvoice?mediaid=' + mediaid
    # Using the response as a context manager releases the streamed
    # connection even if writing fails part-way through.
    with requests.get(url, headers=headers, stream=True, timeout=30) as resp:
        # Fail before opening the file so an error page is never saved as audio.
        resp.raise_for_status()
        total_size = int(resp.headers.get('content-length', 0))
        # 'wb' = write binary.
        with open(file_path, 'wb') as f, tqdm(
                desc=f'Downloading {os.path.basename(file_path)}',
                # None tells tqdm the total is unknown when the server
                # sent no Content-Length.
                total=total_size or None,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
                miniters=1,
                ascii=True) as progress_bar:
            for data in resp.iter_content(chunk_size=1024):
                if data:
                    f.write(data)
                    progress_bar.update(len(data))  # advance the progress bar


def sanitize_filename(title, fallback='audio'):
    """Turn an article title into a string that is safe to use as a file name.

    Characters that are illegal in file names on common platforms
    (``\\ / : * ? " < > |`` and ASCII control characters) are replaced with
    ``_``; leading/trailing spaces and dots are removed.

    Args:
        title: Raw title text.
        fallback: Name returned when nothing usable is left of ``title``.

    Returns:
        The cleaned name, or ``fallback`` if the cleaned name is empty.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', title).strip(' .')
    return cleaned or fallback


# Script entry point
if __name__ == '__main__':

    dir_path = 'F:/down/'
    # Create the target directory up front so open() cannot fail on a
    # missing folder.
    os.makedirs(dir_path, exist_ok=True)

    # Index page: 指北
    # url_path = 'https://mp.weixin.qq.com/s/zOS77DD8dEgUgfehWIyweA'

    # Index page: 晓年鉴
    url_path = 'https://mp.weixin.qq.com/s/t9inWwVlN7niZfQNVAtf3Q'

    # Collect the {'title', 'url'} entries from the index page
    data_list = get_data_list(url_path)
    # Resolve each article's media id and download the audio
    for audio in data_list:
        try:
            mediaid = get_mediaid_info(audio['url'])
            if not mediaid:
                continue
            print('正在下载' + audio['title'])
            # Titles may contain characters that are invalid in file names;
            # fall back to the media id if nothing usable remains.
            file_name = sanitize_filename(audio['title'], mediaid) + '.mp3'
            file_path = os.path.join(dir_path, file_name)
            download_audio(mediaid, file_path)
        except requests.RequestException as exc:
            # One failed request should not abort the remaining downloads.
            print(f"下载失败 {audio['title']}: {exc}")
  • 2
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值