import sys
from lxml import etree
from bs4 import BeautifulSoup
from tqdm import tqdm
import requests
import re
import os
# Download Xiaosong audio programmes from WeChat official-account articles.
# Browser-like request headers shared by the HTTP calls in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/123.0.0.0 Safari/537.36'
    ),
}
def get_data_list(url_path):
    """Collect the audio article links listed on a WeChat index article.

    The page at ``url_path`` is fetched, and every ``<a>`` element that sits
    inside an ``<li>`` of the ``js_content`` container and carries both an
    ``href`` and a ``textvalue`` attribute is reported.

    Args:
        url_path: URL of the article that lists the audio episodes.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts in document order.
        The list is empty when the page has no ``js_content`` element.
    """
    # Send the same browser-like headers as the other requests in this file,
    # and bound the wait so a stalled connection cannot hang the script.
    response = requests.get(url_path, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")
    js_content = soup.find(id="js_content")
    if js_content is None:
        # Unexpected page layout (or an error page): nothing to download.
        return []

    audio_list = []
    # Read the attributes from the parsed tree instead of regex-matching the
    # re-serialised HTML. Attribute values come back entity-decoded (so
    # "&amp;" inside a query string is a plain "&" again), and attribute
    # order, attributes on <li>, or line breaks inside an item no longer
    # prevent a match.
    for link in js_content.find_all('a', href=True):
        title = link.get('textvalue')
        if title is None or link.find_parent('li') is None:
            continue
        audio_list.append({'title': title, 'url': link['href']})
    return audio_list
# Look up the voice id embedded in a single audio article page.
def get_mediaid_info(url):
    """Return the ``voice_id`` found in the page at ``url``.

    Args:
        url: Address of one audio article.

    Returns:
        The first ``voice_id`` value present in the response text, or
        ``None`` when the page does not contain one.
    """
    page = requests.get(url, headers=headers)
    found = re.search(r'"voice_id":"(\w+)"', page.text)
    return found.group(1) if found else None
# Download
def download_audio(mediaid, file_path):
    """Stream one WeChat voice clip to ``file_path`` with a progress bar.

    Args:
        mediaid: Voice id of the clip (the value ``get_mediaid_info`` returns).
        file_path: Destination path; the file is created or overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never saved as an audio file.
    """
    url = 'https://res.wx.qq.com/voice/getvoice?mediaid=' + mediaid
    # The context manager releases the streamed connection even if writing
    # fails part-way; the timeout keeps a dead connection from blocking forever.
    with requests.get(url, headers=headers, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        total_size = int(resp.headers.get('content-length', 0))
        # 'wb': write the payload to the local file as raw bytes.
        with open(file_path, 'wb') as f, tqdm(
            desc=f'Downloading {os.path.basename(file_path)}',
            # Without a Content-Length header the size is unknown; passing
            # None lets tqdm show a plain counter instead of a 0-byte total.
            total=total_size or None,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
            miniters=1,
            ascii=True,
        ) as progress_bar:
            # 8 KiB chunks mean far fewer write/update calls than 1 KiB ones.
            for data in resp.iter_content(chunk_size=8192):
                if data:
                    f.write(data)
                    progress_bar.update(len(data))  # advance the progress bar
def sanitize_filename(name, replacement='_'):
    """Return ``name`` made safe to use as a single file name.

    Characters that Windows forbids in file names (``\\ / : * ? " < > |`` and
    control characters) are replaced with ``replacement``. Surrounding
    whitespace and trailing dots are removed.

    Args:
        name: Raw title text.
        replacement: Text substituted for each forbidden character.

    Returns:
        The cleaned name, or ``replacement`` if nothing usable is left.
    """
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', replacement, name)
    cleaned = cleaned.strip().rstrip(' .')
    return cleaned or replacement


# Script entry point.
# This crawler fetches the audio through the voice interface; reference
# article: cnblogs.com/wuliqv/p/9386143.html
if __name__ == '__main__':
    dir_path = 'F:/down/'
    # Create the target directory up front so open() cannot fail on a
    # missing folder.
    os.makedirs(dir_path, exist_ok=True)
    # Xiaosong Qitan
    # url_path = 'https://mp.weixin.qq.com/s?__biz=MzUyNzk3MjU1OQ==&mid=2247506868&idx=4&sn=c6a271f97ea94e07737ce7951a38feae&chksm=fa75fa14cd02730221a99cf910dcad6dd981f802e24df5a9e87667ec6d31cb5e67d4ba3fe4af&scene=21#wechat_redirect'
    # Xiao Shuo
    # url_path = 'https://mp.weixin.qq.com/s/RBqsuuEaJWQhEsRUwQJFMA'
    # Today in History
    url_path = 'https://mp.weixin.qq.com/s/yEcYMG9wqP_fRTa7-_oJeQ'
    # Collect the title/url pairs listed on the index article.
    data_list = get_data_list(url_path)
    # Resolve each article's media id and download the clip.
    for audio in data_list:
        mediaid = get_mediaid_info(audio['url'])
        if mediaid:
            print('正在下载' + audio['title'])
            # Titles are free text and may contain characters that are not
            # valid in a file name, so clean them before building the path.
            file_name = sanitize_filename(audio['title']) + '.mp3'
            file_path = os.path.join(dir_path, file_name)
            download_audio(mediaid, file_path)
# 使用python下载公众号里面的音频(晓松奇谈)
# 于 2024-04-18 16:59:06 首次发布