python 爬取网上数据Crawler data(4.音乐)

冒雨前行的蜗牛

已于 2022-08-29 15:56:15 修改

阅读量157

点赞数

分类专栏：笔记文章标签： python 爬虫开发语言

于 2022-05-15 20:37:06 首次发布

本文链接：https://blog.csdn.net/ZENGshuihai/article/details/124773529

版权

Python Selenium BeautifulSoup 音乐下载网络爬虫

关键词由CSDN通过智能技术生成

笔记专栏收录该内容

37 篇文章 1 订阅

订阅专栏

1.元素获取:

import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

def get_cont(target):
    """Download the audio embedded in a song page.

    Loads ``target`` in a Selenium-driven browser, parses the resulting page
    source and fetches the ``src`` of the first ``<audio>`` element that has
    one.

    Args:
        target: URL of the song page to open.

    Returns:
        The raw bytes of the audio file, or ``None`` when the page contains
        no ``<audio>`` element with a ``src`` attribute.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko)'
    }

    # NOTE(review): webdriver.Opera appears to have been dropped in newer
    # Selenium 4 releases -- confirm against the installed version.
    webbrowser = webdriver.Opera()
    try:
        webbrowser.get(url=target)
        page = BeautifulSoup(webbrowser.page_source, 'lxml')
    finally:
        # Always release the browser: this function runs once per song, so a
        # leaked instance would pile up one browser process per download.
        webbrowser.quit()

    for audio in page.find_all('audio'):
        music_url = audio.get('src')
        if not music_url:
            # An <audio> tag without a source cannot be downloaded.
            continue
        return requests.get(music_url, headers=headers).content
    return None

if __name__ == '__main__':
    # Playlist page whose song links are collected.
    server = 'https://www.kugou.com/yy/special/single/3672076.html'
    # Directory the .mp3 files are written to; created on first run.
    save_dir = '音乐'
    os.makedirs(save_dir, exist_ok=True)

    browser = webdriver.Opera()
    try:
        browser.get(server)
        soup = BeautifulSoup(browser.page_source, 'lxml')
    finally:
        # The page source has been captured; the browser is no longer needed.
        browser.quit()

    # find() returns None when the container is absent; treat that as
    # "no songs" instead of failing on None.find_all.
    song_list = soup.find('div', class_='list1')
    links = song_list.find_all('a') if song_list is not None else []

    for song_url in tqdm(links):
        # Anchors may lack href/title; .get() then returns None.
        href = (song_url.get('href') or '').strip()
        name = song_url.get('title')
        if not name or not href.endswith('.html'):
            continue

        response = get_cont(href)
        if response is None:
            # No audio was found on the song page; nothing to save.
            continue

        with open(os.path.join(save_dir, name + '.mp3'), 'wb') as f:
            f.write(response)

2.选取目标,网页解析:

直接打开对应的下载页面下载即可(页面中的mp3地址已加密):
在这里插入图片描述
3.分析网页要获取的元素id:

import requests 
from bs4 import BeautifulSoup
from selenium import webdriver 

# User-Agent value, split across lines only for readability.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/90.0.4430.212 Safari/537.36'
)

# HTTP request headers: the referer points at music.163.com, while the CSRF
# token and cookie are left empty.
HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'referer': 'https://music.163.com/',
    'x-csrf-token': '',
    'x-requested-with': 'XMLHttpRequest',
    'cookie': '',
    'User-Agent': _USER_AGENT,
}

# Page to scrape.
POPULAR_URL = "https://music.163.com/discover/toplist?id=19723756"
# URL prefix that a song id is appended to when building a download link.
server = 'http://music.163.com/song/media/outer/url?id='

# Load the page through Selenium and parse the resulting page source.
# NOTE(review): webdriver.Opera appears to have been dropped in newer
# Selenium 4 releases -- confirm against the installed version.
browser = webdriver.Opera()
try:
    browser.get(POPULAR_URL)
    soup = BeautifulSoup(browser.page_source, 'lxml')
finally:
    # The page source has been captured; close the browser so it does not
    # stay open for the rest of the script.
    browser.quit()

4.获取歌曲id(可用切片[:10]只取前10首):

# Collect the <a> elements that sit directly under <li> items of
# <ul class="f-hide"> in the parsed page.
texts = soup.select('ul.f-hide > li > a')
for text in texts:
    # NOTE: this name shadows the builtin id(); rename with care, since other
    # snippets may refer to it.
    id = text.get('href')
    name = text.string

5.下载:

    # Download every entry in `texts`. The request and the file write sit
    # inside the loop so that each song is saved, not only the last one.
    # The '音乐' directory must already exist.
    for text in texts:
        # The song id is the part of the link's href after the last '='.
        url = server + text.get('href').split('=')[-1] + '.mp3'
        name = text.string
        print(name, url)

        response = requests.get(url, headers=HEADERS)
        # A forward slash avoids the invalid '\%' escape and is accepted by
        # open() on Windows as well as on POSIX systems.
        with open('音乐/%s' % (name + '.mp3'), 'wb') as f:
            f.write(response.content)

6.完整代码:

import requests 
from bs4 import BeautifulSoup
from selenium import webdriver 
from tqdm import tqdm

# User-Agent value, split across lines only for readability.
_USER_AGENT = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/90.0.4430.212 Safari/537.36'
)

# HTTP request headers: the referer points at music.163.com, while the CSRF
# token and cookie are left empty.
HEADERS = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'referer': 'https://music.163.com/',
    'x-csrf-token': '',
    'x-requested-with': 'XMLHttpRequest',
    'cookie': '',
    'User-Agent': _USER_AGENT,
}

# Page that lists the songs.
server_url = 'https://music.163.com/discover/toplist?id=19723756'
# URL prefix that a song id is appended to when building a download link.
server = 'http://music.163.com/song/media/outer/url?id='

if __name__ == '__main__':
    # Local import: only this entry point needs it.
    import os

    # Directory the .mp3 files are written to; created up front so a missing
    # folder does not cause every download to be skipped.
    save_dir = '音乐'
    os.makedirs(save_dir, exist_ok=True)

    # Fetch and parse the page that lists the songs.
    response = requests.get(server_url, headers=HEADERS)
    response.encoding = 'utf-8'
    webtext = BeautifulSoup(response.text, 'lxml')
    texts = webtext.select('ul.f-hide > li > a')

    for text in tqdm(texts):
        name = text.string
        href = text.get('href')
        if not name or not href:
            # Without a title or a link there is nothing to save or fetch.
            continue

        # The song id is the part of the href after the last '='.
        url = server + href.split('=')[-1] + '.mp3'
        print(name, url)
        try:
            song = requests.get(url, headers=HEADERS, timeout=30)
            # Do not save an HTTP error page under an .mp3 name.
            song.raise_for_status()
            with open(os.path.join(save_dir, name + '.mp3'), 'wb') as f:
                f.write(song.content)
        except (OSError, requests.RequestException) as err:
            # Best effort: report the failure and move on to the next song.
            print('skipped %s: %s' % (name, err))
            continue