python爬虫爬取网易云音乐歌曲_Python网易云音乐爬虫进阶篇

image.png

年前写过一篇爬网易云音乐评论的文章,爬不了多久又会被封,所以爬下来那么点根本做不了什么分析,后面就再改了下,加入了多线程,一次性爬一个歌手最热门50首歌曲的评论,算是进阶版了~

思路梳理进入歌手页可以看到展示了该歌手的「热门50单曲」,通过BeautifulSoup获取到song_id和song_name;

image.png

Notes:

不过这里有一点需要注意,一般像平常我都会选择request或者urllib获取到页面代码,然后通过BeautifulSoup提取我们需要的数据,但去尝试了之后发现歌曲id等都是动态加载的,而像request请求操作都是瞬间完成,并不会等待页面完成加载了再获取页面代码,所以后面便使用了最安全的selenium+BeautifulSoup来获取song_id和song_name。def get_song_id(url):

driver=webdriver.PhantomJS()

driver.get(url)

time.sleep(2)

driver.switch_to_frame('g_iframe')

time.sleep(5)

web_data = driver.page_source

soup=BeautifulSoup(web_data,'lxml')

top_50_song=soup.find_all('tr', class_='even ')

driver.quit()

return top_50_song有了song_id之后就好做了,之前的文章中已经说过了,每个歌曲的评论是通过一个包含song_id的地址传递的,然后加入多线程分别保存50首歌曲,然后等着被封或者完成就好了def save_comment(song_id,song_name):

url_comment = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_'+str(song_id)+'?csrf_token='

page = get_page(url_comment,song_name)

if page == 0:

print 'IP地址被封,请稍后再试!!!'

else:

for i in range(page):

try:

params = get_params(i);

encSecKey = get_encSecKey();

json_text = get_json(url_comment, params, encSecKey)

json_dict = json.loads(str(json_text))['comments']

for t in list(range(len(json_dict))):

if t == 0:

rdata=pd.DataFrame(pd.Series(data=json_dict[t])).T

else:

rdata=pd.concat([rdata,pd.DataFrame(pd.Series(data=json_dict[t])).T])

if i == 0:

commentdata=rdata

else:

commentdata=pd.concat([commentdata,rdata])

print '***正在保存>>%s<<第%d页***'%(song_name.encode('utf-8'),i+1)

time.sleep(random.uniform(0.2,0.5))

path = song_name.encode('utf-8')+'.xlsx'

except Exception, e:

print 'IP地址被封,%s未保存完全!!!'%song_name.encode('utf-8')

commentdata.to_excel(path)

其他部分在上篇文章已经说过了,包括加密部分,可移步

Python爬虫爬取网易云音乐全部评论查看。

完整代码:# -*- coding: utf-8 -*-

#date : 2018-02-28

#author : Awesome_Tang

#version : Python 2.7.9

'''

网易云音乐评论爬虫

'''

from Crypto.Cipher import AES

import base64

import requests

import json

import time

import pandas as pd

import random

from threading import Thread

from bs4 import BeautifulSoup

from selenium import webdriver

import threading

# HTTP headers captured from a logged-in browser session; the Referer and
# Cookie are what make the weapi endpoint accept our POST requests.
# NOTE(review): the Cookie/session values are stale by now -- refresh before use.
headers = {

'Referer': 'http://music.163.com/song?id=531051217',

'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',

'Cookie': 'JSESSIONID-WYYY=%5CuiUi%5C%2FYs%2FcJcoQ5xd3cBhaHw0rEfHkss1s%2FCfr92IKyg2hJOrJquv3fiG2%2Fn9GZS%2FuDH8PY81zGquF4GIAVB9eYSdKJM1W6E2i1KFg9%5CuZ4xU6VdPCGwp4KOUZQQiWSlRT%2F1r07OmIBn7yYVYN%2BM2MAalUQnoYcyskaXN%5CPo1AOyVVV%3A1516866368046; _iuqxldmzr_=32; _ntes_nnid=7e2e27f69781e78f2c610fa92434946b,1516864568068; _ntes_nuid=7e2e27f69781e78f2c610fa92434946b; __utma=94650624.470888446.1516864569.1516864569.1516864569.1; __utmc=94650624; __utmz=94650624.1516864569.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmb=94650624.8.10.1516864569'

}

# Proxies used for every weapi POST (see get_json).
# NOTE(review): public proxies like these rot quickly -- verify they still work.
proxies = { "https": "218.94.255.11:8118",

"http": "110.73.43.110:8123",}

# Constants of the client-side weapi encryption scheme:
#   first_param  -- plaintext JSON template for the page-0 comment request
#   second_param -- presumably the RSA public exponent used by the web client;
#                   not referenced anywhere in this script
#   third_param  -- presumably the RSA modulus; likewise unreferenced here
#   forth_param  -- the first AES key consumed by AES_encrypt in get_params
first_param = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'

second_param = "010001"

third_param = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"

forth_param = "0CoJUm6Qyw8W8jud"

def get_params(i):
    """Build the encrypted ``params`` payload for page *i* of the comment API.

    Page 0 sends the canonical first request; subsequent pages set
    offset = i * 20.  The plaintext JSON is AES-encrypted twice (the inner
    pass with the fixed client key, the outer with sixteen 'F' characters),
    mirroring what the NetEase web player does in JavaScript.
    """
    if i == 0:
        plain = '{rid:"", offset:"0", total:"true", limit:"20", csrf_token:""}'
    else:
        # NOTE: 'flase' is deliberate -- the original web client sends this
        # exact (misspelled) value and the server accepts it.
        plain = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (str(i * 20), 'flase')
    iv = "0102030405060708"
    # Two nested AES-CBC passes, each base64-encoded by AES_encrypt.
    once = AES_encrypt(plain, forth_param, iv)
    twice = AES_encrypt(once, 16 * 'F', iv)
    return twice

def get_encSecKey():
    """Return the pre-computed ``encSecKey`` captured from the web client.

    The web player RSA-encrypts a random 16-byte key per session; because
    the server accepts any previously valid pair, a single captured value
    is reused here and no RSA step is needed at runtime.
    """
    return ("257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41"
            "f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d5"
            "12f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695"
            "b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c")

def AES_encrypt(text, key, iv):
    """AES-CBC encrypt *text* with PKCS#7-style padding; return it base64-encoded."""
    # Pad up to the next 16-byte boundary; the pad byte equals the pad length.
    pad_len = 16 - len(text) % 16
    padded = text + chr(pad_len) * pad_len
    cipher = AES.new(key, AES.MODE_CBC, iv)
    return base64.b64encode(cipher.encrypt(padded))

def get_json(url, params, encSecKey):
    """POST the encrypted form data to *url* and return the raw response body."""
    form = {
        "params": params,
        "encSecKey": encSecKey,
    }
    # Module-level headers/proxies; a 5-second timeout makes a dead proxy or
    # blocked IP fail fast instead of hanging the worker thread.
    resp = requests.post(url, headers=headers, data=form, proxies=proxies, timeout=5)
    return resp.content

def get_page(url,song_name):

params = get_params(0);

encSecKey = get_encSecKey();

json_text = get_json(url, params, encSecKey)

json_dict = json.loads(json_text)

try:

total_comment = json_dict['total']

page=(total_comment/20)+1

print '***查询到歌曲>>>%s<<<评论共计%d条,%d页***'%(song_name.encode('utf-8'),total_comment,page)

return page

except Exception, e:

return 0

def save_comment(song_id,song_name):

url_comment = 'http://music.163.com/weapi/v1/resource/comments/R_SO_4_'+str(song_id)+'?csrf_token='

page = get_page(url_comment,song_name)

if page == 0:

print 'IP地址被封,请稍后再试!!!'

else:

for i in range(page):

try:

params = get_params(i);

encSecKey = get_encSecKey();

json_text = get_json(url_comment, params, encSecKey)

json_dict = json.loads(str(json_text))['comments']

for t in list(range(len(json_dict))):

if t == 0:

rdata=pd.DataFrame(pd.Series(data=json_dict[t])).T

else:

rdata=pd.concat([rdata,pd.DataFrame(pd.Series(data=json_dict[t])).T])

if i == 0:

commentdata=rdata

else:

commentdata=pd.concat([commentdata,rdata])

print '***正在保存>>%s<<第%d页***'%(song_name.encode('utf-8'),i+1)

time.sleep(random.uniform(0.2,0.5))

path = song_name.encode('utf-8')+'.xlsx'

except Exception, e:

print 'IP地址被封,%s未保存完全!!!'%song_name.encode('utf-8')

commentdata.to_excel(path)

def get_song_id(url, phantomjs_path='/Users/XXXXX/phantomjs-2.1.1-macosx/bin/phantomjs'):
    """Load an artist page in headless PhantomJS and return the hot-50 song rows.

    The song table lives inside the 'g_iframe' frame and is rendered by
    JavaScript, so a plain HTTP fetch never sees it; selenium loads the page,
    switches into the frame, and waits for the content to render.

    Fixes over the original:
    - the PhantomJS binary location was a hard-coded machine-specific path;
      it is now a parameter (same default, so existing callers are unchanged).
    - the driver is quit in a finally block so it is not leaked when the
      page load or frame switch raises.

    Returns a list of BeautifulSoup <tr> elements.
    NOTE(review): only rows with class 'even ' (trailing space included) are
    selected -- confirm this really captures all 50 songs on the live page.
    """
    driver = webdriver.PhantomJS(executable_path=phantomjs_path)
    try:
        driver.get(url)
        time.sleep(2)  # let the outer page settle before switching frames
        driver.switch_to_frame('g_iframe')
        time.sleep(5)  # wait for the JS-rendered song table inside the frame
        soup = BeautifulSoup(driver.page_source, 'lxml')
        return soup.find_all('tr', class_='even ')
    finally:
        driver.quit()

if __name__ == "__main__":

start_time = time.time()

url_artist = 'http://music.163.com/#/artist?id=46376'

top_50_song = get_song_id(url_artist)

thread_list = []

for song in top_50_song:

song_id = song.select('td.w1 div span.ply ')[0].get('data-res-id')

song_name = song.select('span.txt b')[0].get('title').replace(u'\xa0',u' ')

my_thread = threading.Thread(target=save_comment, args=(str(song_id),song_name))

my_thread.setDaemon(True)

thread_list.append(my_thread)

for my_thread in thread_list:

my_thread.start()

for my_thread in thread_list:

my_thread.join()

end_time = time.time()

print "程序耗时%f秒." % (end_time - start_time)

print '***NetEase_Music_Spider@Awesome_Tang***'

Peace~

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
好的,我可以为您提供一个大致的思路: 1. 使用 Python 的 requests 和 BeautifulSoup 库爬取网易云音乐 Top105 的页面,获取每首歌曲的排名、歌曲名、歌手和歌曲 ID 等信息。 2. 使用 Python 的 mysql-connector 库连接到 MySQL 数据库,创建一个表来存储这些信息。 3. 遍历每首歌曲的信息,将其插入到数据库表中。可以使用 SQL 语句的 INSERT INTO 命令来实现。 下面是一个简单的示例代码,仅供参考: ```python import requests from bs4 import BeautifulSoup import mysql.connector # 爬取 Top105 页面 url = 'https://music.163.com/discover/toplist?id=3778678' r = requests.get(url) soup = BeautifulSoup(r.text, 'html.parser') # 连接到 MySQL 数据库 cnx = mysql.connector.connect(user='yourusername', password='yourpassword', host='127.0.0.1', database='yourdatabase') cursor = cnx.cursor() # 创建一个表来存储歌曲信息 create_table = """ CREATE TABLE IF NOT EXISTS Top105 ( id INT(11) NOT NULL AUTO_INCREMENT, rank INT(11), title VARCHAR(255), artist VARCHAR(255), song_id VARCHAR(255), PRIMARY KEY (id) ) """ cursor.execute(create_table) # 遍历每首歌曲的信息,将其插入到数据库表中 songs = soup.select('ul.f-hide li') for song in songs: rank = song.select_one('span').text.strip() title = song.select_one('a').text.strip() artist = song.select_one('span:nth-of-type(2)').text.strip() song_id = song.select_one('a')['href'].split('=')[-1] insert_song = f"INSERT INTO Top105 (rank, title, artist, song_id) VALUES ({rank}, '{title}', '{artist}', '{song_id}')" cursor.execute(insert_song) # 提交更改和关闭连接 cnx.commit() cursor.close() cnx.close() ``` 需要注意的是,此代码仅供参考,具体实现可能需要根据您的需求进行修改。同时,由于网易云音乐的反爬虫机制较为严格,因此在实际使用时需要注意不要过于频繁地访问页面,以免被封禁 IP。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值