# Python 爬取网易云音乐，生成王力宏歌曲词云
import requests
import sys,re,os
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import jieba
from PIL import Image
import numpy as np
from lxml import etree
# HTTP request headers passed to the requests made against music.163.com
# by the functions in this script.
headers = {
    # Referer and Host both point at the music.163.com site itself.
    'Referer': 'http://music.163.com',
    'Host': 'music.163.com',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/webp,image/apng,*/*;q=0.8'
    ),
    # NOTE(review): presumably a browser-like User-Agent is needed for the
    # site to answer normally -- confirm before changing it.
    'User-Agent': 'Chrome/10',
}
def get_song_lyric(headers, lyric_url):
    """Download one song's lyric and strip its timestamp characters.

    Args:
        headers: Mapping of HTTP headers to send with the request.
        lyric_url: Full URL of the lyric endpoint for a single song.

    Returns:
        The lyric text with every digit, ':', '.', '[' and ']' removed,
        or '' when the JSON response carries no lyric text.
    """
    res = requests.request('GET', lyric_url, headers=headers)
    # Decode the body once instead of re-parsing it for every access.
    payload = res.json()

    lrc = payload.get('lrc') if isinstance(payload, dict) else None
    lyric = lrc.get('lyric') if isinstance(lrc, dict) else None
    if not isinstance(lyric, str):
        # No 'lrc' entry, or an 'lrc' entry without lyric text: treat both
        # as "no lyric" rather than failing on the missing value.
        return ''

    # The character class removes digits, ':', '.', '[' and ']', i.e. the
    # pieces that make up "[mm:ss.xx]"-style markers (and any such
    # characters elsewhere in the lyric).
    return re.sub(r'[\d:.[\]]', '', lyric)
def remove_stop_words(f):
    """Remove credit-style stop words from lyric text.

    Stop words written in Latin letters are removed only when they stand as
    whole words (not flanked by another ASCII letter), so 'and' no longer
    eats the tail of 'hand' or the middle of 'understand'. All other stop
    words are removed wherever they occur as substrings. Matching is
    case-sensitive.

    Args:
        f: The text to clean.

    Returns:
        The text with every stop-word occurrence deleted.
    """
    stop_words = ['作词','作曲', '编曲', 'Arranger', '录音', '混音', '人声', 'Vocal', '弦乐', 'Keyboard', '键盘', '编辑', '助理', 'Assistants', 'Mixing', 'Editing', 'Recording', '音乐', '制作', 'Producer', '发行', 'produced', 'and', 'distributed']
    for stop_word in stop_words:
        if re.fullmatch(r'[A-Za-z]+', stop_word):
            # ASCII-letter lookarounds are used instead of \b because \b
            # treats CJK characters as word characters, which would stop
            # 'Vocal' from matching in text such as '人声Vocal'.
            pattern = r'(?<![A-Za-z])' + re.escape(stop_word) + r'(?![A-Za-z])'
            f = re.sub(pattern, '', f)
        else:
            f = f.replace(stop_word, '')
    return f
def create_word_cloud(f):
    """Build, save and display a word cloud for the given text.

    The text is passed through remove_stop_words, segmented with jieba and
    joined with spaces; the segmented text is printed, rendered with
    WordCloud, written to 'wanglihong_wordcloud.jpg' and shown in a
    matplotlib window.

    Args:
        f: The raw text to visualise.
    """
    print('根据词频,开始生成词云!')

    cleaned = remove_stop_words(f)
    tokens = jieba.cut(cleaned, cut_all=False, HMM=True)
    cut_text = ' '.join(tokens)

    # Rendering options: font file, word limit and canvas size.
    cloud_options = {
        'font_path': './wc.ttf',
        'max_words': 100,
        'width': 2000,
        'height': 1200,
    }
    wc = WordCloud(**cloud_options)

    print(cut_text)
    rendered = wc.generate(cut_text)
    rendered.to_file('wanglihong_wordcloud.jpg')

    # Show the result on screen with the axes hidden.
    plt.imshow(rendered)
    plt.axis('off')
    plt.show()
def get_songs(artist_id):
    """Scrape an artist page and collect the listed song ids and titles.

    Args:
        artist_id: Artist id as a string; it is appended to the artist
            page URL.

    Returns:
        A ``(song_ids, song_names)`` pair of equally long lists, where
        ``song_ids[i]`` and ``song_names[i]`` come from the same link.
    """
    page_url = 'https://music.163.com/artist?id=' + artist_id
    res = requests.request('GET', page_url, headers=headers)
    html = etree.HTML(res.text)

    song_ids = []
    song_names = []
    # Read href and title from the same anchor element. Querying them with
    # two separate XPath expressions and zipping the results mis-pairs
    # every following song as soon as one anchor has no text node (or has
    # more than one).
    for anchor in html.xpath("//*[@id='hotsong-list']//a[@href]"):
        href = anchor.get('href')
        name = ''.join(anchor.xpath('text()'))
        if not name:
            # Anchors without text never contributed a title before either.
            continue
        # Drop the first 9 characters of the href to get the id.
        # NOTE(review): presumably the hrefs look like '/song?id=<id>' --
        # confirm against the live page.
        song_ids.append(href[9:])
        song_names.append(name)
        print(href, ' ', name)
    return song_ids, song_names
# Driver: fetch the song list for one artist, download every lyric and
# render a word cloud from the combined text.
# NOTE(review): '5346' is presumably the artist id of the singer this
# script targets -- confirm.
artist_id = '5346'
song_ids, song_names = get_songs(artist_id)

lyric_parts = []
for song_id, song_name in zip(song_ids, song_names):
    lyric_url = ''.join((
        'http://music.163.com/api/song/lyric?os=pc&id=',
        song_id,
        '&lv=-1&kv=-1&tv=-1',
    ))
    lyric = get_song_lyric(headers, lyric_url)
    lyric_parts.append(lyric)
    print(song_name)

# Each lyric is preceded by a single space in the combined text.
all_word = ''.join(' ' + part for part in lyric_parts)
create_word_cloud(all_word)