引语
记录学习路程,抛砖引玉。如有更好的算法或者出现错误,欢迎指点。
词云介绍
Python提供了词云工具WordCloud,使用pip install wordcloud安装后,就可以创建一个词云,构造方法如下:
wc = WordCloud(
background_color='white', # background color of the canvas
mask=backgroud_Image, # mask image that shapes the cloud (NOTE(review): variable name is misspelled "backgroud" — define it before use)
font_path='./SimHei.ttf', # font file; a CJK font is required, otherwise Chinese text renders as garbage
max_words=100, # maximum number of words shown
stopwords=STOPWORDS, # words excluded from the cloud
max_font_size=150, # largest font size used
width=2000, # canvas width in pixels
height=1200, # canvas height in pixels
random_state=30 # number of random color states, i.e. how many colors
)
创建好WordCloud类之后,就可以使用wordcloud.generate(text)方法生成词云,传入的参数text代表你要分析的文本,最后使用wordcloud.to_file("a.jpg")函数,将得到的词云图像直接保存为图片格式文件。
可以使用Python的可视化工具Matplotlib进行显示,方法如下:
import matplotlib.pyplot as plt
# Render the generated word cloud image and hide the axes.
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
需要注意的是,我们不需要显示X轴和Y轴的坐标,使用plt.axis("off")可以将坐标轴关闭。
实例
在爬虫No.1的案例上进行扩展,即生成陈奕迅的词云。案例如下:
代码
import requests
import sys
import re
import os
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
import jieba
from PIL import Image
import numpy as np
from lxml import etree
# HTTP request headers sent with every NetEase Cloud Music request.
# NOTE(review): the Referer/Host pair presumably helps avoid the site's
# anti-scraping checks — confirm against current music.163.com behavior.
headers = {
'Referer': 'http://music.163.com',
'Host': 'music.163.com',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'User-Agent': 'Chrome/10'
}
def get_song_lyric(headers, lyric_url):
    """Fetch one song's lyric text from the NetEase lyric API.

    Args:
        headers: dict of HTTP headers to send with the request.
        lyric_url: full lyric-API URL for a single song.

    Returns:
        The lyric with timestamp characters (digits, ':', '.', '[', ']')
        stripped out, or '' when the response carries no 'lrc' field
        (e.g. instrumental tracks).
    """
    res = requests.request('GET', lyric_url, headers=headers)
    data = res.json()
    if 'lrc' in data:
        lyric = data['lrc']['lyric']
        # Strip the "[mm:ss.xx]" timestamps. NOTE: this also removes any
        # digit/colon/dot appearing inside the lyric text itself.
        return re.sub(r'[\d:.[\]]', '', lyric)
    # Fix: the original had print(res.json()) AFTER this return, which was
    # unreachable dead code; it has been removed.
    return ''
def remove_stop_words(f):
    """Strip songwriting-credit stop words from lyric text.

    Args:
        f: raw lyric string.

    Returns:
        The string with every stop word ("作词", "作曲", "编曲") removed.
    """
    # Fix: the original wrote `for stop_words in stop_words:`, shadowing the
    # set with the loop variable — confusing even though it happened to work.
    # The original set also contained '' — str.replace('', '') is a no-op,
    # so dropping it preserves behavior exactly.
    stop_words = ("作词", "作曲", "编曲")
    for word in stop_words:
        f = f.replace(word, '')
    return f
def create_word_cloud(f):
    """Segment the text with jieba, build a word cloud, save and display it.

    Args:
        f: the full lyric corpus as one string.

    Side effects:
        Writes "wordcloud.jpg" into the working directory and opens a
        matplotlib window showing the rendered image.
    """
    print('根据词频,开始生成词云!')
    f = remove_stop_words(f)
    # Precise-mode segmentation; WordCloud expects space-separated tokens.
    cut_text = " ".join(jieba.cut(f, cut_all=False, HMM=True))
    wc = WordCloud(
        # Fix: the original used a non-raw string "C:\Windows\Fonts\..."
        # whose \W, \F, \m are invalid escape sequences (DeprecationWarning
        # on Python 3, and fragile). A raw string is the correct form.
        font_path=r"C:\Windows\Fonts\msyhbd.ttc",
        max_words=100,
        width=2000,
        height=1200,
    )
    print(cut_text)
    wordcloud = wc.generate(cut_text)
    # Save the rendered cloud to disk.
    wordcloud.to_file("wordcloud.jpg")
    # Display the word cloud with the axes hidden.
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def get_songs(artist_id):
    """Scrape the hot-song ids and titles from an artist's NetEase page.

    Args:
        artist_id: artist id string, e.g. '2116'.

    Returns:
        (song_ids, song_names): two parallel lists, one id and one title
        per song link found in the hot-song list.
    """
    page_url = 'https://music.163.com/artist?id=' + artist_id
    res = requests.request('GET', page_url, headers=headers)
    html = etree.HTML(res.text)
    anchors = html.xpath("//*[@id='hotsong-list']//a/@href")
    titles = html.xpath("//*[@id='hotsong-list']//a/text()")
    ids, names = [], []
    for anchor, title in zip(anchors, titles):
        print(anchor, ' ', title)
        # Each href looks like '/song?id=XXXX'; the id starts at offset 9.
        ids.append(anchor[9:])
        names.append(title)
    return ids, names
# Artist id on NetEase Cloud Music; Eason Chan (陈奕迅) is 2116.
artist_id = '2116'
song_ids, song_names = get_songs(artist_id)
# Collect every song's lyric; each piece is prefixed by a single space,
# matching the original string-concatenation output exactly.
lyric_parts = []
for song_id, song_name in zip(song_ids, song_names):
    # Per-song lyric API endpoint (lv/kv/tv=-1 requests all lyric variants).
    lyric_url = 'http://music.163.com/api/song/lyric?os=pc&id=' + song_id + '&lv=-1&kv=-1&tv=-1'
    lyric_parts.append(get_song_lyric(headers, lyric_url))
    print(song_name)
all_word = ''.join(' ' + part for part in lyric_parts)
# Build the word cloud from the combined lyrics.
create_word_cloud(all_word)
词云
生成词云: