Python大作业
内容简介:
用Python来爬取酷狗音乐TOP500的歌曲信息,统计这500首歌曲中出现的所有歌手,并做可视化处理生成词云
实验代码:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from matplotlib import pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import numpy as np
def main():
    """Run the whole pipeline: scrape, rank by frequency, report, draw.

    Fetches the occurrence counts, orders them from most to least frequent,
    prints the number of distinct entries together with the ordered mapping,
    and finally renders the word cloud.
    """
    counts = get_data()
    ranked = process_data(counts)
    print(len(ranked), ranked)
    word_cloud(ranked)
def get_data():
    """Scrape the Kugou ranking pages and tally the '歌手' field of each song.

    Requests ranking pages 1 through 23, pairs every ``.pc_temp_songname``
    element with the ``.pc_temp_time`` element at the same position, prints
    one record per song and counts the record's ``'歌手'`` value through
    ``cnt_songer``.

    Returns:
        dict: Maps each ``'歌手'`` value to the number of records it
        appeared in.

    Raises:
        requests.RequestException: If a page cannot be fetched within the
            timeout or the server replies with an HTTP error status.
    """
    dic = {}
    # Tell the site which browser we claim to be. Built once, because the
    # headers are identical for every page.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/92.0.4577.63 Safari/537.36"}
    for i in range(1, 24):
        urls = 'https://www.kugou.com/yy/rank/home/%d-8888.html?from=rank' % i
        # A timeout keeps a stalled connection from hanging the script, and
        # raise_for_status() stops us from parsing an error page as data.
        html = requests.get(urls, headers=head, timeout=10)
        html.raise_for_status()
        soup = BeautifulSoup(html.text, 'lxml')
        # The same element carries both the title text and the song link.
        song_tags = soup.select('.pc_temp_songname')
        time_tags = soup.select('.pc_temp_time')
        for song_tag, time_tag in zip(song_tags, time_tags):
            text = song_tag.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
            parts = text.split('-')
            if len(parts) < 2:
                # No "X - Y" separator: the record cannot be split into two
                # names, so skip it instead of aborting the whole scrape.
                continue
            # NOTE(review): the text before the first '-' is stored as the
            # song name and the text after it as the singer; confirm this
            # matches the ordering the page actually uses.
            data = {
                '歌曲名称': parts[0].strip(),
                '歌手': parts[1].strip(),
                '歌曲时长': time_tag.get_text().strip().replace('\n', '').replace('\t', '').replace('\r', ''),
                '歌曲链接': song_tag.get('href')}
            print(data)
            cnt_songer(data['歌手'], dic)
        # Pause between page requests so the server is not hammered.
        time.sleep(2)
    return dic
# def data_frame(data_all): # 将数据写入到酷狗TOP500的txt文档中
# df = pd.DataFrame(index=["排行"], columns=["歌曲名称", "歌手", "歌曲时长", "歌曲链接"])
# p = 1
# for j in data_all:
# df.loc[p] = j
# p += 1
# df.to_csv("酷狗TOP500.txt", sep='\t', index=True, header=True)
def cnt_songer(songer, dic):
    """Count one more occurrence of ``songer`` in ``dic`` (modified in place).

    A key that is not present yet starts at 1; an existing key has its
    count increased by 1.

    Args:
        songer: The key to count.
        dic: The frequency dictionary to update.
    """
    dic[songer] = dic.get(songer, 0) + 1
def process_data(dic):
    """Return a new dict with the entries of ``dic`` ordered by descending value.

    Entries with equal values keep the relative order they had in ``dic``.

    Args:
        dic: Mapping from key to count.

    Returns:
        dict: The same key/value pairs, highest value first.
    """
    ranked_keys = sorted(dic, key=lambda name: dic[name], reverse=True)
    return {name: dic[name] for name in ranked_keys}
def word_cloud(items, mask_path=r'./tree.jpg', font_path=r"C:\Windows\Fonts\msyh.ttc",
               out_path="酷狗TOP500词云.png"):
    """Render a frequency mapping as a word cloud, display it and save it.

    Args:
        items: Mapping from word to frequency, handed to
            ``WordCloud.generate_from_frequencies``.
        mask_path: Path of the template image whose pixel array is passed
            to ``WordCloud`` as ``mask``.
        font_path: Path of the font file passed to ``WordCloud``. The
            default is a Windows system font path, so other platforms need
            to supply their own.
        out_path: File the rendered word cloud is written to.
    """
    # Copy the pixels into an array inside the ``with`` block so the image
    # file handle is released as soon as the data has been read.
    with Image.open(mask_path) as img:
        imgarr = np.array(img)
    wc = WordCloud(
        background_color='white',
        mask=imgarr,
        # The default is a raw string: the backslashes of the Windows path
        # must not be read as escape sequences.
        font_path=font_path,
        scale=15
    )
    wc.generate_from_frequencies(items)
    plt.figure(1)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    wc.to_file(out_path)
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
最后生成的词云:
词云样式:
注:本代码仅用于提交实验大作业,其中可能存在一些 bug,代码格式也可能不够规范。