Python爬虫爬取QQ音乐巅峰榜热歌歌词,jieba中文分词,词云展示

先看结果

1、获取列表页信息,url为https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?tpl=3&page=detail&date=2019_02&topid=26&type=top&song_begin=0&song_num=30&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0

json样式为:

2、获取详情页

# Headers sent with the lyric request.
# NOTE(review): "authority", "method", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from a browser's network panel; here they are passed
# as ordinary header fields -- confirm whether the endpoint needs them.
headers = {
    "authority": "c.y.qq.com",
    "method": "GET",
    "path": "/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid=225716644&-=jsonp1&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0",
    "scheme": "https",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "pgv_pvi=5936793600; pt2gguin=o1952436511; RK=g+4hNa7BQD; ptcz=653047c5b0174eb6b929c242110d08693b9dfcbaa701ddbf37ccc23c3366b94c; pgv_pvid=9049425500; ts_uid=9851761599; o_cookie=1952436511; tvfe_boss_uuid=5e81ff5fb8d5a1ea; yqq_stat=0; pgv_info=ssid=s484511232; ts_refer=ADTAGbaiduald; pgv_si=s21197824; yq_index=0; player_exist=1; qqmusic_fromtag=66; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "origin": "https://y.qq.com",
    "referer": "https://y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# `item` is one entry of the chart's song list; its song id is the only
# query parameter that is not a fixed string.
song_id = item['data']['songid']
jsond = {
    "nobase64": "1",
    "musicid": song_id,
    "-": "jsonp1",
    "g_tk": "5381",
    "loginUin": "0",
    "hostUin": "0",
    "format": "json",
    "inCharset": "utf8",
    "outCharset": "utf-8",
    "notice": "0",
    "platform": "yqq.json",
    "needNewCode": "0"
}

# Fetch the lyric payload for this song.
lyric_url = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg"
r = requests.get(lyric_url, headers=headers, params=jsond)

json样式为:

3、将歌词存到文件test.txt里,用于读取。

4、逐行读取文件、构建要处理的数据字符串

5、jieba库、词云制作。

上爬虫代码:

# -*-coding:UTF-8 -*-

import json
import re
import requests

# Headers sent with every lyric request.
# NOTE(review): "authority", "method", "path" and "scheme" look like HTTP/2
# pseudo-headers copied from a browser's network panel; here they are passed
# as ordinary header fields -- confirm whether the endpoint needs them.
# NOTE(review): the cookie is a hard-coded session value; avoid publishing
# real session cookies.
headers = {
    "authority": "c.y.qq.com",
    "method": "GET",
    "path": "/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid=225716644&-=jsonp1&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0",
    "scheme": "https",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "pgv_pvi=5936793600; pt2gguin=o1952436511; RK=g+4hNa7BQD; ptcz=653047c5b0174eb6b929c242110d08693b9dfcbaa701ddbf37ccc23c3366b94c; pgv_pvid=9049425500; ts_uid=9851761599; o_cookie=1952436511; tvfe_boss_uuid=5e81ff5fb8d5a1ea; yqq_stat=0; pgv_info=ssid=s484511232; ts_refer=ADTAGbaiduald; pgv_si=s21197824; yq_index=0; player_exist=1; qqmusic_fromtag=66; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "origin": "https://y.qq.com",
    "referer": "https://y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
}

# Query parameters for the chart (song list) request.
jsonlist = {
    "tpl": "3",
    "page": "detail",
    "date": "2019_02",
    "topid": "26",
    "type": "top",
    "song_begin": "0",
    "song_num": "100",
    "g_tk": "5381",
    "loginUin": "0",
    "hostUin": "0",
    "format": "json",
    "inCharset": "utf8",
    "outCharset": "utf-8",
    "notice": "0",
    "platform": "yqq.json",
    "needNewCode": "0"
}

TOPLIST_URL = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg"
LYRIC_URL = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg"

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would hang the crawl indefinitely.
REQUEST_TIMEOUT = 10

# Runs of CJK characters (U+4E00..U+9FA5) and ':'; compiled once instead of
# once per song.
_CH_PAT = re.compile(r'[\u4e00-\u9fa5:]+')


def extract_lyric_text(response_text):
    """Return the lyric words found in a lyric response as one string.

    The text is split into tokens made of CJK characters and ':'.  The
    starting token is then chosen in two steps, both limited to the first
    half of the token list:

    1. find the first token (index >= 1) that has a ':' after its first
       character;
    2. from there, find the first position where three consecutive tokens
       contain no ':' at all.

    Every colon-free token from that position on is emitted, each followed
    by ','.  If a step finds nothing, the search falls back to the previous
    position (ultimately index 0).

    NOTE(review): presumably this skips title/credit tokens ahead of the
    lyric body -- confirm against real responses.
    """
    words = _CH_PAT.findall(response_text)
    half = len(words) // 2

    first = 0
    for i in range(1, half):
        if words[i].find(':') > 0:
            first = i
            break

    start = first
    for i in range(first, half):
        window = words[i:i + 3]
        # Slicing cannot run past the end of the list; the earlier
        # words[i + 2] lookup raised IndexError for a two-token response.
        if len(window) == 3 and not any(':' in w for w in window):
            start = i
            break

    return "".join(w + "," for w in words[start:] if ':' not in w)


def crawl_lyrics():
    """Fetch the chart, then append one line of lyric words per song to test.txt."""
    r1 = requests.get(TOPLIST_URL, params=jsonlist, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of an opaque JSON decoding error.
    r1.raise_for_status()
    jlist = json.loads(r1.text)

    # NOTE(review): the file uses the platform default encoding, as does the
    # script that later reads test.txt; change both together if at all.
    # 'a+' appends, so repeated runs accumulate lines in the same file.
    with open('test.txt', 'a+') as f:
        for item in jlist['songlist']:
            jsond = {
                "nobase64": "1",
                "musicid": item['data']['songid'],
                "-": "jsonp1",
                "g_tk": "5381",
                "loginUin": "0",
                "hostUin": "0",
                "format": "json",
                "inCharset": "utf8",
                "outCharset": "utf-8",
                "notice": "0",
                "platform": "yqq.json",
                "needNewCode": "0"
            }
            r = requests.get(LYRIC_URL, params=jsond, headers=headers,
                             timeout=REQUEST_TIMEOUT)
            r.encoding = "utf-8"
            strquqita = extract_lyric_text(r.text)
            f.write(strquqita + "\n")
            print(strquqita)


if __name__ == "__main__":
    crawl_lyrics()
上词云代码:

 

#-*-coding:UTF-8 -*-
import jieba
from wordcloud import WordCloud
# Raw string: the plain literal contained the invalid escapes "\W", "\F" and
# "\s", which newer Python versions warn about. The resulting value is the same.
FONT_PATH = r'C:\Windows\Fonts\simkai.ttf'


def build_corpus(lines):
    """Join lyric lines into the text that is handed to the tokenizer.

    The first line is discarded and every remaining line (including its
    trailing newline, if any) is followed by a full stop "。".

    NOTE(review): the reason for dropping the first line is not visible
    here -- confirm it is intentional.

    Args:
        lines: an iterable of text lines, e.g. an open text file.

    Returns:
        The concatenated text; "" when there are fewer than two lines.
    """
    remaining = iter(lines)
    next(remaining, None)
    return "".join(line + "。" for line in remaining)


def render_word_cloud():
    """Segment the text in test.txt, save a word cloud to jieguo.png and show it."""
    import matplotlib.pyplot as plt

    # NOTE(review): read with the platform default encoding, matching how the
    # crawler script writes test.txt.
    with open('test.txt', 'r') as f:
        strchuli = build_corpus(f)

    wordlist = jieba.cut(strchuli, cut_all=False)
    word_string = " ".join(wordlist)
    wordcloud = WordCloud(
        font_path=FONT_PATH,
        background_color="white",
        width=1000,
        height=860,
        margin=2,
    ).generate(word_string)

    # Save before showing so the image is written even if the interactive
    # window cannot be opened.
    wordcloud.to_file('jieguo.png')

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()


if __name__ == "__main__":
    render_word_cloud()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值