用空间说说做词云,有趣好玩,颜值爆表
哈喽大家好,我跟大家分享Python的一个有趣玩法:用QQ空间的说说做词云。
材料准备
首先我们准备好看的血小板一只。
然后准备好Python3。
准备selenium、requests、jieba、wordcloud、matplotlib等Python包。
- 打开终端或cmd,运行:
pip install selenium
pip install requests
pip install jieba
pip install wordcloud
pip install matplotlib
准备Chrome浏览器一只。
准备chromedriver一只(https://npm.taobao.org/mirrors/chromedriver/2.41/),下载解压后放到喜欢的路径即可。
准备QQ号一只。
开始制作
下一步我们开始制作。
首先我们导入Python包。
# -*- coding: utf-8 -*-
from selenium import webdriver
import requests
import re
import jieba.analyse
import json
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from pylab import imread, plt
import random
导入不成功的同学需要重新pip安装对应的包。
加入请求头和你自己的账号密码。
# HTTP headers sent with every request issued by get_words().
headers = {'User-Agent': 'app of oo 1.0'}
# QQ account number and password typed into the login form by login();
# `user` is also sent as the `uin` query parameter in get_words().
# The values below are masked placeholders - replace them with your own.
user = '13541*****'
password = '**********'
然后使用selenium模拟登录QQ空间,拿到cookies和qzonetoken。
def login():
    """Log in to QZone with Selenium and collect the session credentials.

    Drives Chrome through the password login form inside the ``login_frame``
    iframe, opens the account's main page, clicks the posts tab and reads the
    ``window.g_qzonetoken`` snippet out of the page source.

    Uses the module-level ``user`` and ``password`` values.

    Returns:
        tuple: ``(cookie_dict, qzonetoken)`` where ``cookie_dict`` is the
        browser's cookies converted by get_cookie() and ``qzonetoken`` is the
        first double-quoted string inside the matched snippet.

    Raises:
        RuntimeError: if the qzonetoken snippet is not present in the page.
    """
    driver = webdriver.Chrome(executable_path='D:/chromedriver.exe')
    try:
        # Let element lookups poll for up to 10 s instead of failing at once
        # while the page is still loading.
        driver.implicitly_wait(10)
        # Log in
        driver.get('https://qzone.qq.com/')
        driver.switch_to.frame('login_frame')
        driver.find_element_by_id('switcher_plogin').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys(user)
        driver.find_element_by_id('p').clear()
        driver.find_element_by_id('p').send_keys(password)
        driver.find_element_by_id('login_button').click()
        # Go to the account's main page
        driver.get('https://user.qzone.qq.com/{}/main'.format(user))
        # Open the posts tab
        driver.find_element_by_id('QM_Profile_Mood_A').click()
        html = driver.page_source
        token_match = re.search(
            r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
            html)
        if token_match is None:
            raise RuntimeError(
                'g_qzonetoken not found in page source; the login probably failed')
        qzonetoken = token_match.group(0).split('"')[1]
        return get_cookie(driver.get_cookies()), qzonetoken
    finally:
        # Always shut the browser down, even when a step above raised.
        driver.quit()
其中,组装cookie的方法如下。
def get_cookie(cookies):
    """Flatten a list of cookie records into a ``{name: value}`` dict.

    Args:
        cookies: iterable of mappings that each have ``'name'`` and
            ``'value'`` keys (login() passes ``driver.get_cookies()``).

    Returns:
        dict: cookie name mapped to cookie value; a later record with the
        same name overwrites an earlier one.
    """
    return {entry['name']: entry['value'] for entry in cookies}
然后,获取g_tk。
def get_g_tk(cookie):
    """Compute the ``g_tk`` value from the ``p_skey`` cookie.

    Starts from 5381 and, for every character of ``cookie['p_skey']``,
    multiplies the running value by 33 and adds the character's code point;
    the result is masked to its low 31 bits.

    Args:
        cookie: mapping that contains a ``'p_skey'`` string.

    Returns:
        int: the hash, in the range ``0..0x7fffffff``.
    """
    acc = 5381
    for code in map(ord, cookie['p_skey']):
        # acc + (acc << 5) is the same as acc * 33.
        acc = acc * 33 + code
    return acc & 0x7fffffff
下一步我们开始抓取说说。
def get_words(cookies, g_tk, qzonetoken):
    """Fetch all posts of the configured account and return their cleaned text.

    Pages through the ``emotion_cgi_msglist_v6`` endpoint 20 posts at a time,
    unwraps the ``_preloadCallback(...)`` JSONP envelope, and removes newlines,
    ``[em]...[/em]`` emoji markup and ``@{...}`` mentions from each post.

    Uses the module-level ``user`` (sent as ``uin``) and ``headers``.

    Args:
        cookies: cookie ``{name: value}`` dict of the logged-in session.
        g_tk: value sent as the ``g_tk`` query parameter.
        qzonetoken: value sent as the ``qzonetoken`` query parameter.

    Returns:
        list[str]: one cleaned string per post, in the order received.

    Raises:
        ValueError: if a response is not wrapped in ``_preloadCallback(...)``.
    """
    page_size = 20
    url = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6'
    words = []
    # The context manager closes the session's connections when we are done.
    with requests.session() as session:
        page = 0
        while True:
            params = {
                'uin': user,
                'pos': page * page_size,
                'num': page_size,
                'replynum': 100,
                'g_tk': [g_tk, g_tk],
                'callback': '_preloadCallback',
                'code_version': '1',
                'format': 'jsonp',
                'need_private_comment': '1',
                'qzonetoken': qzonetoken
            }
            # A timeout keeps a stalled connection from hanging the script forever.
            respond = session.get(url, params=params, headers=headers,
                                  cookies=cookies, timeout=30)
            # re.S lets the payload span several lines.
            envelope = re.search(r'^_preloadCallback\((.*?)\);$',
                                 respond.text.strip(), re.S)
            if envelope is None:
                raise ValueError('unexpected response, not _preloadCallback(...): {!r}'
                                 .format(respond.text[:200]))
            data_dict = json.loads(envelope.group(1))
            # 'msglist' may be absent or null (presumably when there is nothing
            # left to return - TODO confirm); treat that as an empty page.
            msg_list = data_dict.get('msglist') or []
            for msg in msg_list:
                word = (msg.get('content') or '').replace('\n', '')
                # Remove emoji markup
                word = re.sub(r'\[em\].*?\[/em\]', '', word)
                # Remove @mentions
                word = re.sub(r'@\{.*?\}', '', word)
                words.append(word)
            # A short page means this was the last one.
            if len(msg_list) < page_size:
                return words
            page += 1
然后用结巴分词抽取关键词,结巴分词是比较流行的一个中文分词包,尽管对专业领域的分词效果尚待商榷,但对于一般业余玩家而言已经足够。取关键词的数量没有限制,想要更多词的同学可以多拿,但一般来说取关键词数量越多,划分粒度越细,关键词的“关键性”越低。
def get_keywords(words_list):
    """Extract keywords from every post and join them into one string.

    Args:
        words_list: iterable of post texts.

    Returns:
        str: all extracted keywords in random order, separated by spaces.
    """
    collected = []
    for text in words_list:
        # Ask for about one keyword per five characters, and never fewer than one.
        limit = round(len(text) / 5) + 1
        collected.extend(jieba.analyse.extract_tags(text, topK=limit))
    # Shuffle so that keywords from the same post are not kept together.
    random.shuffle(collected)
    return ' '.join(collected)
最后我们制作词云。我们加入血小板图片一张作为背景,不喜欢血小板的同学可以不加。
值得注意的是,wordcloud默认使用的字体不支持中文,中文会显示成方框,因此需要设置词云的字体路径,如本人使用win10的微软雅黑字体,则设为C:/Windows/Fonts/msyh.ttc。
def make_cloud(keywords_str):
    """Draw a word cloud of the keywords using the mask image and show it.

    The image ``xuexiaoban.jpg`` is passed to WordCloud as the mask and is
    also the source of the colours applied by ``recolor``.

    Args:
        keywords_str: space-separated keywords, e.g. from get_keywords().
    """
    # Background mask
    color_mask = imread("xuexiaoban.jpg")
    # Work on a copy of the stock stop words: set.add() returns None, so
    # passing STOPWORDS.add(...) directly hands WordCloud stopwords=None and
    # mutates the shared STOPWORDS set as a side effect.
    stopwords = set(STOPWORDS)
    stopwords.add("said")
    wc = WordCloud(
        font_path="C:/Windows/Fonts/msyh.ttc",
        background_color="white",
        max_words=2000,
        stopwords=stopwords,
        mask=color_mask,
        max_font_size=200,
        random_state=100
    )
    image_colors = ImageColorGenerator(color_mask)
    word_cloud = wc.generate(keywords_str)
    plt.imshow(word_cloud.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()
一个有趣好玩,颜值爆表的词云就完成了。
技术总结
- 使用selenium模拟登录,并取得cookie、qzonetoken和g_tk。
- 用requests和正则表达式抓取说说。
- 用jieba抽取关键词。
- 用wordcloud制作词云并用matplotlib显示。不喜欢血小板的同学可以不加,喜欢凤姐的同学也可以用凤姐替代。
用python制作说说词云的技术总结完毕。
def main():
    """Run the whole pipeline: log in, download posts, extract keywords, draw."""
    cookie_dict, token = login()
    posts = get_words(cookie_dict, get_g_tk(cookie_dict), token)
    make_cloud(get_keywords(posts))


if __name__ == '__main__':
    main()
参考文献
[3] 美食作家王刚
附完整代码
# -*- coding: utf-8 -*-
from selenium import webdriver
import requests
import re
import jieba.analyse
import json
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from pylab import imread, plt
import random
# HTTP headers sent with every request issued by get_words().
headers = {'User-Agent': 'app of oo 1.0'}
# QQ account number and password typed into the login form by login();
# `user` is also sent as the `uin` query parameter in get_words().
# The values below are masked placeholders - replace them with your own.
user = '13541*****'
password = '**********'
def get_g_tk(cookie):
    """Compute the ``g_tk`` value from the ``p_skey`` cookie.

    Starts from 5381 and, for every character of ``cookie['p_skey']``,
    multiplies the running value by 33 and adds the character's code point;
    the result is masked to its low 31 bits.

    Args:
        cookie: mapping that contains a ``'p_skey'`` string.

    Returns:
        int: the hash, in the range ``0..0x7fffffff``.
    """
    acc = 5381
    for code in map(ord, cookie['p_skey']):
        # acc + (acc << 5) is the same as acc * 33.
        acc = acc * 33 + code
    return acc & 0x7fffffff
def get_cookie(cookies):
    """Flatten a list of cookie records into a ``{name: value}`` dict.

    Args:
        cookies: iterable of mappings that each have ``'name'`` and
            ``'value'`` keys (login() passes ``driver.get_cookies()``).

    Returns:
        dict: cookie name mapped to cookie value; a later record with the
        same name overwrites an earlier one.
    """
    return {entry['name']: entry['value'] for entry in cookies}
def login():
    """Log in to QZone with Selenium and collect the session credentials.

    Drives Chrome through the password login form inside the ``login_frame``
    iframe, opens the account's main page, clicks the posts tab and reads the
    ``window.g_qzonetoken`` snippet out of the page source.

    Uses the module-level ``user`` and ``password`` values.

    Returns:
        tuple: ``(cookie_dict, qzonetoken)`` where ``cookie_dict`` is the
        browser's cookies converted by get_cookie() and ``qzonetoken`` is the
        first double-quoted string inside the matched snippet.

    Raises:
        RuntimeError: if the qzonetoken snippet is not present in the page.
    """
    driver = webdriver.Chrome(executable_path='D:/chromedriver.exe')
    try:
        # Let element lookups poll for up to 10 s instead of failing at once
        # while the page is still loading.
        driver.implicitly_wait(10)
        # Log in
        driver.get('https://qzone.qq.com/')
        driver.switch_to.frame('login_frame')
        driver.find_element_by_id('switcher_plogin').click()
        driver.find_element_by_id('u').clear()
        driver.find_element_by_id('u').send_keys(user)
        driver.find_element_by_id('p').clear()
        driver.find_element_by_id('p').send_keys(password)
        driver.find_element_by_id('login_button').click()
        # Go to the account's main page
        driver.get('https://user.qzone.qq.com/{}/main'.format(user))
        # Open the posts tab
        driver.find_element_by_id('QM_Profile_Mood_A').click()
        html = driver.page_source
        token_match = re.search(
            r'window\.g_qzonetoken = \(function\(\)\{ try\{return (.*?);\} catch\(e\)',
            html)
        if token_match is None:
            raise RuntimeError(
                'g_qzonetoken not found in page source; the login probably failed')
        qzonetoken = token_match.group(0).split('"')[1]
        return get_cookie(driver.get_cookies()), qzonetoken
    finally:
        # Always shut the browser down, even when a step above raised.
        driver.quit()
def get_keywords(words_list):
    """Extract keywords from every post and join them into one string.

    Args:
        words_list: iterable of post texts.

    Returns:
        str: all extracted keywords in random order, separated by spaces.
    """
    collected = []
    for text in words_list:
        # Ask for about one keyword per five characters, and never fewer than one.
        limit = round(len(text) / 5) + 1
        collected.extend(jieba.analyse.extract_tags(text, topK=limit))
    # Shuffle so that keywords from the same post are not kept together.
    random.shuffle(collected)
    return ' '.join(collected)
def make_cloud(keywords_str):
    """Draw a word cloud of the keywords using the mask image and show it.

    The image ``xuexiaoban.jpg`` is passed to WordCloud as the mask and is
    also the source of the colours applied by ``recolor``.

    Args:
        keywords_str: space-separated keywords, e.g. from get_keywords().
    """
    # Background mask
    color_mask = imread("xuexiaoban.jpg")
    # Work on a copy of the stock stop words: set.add() returns None, so
    # passing STOPWORDS.add(...) directly hands WordCloud stopwords=None and
    # mutates the shared STOPWORDS set as a side effect.
    stopwords = set(STOPWORDS)
    stopwords.add("said")
    wc = WordCloud(
        font_path="C:/Windows/Fonts/msyh.ttc",
        background_color="white",
        max_words=2000,
        stopwords=stopwords,
        mask=color_mask,
        max_font_size=200,
        random_state=100
    )
    image_colors = ImageColorGenerator(color_mask)
    word_cloud = wc.generate(keywords_str)
    plt.imshow(word_cloud.recolor(color_func=image_colors))
    plt.axis('off')
    plt.show()
def get_words(cookies, g_tk, qzonetoken):
    """Fetch all posts of the configured account and return their cleaned text.

    Pages through the ``emotion_cgi_msglist_v6`` endpoint 20 posts at a time,
    unwraps the ``_preloadCallback(...)`` JSONP envelope, and removes newlines,
    ``[em]...[/em]`` emoji markup and ``@{...}`` mentions from each post.

    Uses the module-level ``user`` (sent as ``uin``) and ``headers``.

    Args:
        cookies: cookie ``{name: value}`` dict of the logged-in session.
        g_tk: value sent as the ``g_tk`` query parameter.
        qzonetoken: value sent as the ``qzonetoken`` query parameter.

    Returns:
        list[str]: one cleaned string per post, in the order received.

    Raises:
        ValueError: if a response is not wrapped in ``_preloadCallback(...)``.
    """
    page_size = 20
    url = 'https://h5.qzone.qq.com/proxy/domain/taotao.qq.com/cgi-bin/emotion_cgi_msglist_v6'
    words = []
    # The context manager closes the session's connections when we are done.
    with requests.session() as session:
        page = 0
        while True:
            params = {
                'uin': user,
                'pos': page * page_size,
                'num': page_size,
                'replynum': 100,
                'g_tk': [g_tk, g_tk],
                'callback': '_preloadCallback',
                'code_version': '1',
                'format': 'jsonp',
                'need_private_comment': '1',
                'qzonetoken': qzonetoken
            }
            # A timeout keeps a stalled connection from hanging the script forever.
            respond = session.get(url, params=params, headers=headers,
                                  cookies=cookies, timeout=30)
            # re.S lets the payload span several lines.
            envelope = re.search(r'^_preloadCallback\((.*?)\);$',
                                 respond.text.strip(), re.S)
            if envelope is None:
                raise ValueError('unexpected response, not _preloadCallback(...): {!r}'
                                 .format(respond.text[:200]))
            data_dict = json.loads(envelope.group(1))
            # 'msglist' may be absent or null (presumably when there is nothing
            # left to return - TODO confirm); treat that as an empty page.
            msg_list = data_dict.get('msglist') or []
            for msg in msg_list:
                word = (msg.get('content') or '').replace('\n', '')
                # Remove emoji markup
                word = re.sub(r'\[em\].*?\[/em\]', '', word)
                # Remove @mentions
                word = re.sub(r'@\{.*?\}', '', word)
                words.append(word)
            # A short page means this was the last one.
            if len(msg_list) < page_size:
                return words
            page += 1
def main():
    """Run the whole pipeline: log in, download posts, extract keywords, draw."""
    cookie_dict, token = login()
    posts = get_words(cookie_dict, get_g_tk(cookie_dict), token)
    make_cloud(get_keywords(posts))


if __name__ == '__main__':
    main()