前言
本文介绍如何使用Python对微博评论数据进行分析。首先看一下评论数据的结构和字段,如下图:
字段有微博的发布时间、微博内容、认证类型、所属IP地址等。
看效果
1、词云图
分别用wordcloud库和pyecharts绘制词云图,效果分别如下:
2、高频词条形图
3、情感分析饼图
4、认证类型统计
5、微博发布者省份占比饼图
撸代码
主要使用pyecharts和matplotlib绘制词云图、高频词条形图、情感分析饼状图、认证类型统计条形图、微博发布者所属省份占比以及词频统计等。直接上完整代码:
# Standard library
import os
import re
from collections import Counter

# Third-party: data wrangling
import numpy as np
import pandas as pd

# Third-party: NLP / segmentation / sentiment
import jieba
import jieba.posseg as pseg
from snownlp import SnowNLP

# Third-party: imaging & plotting
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
from imageio.v2 import imread
from wordcloud import WordCloud as WC

# Third-party: pyecharts charts
from pyecharts import options as opts
from pyecharts.charts import Bar, Page, Pie, WordCloud
from pyecharts.faker import Faker
from pyecharts.globals import SymbolType, ThemeType
matplotlib.use('TkAgg')  # select the TkAgg GUI backend before any figure is drawn
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # show the minus sign correctly when a CJK font is active
class WeiboAnalysis:
    """Analyse Weibo comment data: sentiment, word frequency, and charts.

    Reads a CSV export of Weibo comments (columns used: 微博内容 = comment
    text, 认证类型 = verification type, 所属IP地址 = poster IP location,
    情感分析 = sentiment score added by :meth:`emotionAnalysis`) and renders
    word clouds, bar charts and pie charts into the ``./data`` directory.
    """

    def __init__(self, filename):
        """Load the CSV, clean missing values and prepare shared resources.

        :param filename: path to the Weibo comment CSV file.
        """
        self.df = pd.read_csv(filename)
        # Missing verification type -> '无认证' ("not verified").
        self.df.fillna(value={'认证类型': '无认证'}, inplace=True)
        self.df.dropna(subset=['微博内容'], inplace=True)  # drop rows without comment text
        # Stop-word list (HIT stop-word table). Use `with` so the file handle
        # is closed promptly instead of leaking as in the original version.
        with open('./files/哈工大停用词表.txt', encoding='utf8') as f:
            self.stop_list = [line.strip() for line in f]
        # All comments concatenated, used as the word-cloud / segmentation input.
        self.text = ''.join(self.df['微博内容'])
        self.jpg = imread('./files/Background.jpg')  # word-cloud mask image

    def emotionAnalysis(self):
        """Score every comment with SnowNLP into a new '情感分析' column.

        Sentiment is a float in [0, 1]: > 0.5 positive, < 0.5 negative,
        == 0.5 neutral.
        """
        emotion_list = []
        for idx, comment in enumerate(self.df['微博内容']):
            print(f'正在对第{idx}条数据情感分析...')
            emotion_list.append(SnowNLP(str(comment)).sentiments)
        self.df['情感分析'] = emotion_list

    def cut_words(self):
        """Tokenise ``self.text`` with jieba and return kept tokens joined by ','.

        Digits are stripped from every token; empty tokens, single-character
        tokens (this also drops a lone space) and stop-words are discarded.
        """
        digit_table = str.maketrans('', '', '0123456789')  # built once, not per token
        stop_words = set(self.stop_list)  # O(1) membership instead of a list scan
        kept = []
        for seg in jieba.lcut(self.text, cut_all=False):
            seg = seg.translate(digit_table)
            if not seg:
                continue
            # len(seg) != 1 also excludes the single-space token the original
            # checked for separately.
            if seg not in stop_words and len(seg) != 1:
                kept.append(seg)
        return ','.join(kept)

    def word_frequency(self, cloud_text):
        """Count token frequencies, save them to CSV, and return them.

        :param cloud_text: comma-joined tokens (output of :meth:`cut_words`).
        :return: list of (word, count) pairs with count >= 2, sorted by count
            descending (ties keep first-occurrence order, as before).
        """
        counts = Counter(cloud_text.split(','))
        # most_common() sorts by count descending (stable), matching the
        # original manual dict + sort; then drop words seen fewer than twice.
        frequency_result = [(word, count) for word, count in counts.most_common() if count >= 2]
        pd.DataFrame(frequency_result, columns=['词', '词频']).to_csv(
            './data/词频统计.csv', encoding='utf-8-sig', index=False)
        return frequency_result

    def provide_data(self, frequency_result):
        """Split (word, count) pairs into parallel word/count lists for charting."""
        x_data = [word for word, _ in frequency_result]
        y_data = [count for _, count in frequency_result]
        return x_data, y_data

    def draw_wordcloud(self, xdata, ydata, cloud_text):
        """Render a wordcloud-library cloud of ``cloud_text`` to ./data/WordCloud.jpg.

        Note: ``xdata``/``ydata`` are accepted for signature compatibility but
        are not used by the wordcloud library, which tokenises ``cloud_text``
        itself.
        """
        wordcloud = WC(
            mask=self.jpg,
            background_color="white",
            font_path='./files/msyh.ttf',
            width=1600,
            height=1200,
            margin=20,
            max_words=50,
            max_font_size=300,
            min_font_size=20
        ).generate(cloud_text)
        plt.figure(figsize=(15, 9))
        plt.imshow(wordcloud)
        plt.axis("off")  # hide the axes around the image
        plt.savefig("./data/WordCloud.jpg")
        plt.close()  # release the figure so repeated runs don't accumulate figures

    def draw_wordcloud_pyecharts(self):
        """Render a pyecharts word cloud of the top-50 terms to ./data/wordcloud.html."""
        df_cipin = pd.read_csv('./data/词频统计.csv')
        (
            WordCloud(init_opts=opts.InitOpts(
                theme=ThemeType.ESSOS,
                # entry animation settings
                animation_opts=opts.AnimationOpts(
                    animation_delay=1000,
                    animation_easing="elasticOut"),
                # canvas size
                width='1500px',
                height='1000px',
            ))
            .add("", df_cipin.values[:50], word_size_range=[20, 100])
            .set_global_opts(title_opts=opts.TitleOpts(title="词云图"))
            .render("./data/wordcloud.html")
        )

    def data_prepossing(self, comment):
        """Strip punctuation/symbols, emoji and trailing URLs from one comment.

        Not invoked by :meth:`run` (it was only referenced from a commented-out
        line in ``cut_words``); kept for manual pre-processing.

        :param comment: raw comment string.
        :return: cleaned comment string.
        """
        sigmod_list = [',', '。', '(', ')', '-', '——', '\n', '“', '”', '*', '#', '《', '》', '、', '[', ']', '(', ')', '-',
                       '.', '/', '】', '【', '……', '!', '!', ':', ':', '…', '@', '~@', '~', '「一」', '「', '」',
                       '?', '"', '?', '~', '_', ' ', ';', '◆', '①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩',
                       '⑾', '⑿', '⒀', '⒁', '⒂', '&quot;', '●', '❤', '❤️', '⭐️', '☀️', '♂️']
        for one_sigmod in sigmod_list:
            comment = comment.replace(one_sigmod, '')
        comment = self.filter_emoji(comment)
        # Bug fixes vs. the original:
        #  * the 'http' split result was assigned to an unused local
        #    (`one_line`), so plain-http URLs were never actually removed;
        #  * a call to the undefined self.proc_text() (AttributeError) removed;
        #  * the separate 'https' branch was redundant ('https' contains 'http').
        if 'http' in comment:
            comment = comment.split('http')[0].strip()
        return comment

    def filter_emoji(self, desstr, restr=''):
        """Replace astral-plane characters (emoji, etc.) in ``desstr`` with ``restr``."""
        try:
            pattern = re.compile(u'[\U00010000-\U0010ffff]')
        except re.error:
            # Narrow (UCS-2) Python build: match surrogate pairs instead.
            pattern = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
        return pattern.sub(restr, desstr)

    def draw_emotion_pie(self):
        """Render a pie chart of positive/negative/neutral sentiment counts.

        Requires :meth:`emotionAnalysis` to have populated the '情感分析' column.
        """
        count_pos = int(self.df[self.df['情感分析'] > 0.5].shape[0])
        count_neg = int(self.df[self.df['情感分析'] < 0.5].shape[0])
        count_mid = int(self.df[self.df['情感分析'] == 0.5].shape[0])
        x_data = ['积极', '消极', '中性']
        (
            Pie(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
            .add("", [list(z) for z in zip(x_data, [count_pos, count_neg, count_mid])])
            .set_global_opts(title_opts=opts.TitleOpts(title="情感分析饼状图"),
                             # toolbox with save-as-image (white background)
                             toolbox_opts=opts.ToolboxOpts(is_show=True, feature=opts.ToolBoxFeatureOpts(
                                 save_as_image=opts.ToolBoxFeatureSaveAsImageOpts(background_color="white"))),
                             )
            # {d}% shows percentages; replace with {c} to show raw counts instead.
            .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}% "))
            .render('./data/情感分析饼状图.html')
        )

    def _render_top_bar(self, x_data, y_data, series_name, title, xaxis_name, out_path, top=10):
        """Shared scaffolding for the top-N pyecharts bar charts.

        Used by :meth:`draw_high_frequency` and :meth:`draw_ip_analysis_bar`,
        which previously duplicated this chart configuration verbatim.
        """
        (
            Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
            .add_xaxis(x_data[:top])
            .add_yaxis(series_name, y_axis=y_data[:top], stack="stack1")
            .set_global_opts(
                title_opts=opts.TitleOpts(title=title),
                toolbox_opts=opts.ToolboxOpts(),
                legend_opts=opts.LegendOpts(is_show=True, pos_left='center', pos_top='top', item_width=25,
                                            item_height=25),
                xaxis_opts=opts.AxisOpts(name=xaxis_name,
                                         name_textstyle_opts=opts.TextStyleOpts(color='red', font_size=20),
                                         axislabel_opts=opts.LabelOpts(font_size=15, rotate=-15)),
                yaxis_opts=opts.AxisOpts(name='数量',
                                         name_textstyle_opts=opts.TextStyleOpts(color='red', font_size=20),
                                         axislabel_opts=opts.LabelOpts(font_size=15),
                                         name_location="middle")
            )
            .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='top', color='black', font_size=15))
            .render(out_path)
        )

    def draw_high_frequency(self, x_data, y_data):
        """Bar chart of the 10 most frequent tokens."""
        self._render_top_bar(x_data, y_data, "关键词", "高频词条形图", '关键词',
                             './data/高频词条形图.html')

    def draw_ip_analysis_bar(self):
        """Bar chart of the 10 most common poster provinces (from 所属IP地址)."""
        # Province = first whitespace-separated token of the IP-location field.
        self.df['locations'] = [str(i).split(' ')[0] for i in self.df['所属IP地址']]
        counts = self.df['locations'].value_counts().sort_values(ascending=False)
        self._render_top_bar(counts.index.tolist(), counts.values.tolist(),
                             "数量", "微博发布者省份前十统计图", '省份',
                             './data/微博发布者省份前十统计图.html')

    def draw_ip_analysis_pie(self):
        """Pie chart of poster-province share."""
        # Robustness fix: derive 'locations' here too, so this method no longer
        # silently depends on draw_ip_analysis_bar() having run first.
        if 'locations' not in self.df.columns:
            self.df['locations'] = [str(i).split(' ')[0] for i in self.df['所属IP地址']]
        counts = self.df['locations'].value_counts().sort_values(ascending=False)
        x = counts.index.tolist()
        y = counts.values.tolist()
        pie = Pie(init_opts=opts.InitOpts(
            theme=ThemeType.LIGHT
        ))
        pie.add(
            series_name="",
            data_pair=[list(z) for z in zip(x, y)],
            radius=["25%", "50%"],  # inner/outer radius -> donut shape
            center=["60%", "50%"],  # chart centre: left / top offsets
            label_opts=opts.LabelOpts(is_show=True, formatter="{b} {d}%"),  # name + percentage
        )
        pie.set_global_opts(title_opts=opts.TitleOpts(title="微博发布者所属省份占比", pos_left='40%'),
                            legend_opts=opts.LegendOpts(is_show=True, type_='scroll', pos_left="left",
                                                        orient="vertical"))
        pie.set_series_opts(
            tooltip_opts=opts.TooltipOpts(
                trigger="item", formatter="{a} <br/>{b}: {c} ({d}%)"
            ))
        pie.render('./data/微博发布者所属省份占比.html')

    def authentication_type(self):
        """Bar chart of verification-type counts."""
        counts = self.df['认证类型'].value_counts()
        (
            Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
            .add_xaxis(counts.index.tolist())
            .add_yaxis("数量", y_axis=counts.values.tolist(), stack="stack1", category_gap='50%')
            .set_global_opts(
                title_opts=opts.TitleOpts(title="认证类型统计"),
                toolbox_opts=opts.ToolboxOpts(),
                legend_opts=opts.LegendOpts(is_show=True, pos_left='center', pos_top='top', item_width=25,
                                            item_height=25),
            )
            .set_series_opts(label_opts=opts.LabelOpts(is_show=True, position='top', color='black', font_size=15))
            .render('./data/认证类型.html')
        )

    def run(self):
        """Run the full pipeline and write all outputs under ./data.

        Assumes the ./data directory already exists (created by the caller).
        """
        self.emotionAnalysis()
        cloud_text = self.cut_words()
        frequency_result = self.word_frequency(cloud_text)
        x_data, y_data = self.provide_data(frequency_result)
        self.draw_wordcloud(x_data, y_data, cloud_text)
        self.draw_wordcloud_pyecharts()
        self.draw_emotion_pie()
        self.draw_high_frequency(x_data, y_data)
        self.draw_ip_analysis_bar()
        self.draw_ip_analysis_pie()
        self.authentication_type()
        self.df.to_csv('./data/情感分析.csv', encoding='utf-8-sig')
if __name__ == '__main__':
    # Ensure the output directory exists before any chart/CSV is written.
    # os.makedirs(exist_ok=True) avoids the check-then-create (TOCTOU) race
    # of the original `os.path.exists` + `os.mkdir` pair.
    os.makedirs('./data', exist_ok=True)
    filename = '广州东山口微博.csv'
    WA = WeiboAnalysis(filename)
    WA.run()