import os
import jieba
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from pyecharts.charts import Page, Sankey, WordCloud, Radar
from pyecharts.components import Image
from pyecharts.options import ComponentTitleOpts
from collections import Counter
from pyecharts.globals import SymbolType
from pyecharts import options as opts
from pyecharts.options.global_options import ThemeType
from pyecharts import options as opts
from collections import Counter
import random
# Console display settings for DataFrame debugging output:
#   - display.max_columns = None -> print every column
#   - display.max_rows    = None -> print every row
#   - max_colwidth        = 100  -> show up to 100 characters per cell
#     (pandas' default is 50)
for _option_name, _option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('max_colwidth', 100),
):
    pd.set_option(_option_name, _option_value)
def get_cut_words(content_series, extra_words=None):
    """Tokenize danmu text with jieba and filter out noise tokens.

    All strings in ``content_series`` are joined with '。' and cut in jieba's
    precise mode (``cut_all=False``). Tokens that are shorter than two
    characters or that appear in the stop-word list are dropped.

    Args:
        content_series: pandas Series of strings to tokenize.
        extra_words: optional iterable of additional words to register with
            jieba before cutting. When omitted, the module-level
            ``my_words_list`` is used if it has been defined; otherwise no
            extra words are registered.

    Returns:
        list[str]: the surviving tokens in their original order, duplicates
        included (callers count them).

    Raises:
        OSError: if ``data/stopwords.txt`` cannot be opened.
    """
    import jieba

    # Stop words from disk plus a few hand-picked ones. A set keeps the
    # per-token membership test O(1); the danmu corpus yields many tokens.
    with open("data/stopwords.txt", 'r', encoding='utf-8') as f:
        stop_words = {line.strip() for line in f}
    stop_words.update(['第一期', '一堆', '三个', '真的', '哈哈哈', '哈哈哈哈', '啊啊啊'])

    # Domain words jieba should keep as single tokens.
    my_words = ['杜华', '辣鸡', '导演组', '节目组', '不公平', '黄圣依', '无杜华版']
    if extra_words is None:
        # Backward compatibility: the script historically passed the nickname
        # list through a module global. Looking it up defensively avoids a
        # NameError when this function is called before that global exists.
        extra_words = globals().get('my_words_list', [])
    for word in [*my_words, *extra_words]:
        jieba.add_word(word)

    # Tokenize the whole corpus in one pass.
    tokens = jieba.lcut(content_series.str.cat(sep='。'), cut_all=False)

    return [tok for tok in tokens if len(tok) >= 2 and tok not in stop_words]
def show_all():
    """Render one performer's dashboard page and restyle the generated HTML.

    Reads the module-level globals ``image1``, ``wc``, ``radar`` and ``name``
    (set by the per-performer loop), renders them into a pyecharts ``Page`` at
    ``data/sister/<name>.html``, then rewrites that file in place with
    BeautifulSoup: the first four ``<div>`` elements get fixed styles, the
    page background is darkened and a title ``<div>`` is inserted at the top
    of ``<body>``.

    Raises:
        IndexError: if the rendered page contains fewer than four ``<div>``s.
    """
    page = Page()
    page.add(image1, wc, radar)
    out_html = 'data/sister/%s.html' % name
    page.render(out_html)

    # Inline styles for the first four <div>s of the rendered page, in
    # document order: size, absolute position and (zero-width) border.
    div_styles = (
        "align=\"center\";margin:0 auto;text-align:center;",
        "width:550px;height:350px;position:absolute;top:120px;left:700px;border-style:solid;border-color:#444444;border-width:0px;",
        "width:700px;height:700px;position:absolute;top:120px;left:20px;border-style:solid;border-color:#444444;border-width:0px;",
        "width:600px;height:400px;position:absolute;top:300px;left:1250px;border-style:solid;border-color:#444444;border-width:0px;",
    )

    # Post-process the rendered file in place.
    html_path = os.path.join(os.path.abspath("."), out_html)
    with open(html_path, 'r+', encoding="utf8") as html_file:
        html_bf = BeautifulSoup(html_file, "lxml")
        divs = html_bf.find_all("div")
        print(len(divs))
        # Index explicitly (rather than zip) so a page with too few <div>s
        # still fails loudly instead of being silently half-styled.
        for idx, style in enumerate(div_styles):
            divs[idx]["style"] = style

        # Dark page background.
        body = html_bf.find("body")
        body["style"] = "background-color:#333333;"

        # Prepend a title block.
        # NOTE(review): the original title markup was truncated in the source
        # (the string literal was left unterminated, a SyntaxError). This is a
        # minimal reconstruction showing the performer's name — restore the
        # intended markup/text if it is known.
        div_title = (
            '<div align="center" style="width:100%%;">'
            '<span style="font-size:32px;color:#FFFFFF"><b>%s</b></span>'
            '</div>' % name
        )
        body.insert(0, BeautifulSoup(div_title, "lxml").div)

        # Overwrite the file with the modified document; the with-block
        # closes the handle, so no explicit close() is needed.
        html_new = str(html_bf)
        html_file.seek(0, 0)
        html_file.truncate()
        html_file.write(html_new)
# Per-performer score table (tab-separated).
df = pd.read_csv('data/sister_data.csv', encoding='utf-8', sep='\t')

# Danmu (bullet-comment) data: eight tab-separated files concatenated into
# one frame. pd.concat replaces DataFrame.append, which was removed in
# pandas 2.0 and re-copied the accumulated frame on every iteration.
df_all = pd.concat(
    [
        pd.read_csv('sister/sister/danmu_info_%d.csv' % i, encoding='utf-8', sep='\t')
        for i in range(1, 9)
    ]
)

# Rank performers by total score, highest first.
df.sort_values('总分', ascending=False, inplace=True)

# Nicknames used to find danmu mentioning each performer; '|' separates
# alternative spellings. The list is positional: entry k belongs to the
# k-th row AFTER the sort above, and its length must equal len(df)
# (pandas raises ValueError otherwise).
df['昵称'] = ['蓝盈莹|盈莹', '黄龄', '丹妮', '孟佳', '梦辰',
            '可唯', '宁静|静静子|静姐', '霏霏', '希怡', '袁咏琳',
            '圣依|依依子', '金晨', '阿朵', '含韵', '白冰',
            '钟丽缇', '茜|茜茜子', '张萌|萌萌子', '婧汐', '丁当',
            '许飞', '刘芸|芸芸子', '吴昕|昕昕子|昕姐|昕昕', '伊能静', '松伶',
            '丽坤', '张雨绮|雨绮|绮绮子', '海陆', '金莎', '王智']
print(df.head(5))
print(df.columns)
# Radar axes: the four scored dimensions, each on a 0-25 scale.
# Loop-invariant, so built once.
c_schema = [
    {"name": "个人特质", "max": 25, "min": 0},
    {"name": "声乐表现力", "max": 25, "min": 0},
    {"name": "成团潜力", "max": 25, "min": 0},
    {"name": "舞台表现力", "max": 25, "min": 0},
]

# Every nickname variant of every performer, flattened into one list.
# get_cut_words() reads this module-level name to register the nicknames
# with jieba; it does not depend on the current performer, so it is
# computed once instead of on every iteration.
my_words_list = df['昵称'].str.cat(sep='。').replace('|', '。').split('。')

# Maximum number of words shown in each word cloud.
choices_number = 200

# Build and render one dashboard page per performer. show_all() consumes the
# module-level names `name`, `image1`, `radar` and `wc` assigned below.
for name in df.names.tolist():
    # --- Portrait -----------------------------------------------------
    image1 = Image()
    # The path is written into the page as-is, so it is resolved relative
    # to the rendered HTML file rather than the working directory.
    img_src = "../img/%s.jpg" % name
    image1.add(
        src=img_src,
        style_opts={"width": "345px", "height": "584px", "style": "margin-top: 15px"},
    )
    image1.set_global_opts(
        title_opts=ComponentTitleOpts(
            title_style={"style": "color: white; font-size: 18px; font-weight:bold;"},
            subtitle_style={"style": "color: white;font-size: 12px;"})
    )

    # --- Radar chart of the four sub-scores ---------------------------
    performer = df[df.names == name]
    value = performer[["个人特质", "声乐表现力", "成团潜力", "舞台表现力"]].values[0]
    data = [{"value": [float(i) for i in value], "name": "分数"}]
    radar = (
        Radar()
        .set_colors(["#4587E7"])
        .add_schema(
            schema=c_schema,
            shape="circle",
            center=["50%", "50%"],
            radius="80%",
            angleaxis_opts=opts.AngleAxisOpts(
                min_=0,
                max_=360,
                is_clockwise=False,
                interval=5,
                axistick_opts=opts.AxisTickOpts(is_show=False),
                axislabel_opts=opts.LabelOpts(is_show=False),
                axisline_opts=opts.AxisLineOpts(is_show=False),
                splitline_opts=opts.SplitLineOpts(is_show=False),
            ),
            radiusaxis_opts=opts.RadiusAxisOpts(
                min_=0,
                max_=25,
                interval=5,
                splitarea_opts=opts.SplitAreaOpts(
                    is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)
                ),
            ),
            polar_opts=opts.PolarOpts(),
            splitarea_opt=opts.SplitAreaOpts(is_show=False),
            splitline_opt=opts.SplitLineOpts(is_show=False),
        )
        .add(
            series_name="分数",
            data=data,
            color="#f9713c",
            areastyle_opts=opts.AreaStyleOpts(opacity=0.1),
            linestyle_opts=opts.LineStyleOpts(width=1),
        )
    )

    # --- Danmu word cloud ---------------------------------------------
    # The nickname cell doubles as a regex: 'a|b' matches either spelling.
    nickname_pattern = performer['昵称'].values[0]
    print(nickname_pattern)
    # na=False: rows with missing content count as "no match"; without it
    # the mask contains NaN and cannot be used for boolean indexing.
    mentions = df_all.content[df_all.content.str.contains(nickname_pattern, na=False)]
    text1 = get_cut_words(content_series=mentions)
    # Top-N (word, count) pairs, most frequent first.
    count_list = Counter(text1).most_common(choices_number)
    wc = (
        WordCloud()
        .add(series_name="弹幕词云", data_pair=count_list, word_size_range=[20, 100],
             textstyle_opts=opts.TextStyleOpts(font_family="cursive"), shape=SymbolType.DIAMOND)
        .set_global_opts(
            tooltip_opts=opts.TooltipOpts(is_show=True),
        )
    )

    show_all()