主要的功能有
爬取
评论地址
爬取次数
爬取什么评论
爬取时间
保存名称
文件管理
爬取过程有问题或者查看数据
删除文件
重命名
下载文件
上传文件
处理
简单处理,去除换行符去除表情符号
删除列
可视化
饼图
聚合方法
count
sum
mean
标题
保存
词云图
对词云图参数修改
折线图
时间数量
文本感情处理
评论情感分析
情感分析可视化
关键词提取
主题个数
迭代次数
import streamlit as st
import pymysql
import pandas as pd
import streamlit.components.v1 as components
import mysql.connector
from wordcloud import WordCloud
from schedule import every, repeat, run_pending
from PIL import Image
import time
import io
import matplotlib.pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line
import numpy as np
import os
from os import path
import requests
import csv
import re
import json
import streamlit_echarts as ste #ste.st_pyecharts(pie)
import jieba
import gensim
from gensim import corpora
from snownlp import SnowNLP
# MySQL connection used by login()/registration.
# SECURITY NOTE(review): credentials are hard-coded in source — move them to
# environment variables or a config file before deployment.
cnx = mysql.connector.connect(
    host="localhost",
    user="1234",
    password="12345678Aa!",
    database="login",
    auth_plugin = 'mysql_native_password'
)
# Root directory on the server where scraped CSVs, uploads and generated
# charts/images are stored. All features below read and write under it.
address = "E:/桌面/京东数据"
def login(data):
    """Render the sidebar login/registration form and return the new state.

    Parameters
    ----------
    data : int
        Current session state: 0 = logged out, 1 = normal user, 2 = admin.

    Returns
    -------
    int or None
        1 after a successful normal-user login, 2 after an admin login;
        implicitly None on any rerun where no successful login happened.
    """
    if data == 0:
        # Show the login form
        st.sidebar.subheader('登录界面')
        b = st.sidebar.selectbox('用户', ["普通用户","管理员"])
        username = st.sidebar.text_input("用户名")
        password = st.sidebar.text_input("密码", type="password")
        # Admins are looked up in the 'master' table, normal users in 'login'.
        if b =="管理员":
            tableid = 'master'
        else:
            tableid = 'login'
        if st.sidebar.button("登录"):
            cursor = cnx.cursor()
            # The table name is string-interpolated, but tableid can only be
            # 'master' or 'login' (set above), so user input cannot inject here.
            # NOTE(review): passwords are stored/compared in plain text —
            # consider hashing them.
            query = "SELECT * FROM "+tableid+" WHERE username = %s AND password = %s"
            cursor.execute(query, (username, password))
            result = cursor.fetchone()
            cursor.close()
            if result:
                st.sidebar.success("登录成功!")
                run_pending()
                data = 1
                if tableid == 'master':
                    data =2
                return data
            else:
                st.sidebar.error("用户名或密码错误。")
        if st.sidebar.button("注册"):
            cursor = cnx.cursor()
            # Registration always inserts into the normal-user table,
            # regardless of the role selected above.
            query = "INSERT INTO login (username, password) VALUES (%s, %s)"
            cursor.execute(query, (username, password))
            cnx.commit()
            st.sidebar.success("注册成功!")
            cursor.close()
    else:
        pass
def spider(spider):
    """Scrape JD.com product comments into a CSV file under `address`.

    Renders the scraping controls (product id, review type, page limit,
    output file name, per-page delay) and, when enabled, pages through
    JD's JSONP comment API, writing one CSV row per comment.

    Parameters
    ----------
    spider : str
        Sidebar selection; the function only acts when it equals "评论数据爬取".
    """
    if spider == "评论数据爬取":
        # Despite the "网站地址" label, this field takes the JD product id.
        address_web = st.text_input("网站地址", 'null')
        review = st.selectbox("选择方法", ["无","全部评论","好评","中评","差评"])
        # Map the human-readable choice to JD's numeric `score` parameter:
        # 0 = all, 3 = positive, 2 = neutral, 1 = negative.
        if review =="全部评论":
            review = 0
        elif review =="好评":
            review = 3
        elif review =="中评":
            review = 2
        elif review =="差评":
            review = 1
        elif review =="无":
            pass
        else:
            # Unreachable with the selectbox above; defensive fallback.
            st.write("出错了爬取全部评论")
            review =0
        num = st.text_input("默认上限100", "100")
        name = st.text_input("文件名称默认数据", "数据")
        speed = st.select_slider("延迟爬取速度默认3秒", options=np.linspace(3,1,11))
        spider_true = st.selectbox("是否爬取", ["否","是"])
        # Start only when a product id was entered, a review type was chosen
        # (review is an int by now), and scraping was explicitly enabled.
        if address_web!="null" and review!="无" and spider_true =="是":
            comment_url = 'https://club.jd.com/comment/productPageComments.action'
            csv_file = address+"/"+name+'.csv'
            # print(csv_file)
            f = open(csv_file, 'w', newline='', encoding='utf-8-sig')  # output name follows user input
            fieldnames = ['评论', '评论时间', '评分', '颜色', '规格', '购买时间', '用户', '型号', '产品名称', '评论间隔']
            csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
            csvwriter.writeheader()
            for i in range(int(num)):
                st.write('正在获取第', i + 1, '页评论')
                page = i
                params = {
                    'productId': address_web,  # product id; differs per product
                    'score': review,
                    'sortType': 6,
                    'page': page,
                    'pageSize': 10,
                    'callback': 'fetchJSON_comment98vv61',
                    'isShadowSku': 0,
                    'fold': 1
                }
                # NOTE(review): the hard-coded cookie/session below will expire,
                # at which point scraping silently stops returning data.
                headers = {
                    'cookie': 'shshshfpa=980322f4-0d72-08ea-9cb2-4fcadde80a00-1562576627; shshshfpb=ymAFpsvPn5OjLe2TxXJVyZQ==; __jdu=16150341377512100580391; mt_xid=V2_52007VwMVUllZUF8fSx9aAWcAElNcXFtbHUEZbAYwVhdbDVkCRh9AEFsZYgdBBkEIVw1IVUlbA24KQVEPXFcIGnkaXQZnHxNaQVhbSx5AElgAbAITYl9oUWocSB9UAGIzEVVdXg==; unpl=V2_ZzNtbUBVREUmC0QBfkkMDGJRQlwSV0ATIQFGUnIZCwBnABRYclRCFnUUR1xnGl4UZwYZXEtcQRBFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseXAFmARddQFFFEXULRlV6HVUEZQsSbXJQcyVFDENceRhbNWYzE20AAx8TcwpBVX9UXAJnBxNfR1dBE3MMRld7GF0BbgIQVUJnQiV2; PCSYCityID=CN_110000_110100_110108; user-key=0245721f-bdeb-4f17-9fd2-b5e647ad7f3e; jwotest_product=99; __jdc=122270672; mba_muid=16150341377512100580391; wlfstk_smdl=ey5hfakeb6smwvr1ld305bkzf79ajgrx; areaId=1; ipLoc-djd=1-2800-55811-0; __jdv=122270672|baidu|-|organic|not set|1632740808675; token=48ce2d01d299337c932ec85a1154c65f,2,907080; __tk=vS2xv3k1ush1u3kxvSloXsa0YznovSTFXUawXSawushwXpJyupq0vG,2,907080; shshshfp=3da682e079013c4b17a9db085fb01ea3; shshshsID=2ee3081dbf26e0d2b12dfe9ebf1ac9a8_1_1632744359396; __jda=122270672.16150341377512100580391.1615034138.1632740809.1632744359.28; __jdb=122270672.1.16150341377512100580391|28.1632744359; 3AB9D23F7A4B3C9B=OOGFR7VEBOKC3KPZ6KF3FKUOPTYV2UTP6I26CTJWT6CBR7KDFT6DA7AKGYBOIC5VE3AGWVCO44IPRLJZQM5VPBDKRE; JSESSIONID=82C0F348483686AC9A673E31126675D3.s1',
                    'referer': 'https://item.jd.com/',
                    'accept-charset': 'UTF-8',
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
                }
                resp = requests.get(comment_url, params=params, headers=headers)
                if resp.status_code == requests.codes.ok:
                    # The API returns JSONP; strip the callback wrapper to get JSON.
                    regex = re.compile(r'fetchJSON_comment98vv61\((.*?)\);')
                    json_str = regex.search(resp.text).group(1)
                    json_dict = json.loads(json_str)
                    for item in json_dict['comments']:
                        content = item.get('content', '')
                        creationTime = item.get('creationTime', '')
                        score = item.get('score', '')
                        productColor = item.get('productColor', '')
                        productSize = item.get('productSize', '')
                        referenceTime = item.get('referenceTime', '')
                        nickname = item.get('nickname', '')
                        productSales = item.get('productSales', '')
                        referenceName = item.get('referenceName', '')
                        days = item.get('days', '')
                        # afterUserComment = item.get('afterUserComment', '').get("content","")
                        # Normalise the comment publication time (parse + reformat).
                        date = time.strptime(creationTime, '%Y-%m-%d %H:%M:%S')
                        creationTime = time.strftime('%Y-%m-%d %H:%M:%S', date)
                        csvwriter.writerow({
                            '评论': content,
                            '评论时间': creationTime,
                            '评分': score,
                            '颜色': productColor,
                            '规格': productSize,
                            '购买时间': referenceTime,
                            '用户': nickname,
                            '型号': productSales,
                            '产品名称': referenceName,
                            '评论间隔': days
                        })
                        # print('添加评论:', content)
                    # `item` leaks from the loop above, so this test is True
                    # whenever the page had any comments — it throttles before
                    # requesting the next page.
                    if item in json_dict['comments']:
                        time.sleep(float(speed))
                else:
                    # Request failed: stop paging. NOTE(review): `i -= 1` has no
                    # effect on a for-loop variable — presumably a leftover from
                    # an intended retry; confirm before relying on the page count.
                    i -= 1
                    break
            f.close()
            print('评论抓取完成,共', i + 1, '页评论')
            st.write('评论抓取完成,共', i + 1, '页评论')
        elif spider_true =="是":
            st.write("缺少条件无法爬取")
        else:
            pass
def document(document):
    """Server-side file manager for the data directory `address`.

    Supports: listing files with sizes, previewing/downloading common
    types (png/jpg, txt, html, csv, xlsx/xls), renaming, deleting, and
    uploading files via the sidebar.

    Parameters
    ----------
    document : str
        Sidebar selection; the function only acts when it equals "文件管理".
    """
    if document == "文件管理":
        st.write("文件管理地址:",address)
        file = os.listdir(address)
        show_true = st.selectbox("是否显示管理目录下的文件", ["否", "是"])
        if show_true =="是":
            # List every file with its size in MB.
            for i in file:
                stats = os.stat(address + "/" + i)
                st.write(i, "%.2f" % (stats.st_size / 1024 / 1024), "MB")
        else:
            pass
        view = st.selectbox("查看文件",file)
        # Dispatch on the file extension.
        # NOTE(review): view.split(".")[1] raises IndexError for extension-less
        # files and picks the wrong segment for names containing several dots —
        # confirm the managed directory only holds well-formed names.
        if view.split(".")[1] == "png" or view.split(".")[1] == "jpg":
            image = Image.open(address+"/"+view)
            st.image(image, use_column_width=True)
            if view.split(".")[1] == "jpg":
                # jpg is converted in-memory to png for the download button.
                st.write("识别到是jpg图片将转换成png格式下载")
                with open(address+"/"+view, 'rb') as f:
                    img = Image.open(f)
                    img_bytes = io.BytesIO()
                    img.save(img_bytes, format='PNG')
                    img_bytes = img_bytes.getvalue()
                st.download_button(
                    label="下载图片",
                    data=img_bytes,
                    file_name="图片.png",
                    mime="image/png"
                )
        elif view.split(".")[1] == "txt":
            encoding = st.selectbox("文件字符类型", ("UTF-8","GBK"))
            with open(address + "/" + view, "r",encoding=encoding) as f:
                text = f.read()
            st.write(text)
            f.close()  # redundant: the `with` block already closed the file
            st.download_button(
                label="下载文本",
                data=text,
                file_name="文本.txt",
                mime="application/octet-stream"
            )
        elif view.split(".")[1] == "html":
            # Saved pyecharts charts are re-rendered inline.
            text = ""
            with open(address+'/'+view,encoding="utf-8")as fp:
                text = fp.read()
            components.html(text, height=550, width=1000)
        elif view.split(".")[1] == "csv":
            encoding = st.selectbox("文件字符类型", ("UTF-8","GBK"))
            data = pd.read_csv(address + "/" + view,encoding=encoding)
            n = st.select_slider("滑动显示数量", options=list(range(5,data.shape[0]+1)))
            st.dataframe(data[0:n])
            st.download_button(
                label="下载CSV文件",
                data=data.to_csv().encode('utf-8'),
                file_name='下载.csv',
                mime='text/csv'
            )
        elif view.split(".")[1] == "xlsx" or view.split(".")[1] == "xls":
            # Excel files are previewed one sheet at a time, exported as CSV.
            num = st.selectbox("读取子表", ("0","1","2","3","4","5","6","7","8"))
            data = pd.read_excel(address + "/" + view,sheet_name=int(num))
            st.dataframe(data)
            st.download_button(
                label="下载CSV文件",
                data=data.to_csv().encode('utf-8'),
                file_name='下载.csv',
                mime='text/csv'
            )
        else:
            st.write("文件不能读取")
            pass
        rename_true = st.selectbox("是否重命名", ("否", "是"))
        if rename_true == "是":
            choose = st.selectbox("选择文件重命名", file)
            rename = st.text_input('重新命名', '')
            if st.button("重命名"):
                # Preserve the original extension when there is exactly one dot.
                if len(choose.split(".")) ==2:
                    os.rename(address + "/" + choose,address + "/" +rename+"."+choose.split(".")[1])
                else:
                    os.rename(address + "/" + choose, address + "/" + rename)
                st.write("重命名成功!")
        else:
            pass
        drop_true = st.selectbox("是否删除文件", ("否","是"))
        if drop_true =="是":
            drop = st.selectbox("删除文件", file)
            if st.button("删除文件"):
                os.remove(address + "/" + drop)
                st.write("删除成功!")
        else:
            pass
        # Sidebar uploader: stores any uploaded file into `address`.
        uploaded_file = st.sidebar.file_uploader("上传文件")
        if uploaded_file is not None:
            st.sidebar.write("上传成功!",uploaded_file.type)
            with open(address + "/"+uploaded_file.name, "wb") as f:
                f.write(uploaded_file.getbuffer())
        else:
            pass
def dispose(dispose):
    """Simple preprocessing of a scraped CSV.

    One-click cleanup strips newlines and HTML character entities from the
    "评论" column; the user may also drop selected columns. All changes are
    written back to the same file in place.

    Parameters
    ----------
    dispose : str
        Sidebar selection; the function only acts when it equals "预处理".
    """
    if dispose =="预处理":
        st.write("一键预处理评论的换行符和表情")
        file = os.listdir(address)
        v = st.selectbox("选择预处理文件", file)
        if v.split(".")[1] != "csv" :
            st.write("选择的文件不是预处理的文件格式")
        if v.split(".")[1] == "csv" :
            if st.button("一键预处理"):
                data = pd.read_csv(address+"/"+v)
                data['评论'] = data['评论'].replace('\n', '', regex=True)
                st.write("去除换行符")
                # Removes HTML character entities (e.g. &hellip;) — this is how
                # emoji placeholders appear in the scraped comment text.
                data['评论'] = data['评论'].replace(r'&[a-zA-Z]+;', '', regex=True)
                st.write("去除表情符号")
                data.to_csv(address+"/"+v,index=False)
                st.write("存储成功")
            d_line = st.selectbox("删除一列", ("否", "是"))
            if d_line == "否":
                pass
            elif d_line == "是":
                data = pd.read_csv(address + "/" + v)
                d_line_c = st.multiselect("选择列", data.columns)
                if st.button("是否删除列"):
                    data.drop(columns=d_line_c, axis=1, inplace=True)
                    data.to_csv(address + "/" + v,index=False)
                    st.write("自动保存")
    else:
        pass
def pie(pie):
    """Aggregate one CSV column and render the result as a pyecharts pie chart.

    The user chooses a group-by column, an aggregation (count/sum/mean) and
    a value column to keep; the chart is shown inline and can be saved as
    an HTML file in `address`.

    Parameters
    ----------
    pie : str
        Sidebar selection; the function only acts when it equals "聚合饼图".
    """
    if pie == "聚合饼图":
        file = os.listdir(address)
        pie = st.selectbox("选择要可视化饼图的文件", file)
        if pie.split(".")[1] != "csv":
            st.write("选择的文件不是csv的文件格式")
        if pie.split(".")[1] == "csv":
            data = pd.read_csv(address + "/" + pie)
            group = st.selectbox("选择列聚合", data.columns)
            method = st.selectbox("选择列聚合方法", ("count","sum","mean"))
            if method =="count":
                column = data.groupby(group).count()
            elif method =="sum":
                column = data.groupby(group).sum()
            elif method =="mean":
                column = data.groupby(group).mean()
            else:
                # Fallback (unreachable with the selectbox above).
                column = data.groupby(group).count()
            # Offer every column except the group-by key as the value column.
            columns = []
            for i in data.columns:
                if i !=group:
                    columns.append(i)
            l = st.selectbox("选择保留列", columns)
            column = column[l]
            title = st.text_input("输入标题","饼图")
            # Convert the aggregated Series to parallel label/value lists.
            c = column.values
            c1 = column.index
            c = c.tolist()
            c1 = c1.tolist()
            pie = (
                Pie(init_opts=opts.InitOpts(width="800px", height="550px"))
                .add("", [list(z) for z in zip(c1,c)])
                .set_global_opts(title_opts=opts.TitleOpts(title=str(title), pos_top='10%'), \
                toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='left', pos_top='bottom',
                orient='vertical'))
                .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}({d}%)"))
            )
            ste.st_pyecharts(pie)
            save_pie = st.selectbox("是否保存饼图到服务器目录", ["否", "是"])
            if save_pie =="是":
                name = st.text_input("保存名默认饼图","饼图")
                if st.button("保存饼图"):
                    pie.render(address + "/" +name+".html")
                    st.write("保存成功!")
            else:
                pass
            pass
    else:
        pass
def line(line):
    """Plot comment counts over time as a pyecharts line chart.

    Groups the CSV by the "购买时间" column at a user-chosen frequency
    (yearly/monthly/daily), counts comments per period, and optionally
    saves the chart as HTML.

    Parameters
    ----------
    line : str
        Sidebar selection; the function only acts when it equals "时间折线图".
    """
    if line == "时间折线图":
        file = os.listdir(address)
        line = st.selectbox("选择要可视化饼图的文件", file)
        if line.split(".")[1] != "csv":
            st.write("选择的文件不是csv的文件格式")
        if line.split(".")[1] == "csv":
            data = pd.read_csv(address + "/" + line)
            data["购买时间"] = pd.to_datetime(data["购买时间"])
            # Resample frequency: Y = yearly, M = monthly, D = daily.
            format = st.selectbox("选择时间格式",("Y","M","D") )
            title = st.text_input("输入标题", "折线图")
            # NOTE: this local `time` shadows the imported `time` module
            # inside the function.
            time = data.groupby(pd.Grouper(key='购买时间', axis=0, freq=format)).count()
            time = time["评论"]
            data1 = {"时间": time.index, '次': time.values}
            x_dt = data1["时间"].tolist()
            y1 = data1["次"].tolist()
            line_base = Line().add_xaxis(x_dt).add_yaxis("次数", y1, yaxis_index=0)
            line_base.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
            line_base.set_global_opts(title_opts=opts.TitleOpts(title=str(title), pos_top='5%'), \
            toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='left', pos_top='bottom',
            orient='vertical'))
            ste.st_pyecharts(line_base)
            save_line = st.selectbox("是否保存折线图到服务器目录", ["否", "是"])
            if save_line == "是":
                name = st.text_input("保存名默认折线图", "折线图")
                if st.button("保存折线图"):
                    line_base.render(address + "/" + name + ".html")
                    st.write("保存成功!")
            else:
                pass
        else:
            pass
def cloud(cloud):
    """Generate a word cloud from the "评论" column of a selected CSV.

    Lets the user tune the WordCloud parameters (size, fonts, colours,
    word limits), segments the joined comments with jieba, filters
    stopwords from `address`/stopwords.txt, renders via matplotlib, and
    optionally saves the image as a PNG under `address`.

    Parameters
    ----------
    cloud : str
        Sidebar selection; the function only acts when it equals "词云图".
    """
    if cloud == "词云图":
        file = os.listdir(address)
        line = st.selectbox("选择要可视化饼图的文件", file)
        if line.split(".")[1] != "csv":
            st.write("选择的文件不是csv的文件格式")
        if line.split(".")[1] == "csv":
            # Rendering parameters (free-text inputs, cast to int below).
            height = st.text_input("高", "800")
            width = st.text_input("宽", "600")
            max_font_size = st.text_input("最大字体大小", "100")
            min_font_size = st.text_input("最小字体大小", "10")
            max_words = st.text_input("最多显示多少词", "100")
            background_color = st.text_input("背景颜色", "white")
            font_path = st.text_input("字体", "simhei.ttf")
            data = pd.read_csv(address + "/" + line)
            # Join all comments, segment with jieba, drop stopwords.
            text = ''.join(data["评论"])
            lst = jieba.lcut(text)
            # A set makes the per-word membership tests O(1).
            stopwords = set(
                w.strip() for w in
                open(address + "/" + 'stopwords.txt', 'r', encoding='utf-8').readlines()
            )
            t = ' '.join(word for word in lst if word not in stopwords)
            wc = WordCloud(
                font_path=font_path,                # font file path
                background_color=background_color,  # background colour
                width=int(width),
                height=int(height),
                max_font_size=int(max_font_size),
                scale=2,
                min_font_size=int(min_font_size),
                max_words=int(max_words)
            )
            wc.generate(t)
            plt.imshow(wc, interpolation="bilinear")
            plt.axis("off")
            if st.button("显示云图"):
                st.pyplot(plt)
            save_c = st.selectbox("是否保存云图到服务器目录", ["否", "是"])
            if save_c == "是":
                name = st.text_input("保存名默认云图", "云图")
                if st.button("保存云图"):
                    # BUG FIX: previously saved to the hard-coded name
                    # '云图.png', silently ignoring the user-entered name.
                    wc.to_file(address + "/" + name + ".png")
                    st.write("保存成功!")
            else:
                pass
        else:
            pass
def text(text):
    """Sentiment analysis of the "评论" column plus an optional time chart.

    Scores each comment with SnowNLP (0..1, higher = more positive) after
    jieba segmentation and stopword removal. Results are cached to
    `address`/缓存.csv so reruns are fast. Optionally overlays comment
    count, mean sentiment and mean rating per time period on one chart.

    Parameters
    ----------
    text : str
        Sidebar selection; the function only acts when it equals "评论情感分析".
    """
    if text =="评论情感分析":
        file = os.listdir(address)
        text = st.selectbox("选择要情感分析的文件", file)
        if text.split(".")[1] != "csv":
            st.write("选择的文件不是csv的文件格式")
        if text.split(".")[1] == "csv":
            emotion = st.selectbox("评论语句情感分析", ["否", "是"])
            if emotion =="是":
                file = os.listdir(address)
                if "缓存.csv" in file:
                    # A previous run left a score cache; reuse it instead of
                    # re-scoring every comment.
                    st.write("发现服务器中有缓存文件(缓存文件是为了响应时间快)")
                    if st.button("删除缓存文件"):
                        os.remove(address + "/" + "缓存.csv")
                        st.write("删除成功!")
                    df =pd.read_csv(address+"/"+"缓存.csv")
                    list_s = df["评分"].tolist()
                    list_t = df["文本"].tolist()
                else:
                    data = pd.read_csv(address + "/" + text)
                    list_s = []
                    list_t = []
                    stopwords = [line.strip() for line in
                    open(address + '/' + 'stopwords.txt', 'r', encoding='utf-8').readlines()]
                    for i in data["评论"]:
                        lst = jieba.lcut(i)
                        t = ''
                        for word in lst:
                            if word not in stopwords:
                                t += word
                        # SnowNLP cannot score an empty string; substitute a space.
                        if t == '':
                            t = " "
                        s = SnowNLP(t)
                        # st.write(t,"情感倾向分数:\n",s.sentiments)
                        list_s.append(s.sentiments)
                        list_t.append(t)
                    # Build the cache: index = cleaned text, values = scores.
                    df = pd.DataFrame(list_s, list_t)
                    df = df.reset_index()
                    st.write(df)
                    df.columns = ["文本","评分"]
                    df.to_csv(address + "/" + "缓存.csv",index=False)
                st.write("平均情感分数为:", sum(list_s) / len(list_s))
                df_n = st.select_slider("显示数量", options=list(range(5, df.shape[0] + 1)))
                st.write(df[0:df_n])
                show_contrast= st.selectbox("情感可视化",("否","是"))
                format_emo = st.selectbox("时间格式", ("Y", "M", "D"))
                if show_contrast == "是":
                    # Align sentiment scores with the original rows, then
                    # average per time period.
                    # NOTE(review): this assumes the cached score order matches
                    # the CSV row order — stale caches from another file would
                    # misalign; confirm before trusting the chart.
                    data = pd.read_csv(address + "/" + text)
                    score = pd.DataFrame(list_s)
                    emo = pd.concat([score, data], axis=1)
                    emo["购买时间"] = pd.to_datetime(emo["购买时间"])
                    emo_time = emo.groupby(pd.Grouper(key='购买时间', axis=0, freq=format_emo)).mean()
                    emo_time.columns = ["情感评分", "平均评分","0"]
                    emo_time = emo_time[["情感评分", "平均评分"]]
                    emo_time.fillna(0, inplace=True)
                    data["购买时间"] = pd.to_datetime(data["购买时间"])
                    time = data.groupby(pd.Grouper(key='购买时间', axis=0, freq=format_emo)).count()
                    time = time["评论"]
                    data1 = {"时间": time.index, '数量': time.values}
                    data2 = {"时间": time.index, '情感评分': emo_time["情感评分"]}
                    data3 = {"时间": time.index, '平均评分': emo_time["平均评分"]}
                    x_dt = data3["时间"].tolist()
                    y1 = data1["数量"].tolist()
                    y2 = data2["情感评分"].tolist()
                    y3 = data3["平均评分"].tolist()
                    # Counts on the left axis; sentiment and rating share a
                    # second y-axis overlaid on the same chart.
                    line_base = Line().add_xaxis(x_dt).add_yaxis("数量", y1, yaxis_index=0)
                    line_base.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
                    line_base.set_global_opts(title_opts=opts.TitleOpts(title="情感评分、评论数量、平均分数与时间折线图", pos_top='10%'), \
                    toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='left',
                    pos_top='bottom', orient='vertical'))
                    line_base.extend_axis(yaxis=opts.AxisOpts())
                    line_2 = Line().add_xaxis(x_dt).add_yaxis("情感评分", y2, yaxis_index=1)
                    line_2.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
                    line_3 = Line().add_xaxis(x_dt).add_yaxis("平均评分", y3, yaxis_index=1)
                    line_3.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
                    line_base.overlap(line_2)
                    line_base.overlap(line_3)
                    ste.st_pyecharts(line_base)
                    save_line = st.selectbox("是否保存情感折线图到服务器目录", ["否", "是"])
                    if save_line == "是":
                        name = st.text_input("保存名默认情感折线图", "情感折线图")
                        if st.button("保存情感折线图"):
                            line_base.render(address + "/" + name + ".html")
                            st.write("保存成功!")
                    else:
                        pass
def keywords(keywords):
    """Run LDA topic modelling over the "评论" column of a selected CSV.

    The user picks the number of topics and LDA passes; each discovered
    topic's keyword/weight mixture is written to the page.

    Parameters
    ----------
    keywords : str
        Sidebar selection; the function only acts when it equals "关键词提取".
    """
    if keywords == "关键词提取":
        file = os.listdir(address)
        line = st.selectbox("选择要关键词的文件", file)
        if line.split(".")[1] != "csv":
            st.write("选择的文件不是csv的文件格式")
        if line.split(".")[1] == "csv":
            num = st.text_input("提取主题个数","10")
            Iteration = st.text_input("迭代次数", "10")
            key = st.selectbox("评论关键词分析", ["否", "是"])
            if key == "是":
                data = pd.read_csv(address + "/" + line)
                # Join all comments and segment with jieba, dropping stopwords.
                text = ''
                for i in data["评论"]:
                    text += i
                lst = jieba.lcut(text)
                stopwords = [line.strip() for line in
                open(address+"/"+'stopwords.txt', 'r', encoding='utf-8').readlines()]
                t = ''
                for word in lst:
                    if word not in stopwords:
                        t += word + " "
                # NOTE(review): gensim's simple_preprocess tokenises and
                # lower-cases — confirm it keeps CJK tokens as intended on
                # the installed gensim version.
                sentences = gensim.utils.simple_preprocess(t)
                # Build the bag-of-words corpus, one "document" per token.
                sentences_as_words = [gensim.utils.simple_preprocess(sentence) for sentence in sentences]
                vocab = set([word for sentence in sentences_as_words for word in sentence])  # unused
                dictionary = corpora.Dictionary(sentences_as_words)
                corpus = [dictionary.doc2bow(sentence_words) for sentence_words in sentences_as_words]
                # Topic modelling.
                lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=int(num),
                passes=int(Iteration))
                # Show each topic's keyword mixture.
                for topic in lda_model.print_topics():
                    st.write(topic)
            else:
                pass
        else:
            pass
if __name__ == '__main__':
    # Session state is persisted in t.txt:
    # "0" = logged out, "1" = normal user, "2" = admin.
    f = open('t.txt', encoding='gbk')
    data = int(f.read())
    if data == 1:
        # Normal-user view: logout button plus every feature panel.
        if st.sidebar.button("退出登录"):
            with open("t.txt", "w") as f:
                f.write("0")
        a = st.sidebar.selectbox("京东评论数据爬取", ("无", "评论数据爬取"))
        spider(a)
        b = st.sidebar.selectbox("服务器文件管理", ("无", "文件管理"))
        document(b)
        c = st.sidebar.selectbox("简单预处理", ("无", "预处理"))
        dispose(c)
        # One selectbox drives three chart functions; each acts only on
        # its own matching option.
        d = st.sidebar.selectbox("可视化处理", ("无", "聚合饼图","时间折线图","词云图"))
        pie(d)
        line(d)
        cloud(d)
        e = st.sidebar.selectbox("文本处理", ("无", "评论情感分析","关键词提取"))
        text(e)
        keywords(e)
    if data ==2:
        # Admin view: currently only a logout button.
        if st.sidebar.button("退出登录"):
            with open("t.txt", "w") as f:
                f.write("0")
    if data == 0:
        # Logged out: show the title screen and sidebar login form.
        st.title("京东爬虫分析系统交互界面")
        image = Image.open(address+"/"+"京东.png")
        st.image(image, use_column_width=True)
        data = login(data)
        print(data)
        # Persist the new state. food.txt is also written but never read in
        # this file — NOTE(review): confirm whether it is still used anywhere.
        if data == 1:
            with open("t.txt", "w") as f:
                f.write("1")
            with open("food.txt", "w") as f:
                f.write("1")
        if data ==2:
            with open("t.txt", "w") as f:
                f.write("2")
            with open("food.txt", "w") as f:
                f.write("1")
import streamlit as st
import pymysql
import pandas as pd
import streamlit.components.v1 as components
import mysql.connector
from wordcloud import WordCloud
from schedule import every, repeat, run_pending
from PIL import Image
import time
import io
import matplotlib.pyplot as plt
from pyecharts import options as opts
from pyecharts.charts import Bar,Pie,Line
import numpy as np
import os
from os import path
import requests
import csv
import re
import json
import streamlit_echarts as ste #ste.st_pyecharts(pie)
import jieba
import gensim
from gensim import corpora
from snownlp import SnowNLP
cnx = mysql.connector.connect(
host="localhost",
user="1234",
password="12345678Aa!",
database="login",
auth_plugin = 'mysql_native_password'
)
address = "E:/桌面/京东数据"
def login(data):
if data == 0:
# 显示登录表单
st.sidebar.subheader('登录界面')
b = st.sidebar.selectbox('用户', ["普通用户","管理员"])
username = st.sidebar.text_input("用户名")
password = st.sidebar.text_input("密码", type="password")
# 处理登录逻辑
if b =="管理员":
tableid = 'master'
else:
tableid = 'login'
if st.sidebar.button("登录"):
cursor = cnx.cursor()
query = "SELECT * FROM "+tableid+" WHERE username = %s AND password = %s"
cursor.execute(query, (username, password))
result = cursor.fetchone()
cursor.close()
if result:
st.sidebar.success("登录成功!")
run_pending()
data = 1
if tableid == 'master':
data =2
return data
else:
st.sidebar.error("用户名或密码错误。")
if st.sidebar.button("注册"):
cursor = cnx.cursor()
query = "INSERT INTO login (username, password) VALUES (%s, %s)"
cursor.execute(query, (username, password))
cnx.commit()
st.sidebar.success("注册成功!")
cursor.close()
else:
pass
def spider(spider):
if spider == "评论数据爬取":
address_web = st.text_input("网站地址", 'null')
review = st.selectbox("选择方法", ["无","全部评论","好评","中评","差评"])
if review =="全部评论":
review = 0
elif review =="好评":
review = 3
elif review =="中评":
review = 2
elif review =="差评":
review = 1
elif review =="无":
pass
else:
st.write("出错了爬取全部评论")
review =0
num = st.text_input("默认上限100", "100")
name = st.text_input("文件名称默认数据", "数据")
speed = st.select_slider("延迟爬取速度默认3秒", options=np.linspace(3,1,11))
spider_true = st.selectbox("是否爬取", ["否","是"])
if address_web!="null" and review!="无" and spider_true =="是":
comment_url = 'https://club.jd.com/comment/productPageComments.action'
csv_file = address+"/"+name+'.csv'
# print(csv_file)
f = open(csv_file, 'w', newline='', encoding='utf-8-sig') # 文件名可以根据不同更改
fieldnames = ['评论', '评论时间', '评分', '颜色', '规格', '购买时间', '用户', '型号', '产品名称', '评论间隔']
csvwriter = csv.DictWriter(f, fieldnames=fieldnames)
csvwriter.writeheader()
for i in range(int(num)):
st.write('正在获取第', i + 1, '页评论')
page = i
params = {
'productId': address_web, # 此处为不同手机的id,每个手机不同
'score': review,
'sortType': 6,
'page': page,
'pageSize': 10,
'callback': 'fetchJSON_comment98vv61',
'isShadowSku': 0,
'fold': 1
}
headers = {
'cookie': 'shshshfpa=980322f4-0d72-08ea-9cb2-4fcadde80a00-1562576627; shshshfpb=ymAFpsvPn5OjLe2TxXJVyZQ==; __jdu=16150341377512100580391; mt_xid=V2_52007VwMVUllZUF8fSx9aAWcAElNcXFtbHUEZbAYwVhdbDVkCRh9AEFsZYgdBBkEIVw1IVUlbA24KQVEPXFcIGnkaXQZnHxNaQVhbSx5AElgAbAITYl9oUWocSB9UAGIzEVVdXg==; unpl=V2_ZzNtbUBVREUmC0QBfkkMDGJRQlwSV0ATIQFGUnIZCwBnABRYclRCFnUUR1xnGl4UZwYZXEtcQRBFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHseXAFmARddQFFFEXULRlV6HVUEZQsSbXJQcyVFDENceRhbNWYzE20AAx8TcwpBVX9UXAJnBxNfR1dBE3MMRld7GF0BbgIQVUJnQiV2; PCSYCityID=CN_110000_110100_110108; user-key=0245721f-bdeb-4f17-9fd2-b5e647ad7f3e; jwotest_product=99; __jdc=122270672; mba_muid=16150341377512100580391; wlfstk_smdl=ey5hfakeb6smwvr1ld305bkzf79ajgrx; areaId=1; ipLoc-djd=1-2800-55811-0; __jdv=122270672|baidu|-|organic|not set|1632740808675; token=48ce2d01d299337c932ec85a1154c65f,2,907080; __tk=vS2xv3k1ush1u3kxvSloXsa0YznovSTFXUawXSawushwXpJyupq0vG,2,907080; shshshfp=3da682e079013c4b17a9db085fb01ea3; shshshsID=2ee3081dbf26e0d2b12dfe9ebf1ac9a8_1_1632744359396; __jda=122270672.16150341377512100580391.1615034138.1632740809.1632744359.28; __jdb=122270672.1.16150341377512100580391|28.1632744359; 3AB9D23F7A4B3C9B=OOGFR7VEBOKC3KPZ6KF3FKUOPTYV2UTP6I26CTJWT6CBR7KDFT6DA7AKGYBOIC5VE3AGWVCO44IPRLJZQM5VPBDKRE; JSESSIONID=82C0F348483686AC9A673E31126675D3.s1',
'referer': 'https://item.jd.com/',
'accept-charset': 'UTF-8',
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
}
resp = requests.get(comment_url, params=params, headers=headers)
if resp.status_code == requests.codes.ok:
regex = re.compile(r'fetchJSON_comment98vv61\((.*?)\);')
json_str = regex.search(resp.text).group(1)
json_dict = json.loads(json_str)
for item in json_dict['comments']:
content = item.get('content', '')
creationTime = item.get('creationTime', '')
score = item.get('score', '')
productColor = item.get('productColor', '')
productSize = item.get('productSize', '')
referenceTime = item.get('referenceTime', '')
nickname = item.get('nickname', '')
productSales = item.get('productSales', '')
referenceName = item.get('referenceName', '')
days = item.get('days', '')
# afterUserComment = item.get('afterUserComment', '').get("content","")
# 处理评论发布时间
date = time.strptime(creationTime, '%Y-%m-%d %H:%M:%S')
creationTime = time.strftime('%Y-%m-%d %H:%M:%S', date)
csvwriter.writerow({
'评论': content,
'评论时间': creationTime,
'评分': score,
'颜色': productColor,
'规格': productSize,
'购买时间': referenceTime,
'用户': nickname,
'型号': productSales,
'产品名称': referenceName,
'评论间隔': days
})
# print('添加评论:', content)
if item in json_dict['comments']:
time.sleep(float(speed))
else:
i -= 1
break
f.close()
print('评论抓取完成,共', i + 1, '页评论')
st.write('评论抓取完成,共', i + 1, '页评论')
elif spider_true =="是":
st.write("缺少条件无法爬取")
else:
pass
def document(document):
if document == "文件管理":
st.write("文件管理地址:",address)
file = os.listdir(address)
show_true = st.selectbox("是否显示管理目录下的文件", ["否", "是"])
if show_true =="是":
for i in file:
stats = os.stat(address + "/" + i)
st.write(i, "%.2f" % (stats.st_size / 1024 / 1024), "MB")
else:
pass
view = st.selectbox("查看文件",file)
if view.split(".")[1] == "png" or view.split(".")[1] == "jpg":
image = Image.open(address+"/"+view)
st.image(image, use_column_width=True)
if view.split(".")[1] == "jpg":
st.write("识别到是jpg图片将转换成png格式下载")
with open(address+"/"+view, 'rb') as f:
img = Image.open(f)
img_bytes = io.BytesIO()
img.save(img_bytes, format='PNG')
img_bytes = img_bytes.getvalue()
st.download_button(
label="下载图片",
data=img_bytes,
file_name="图片.png",
mime="image/png"
)
elif view.split(".")[1] == "txt":
encoding = st.selectbox("文件字符类型", ("UTF-8","GBK"))
with open(address + "/" + view, "r",encoding=encoding) as f:
text = f.read()
st.write(text)
f.close()
st.download_button(
label="下载文本",
data=text,
file_name="文本.txt",
mime="application/octet-stream"
)
elif view.split(".")[1] == "html":
text = ""
with open(address+'/'+view,encoding="utf-8")as fp:
text = fp.read()
components.html(text, height=550, width=1000)
elif view.split(".")[1] == "csv":
encoding = st.selectbox("文件字符类型", ("UTF-8","GBK"))
data = pd.read_csv(address + "/" + view,encoding=encoding)
n = st.select_slider("滑动显示数量", options=list(range(5,data.shape[0]+1)))
st.dataframe(data[0:n])
st.download_button(
label="下载CSV文件",
data=data.to_csv().encode('utf-8'),
file_name='下载.csv',
mime='text/csv'
)
elif view.split(".")[1] == "xlsx" or view.split(".")[1] == "xls":
num = st.selectbox("读取子表", ("0","1","2","3","4","5","6","7","8"))
data = pd.read_excel(address + "/" + view,sheet_name=int(num))
st.dataframe(data)
st.download_button(
label="下载CSV文件",
data=data.to_csv().encode('utf-8'),
file_name='下载.csv',
mime='text/csv'
)
else:
st.write("文件不能读取")
pass
rename_true = st.selectbox("是否重命名", ("否", "是"))
if rename_true == "是":
choose = st.selectbox("选择文件重命名", file)
rename = st.text_input('重新命名', '')
if st.button("重命名"):
if len(choose.split(".")) ==2:
os.rename(address + "/" + choose,address + "/" +rename+"."+choose.split(".")[1])
else:
os.rename(address + "/" + choose, address + "/" + rename)
st.write("重命名成功!")
else:
pass
drop_true = st.selectbox("是否删除文件", ("否","是"))
if drop_true =="是":
drop = st.selectbox("删除文件", file)
if st.button("删除文件"):
os.remove(address + "/" + drop)
st.write("删除成功!")
else:
pass
uploaded_file = st.sidebar.file_uploader("上传文件")
if uploaded_file is not None:
st.sidebar.write("上传成功!",uploaded_file.type)
with open(address + "/"+uploaded_file.name, "wb") as f:
f.write(uploaded_file.getbuffer())
else:
pass
def dispose(dispose):
if dispose =="预处理":
st.write("一键预处理评论的换行符和表情")
file = os.listdir(address)
v = st.selectbox("选择预处理文件", file)
if v.split(".")[1] != "csv" :
st.write("选择的文件不是预处理的文件格式")
if v.split(".")[1] == "csv" :
if st.button("一键预处理"):
data = pd.read_csv(address+"/"+v)
data['评论'] = data['评论'].replace('\n', '', regex=True)
st.write("去除换行符")
data['评论'] = data['评论'].replace(r'&[a-zA-Z]+;', '', regex=True)
st.write("去除表情符号")
data.to_csv(address+"/"+v,index=False)
st.write("存储成功")
d_line = st.selectbox("删除一列", ("否", "是"))
if d_line == "否":
pass
elif d_line == "是":
data = pd.read_csv(address + "/" + v)
d_line_c = st.multiselect("选择列", data.columns)
if st.button("是否删除列"):
data.drop(columns=d_line_c, axis=1, inplace=True)
data.to_csv(address + "/" + v,index=False)
st.write("自动保存")
else:
pass
def pie(pie):
if pie == "聚合饼图":
file = os.listdir(address)
pie = st.selectbox("选择要可视化饼图的文件", file)
if pie.split(".")[1] != "csv":
st.write("选择的文件不是csv的文件格式")
if pie.split(".")[1] == "csv":
data = pd.read_csv(address + "/" + pie)
group = st.selectbox("选择列聚合", data.columns)
method = st.selectbox("选择列聚合方法", ("count","sum","mean"))
if method =="count":
column = data.groupby(group).count()
elif method =="sum":
column = data.groupby(group).sum()
elif method =="mean":
column = data.groupby(group).mean()
else:
column = data.groupby(group).count()
columns = []
for i in data.columns:
if i !=group:
columns.append(i)
l = st.selectbox("选择保留列", columns)
column = column[l]
title = st.text_input("输入标题","饼图")
c = column.values
c1 = column.index
c = c.tolist()
c1 = c1.tolist()
pie = (
Pie(init_opts=opts.InitOpts(width="800px", height="550px"))
.add("", [list(z) for z in zip(c1,c)])
.set_global_opts(title_opts=opts.TitleOpts(title=str(title), pos_top='10%'), \
toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='left', pos_top='bottom',
orient='vertical'))
.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}({d}%)"))
)
ste.st_pyecharts(pie)
save_pie = st.selectbox("是否保存饼图到服务器目录", ["否", "是"])
if save_pie =="是":
name = st.text_input("保存名默认饼图","饼图")
if st.button("保存饼图"):
pie.render(address + "/" +name+".html")
st.write("保存成功!")
else:
pass
pass
else:
pass
def line(line):
if line == "时间折线图":
file = os.listdir(address)
line = st.selectbox("选择要可视化饼图的文件", file)
if line.split(".")[1] != "csv":
st.write("选择的文件不是csv的文件格式")
if line.split(".")[1] == "csv":
data = pd.read_csv(address + "/" + line)
data["购买时间"] = pd.to_datetime(data["购买时间"])
format = st.selectbox("选择时间格式",("Y","M","D") )
title = st.text_input("输入标题", "折线图")
time = data.groupby(pd.Grouper(key='购买时间', axis=0, freq=format)).count()
time = time["评论"]
data1 = {"时间": time.index, '次': time.values}
x_dt = data1["时间"].tolist()
y1 = data1["次"].tolist()
line_base = Line().add_xaxis(x_dt).add_yaxis("次数", y1, yaxis_index=0)
line_base.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
line_base.set_global_opts(title_opts=opts.TitleOpts(title=str(title), pos_top='5%'), \
toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='left', pos_top='bottom',
orient='vertical'))
ste.st_pyecharts(line_base)
save_line = st.selectbox("是否保存折线图到服务器目录", ["否", "是"])
if save_line == "是":
name = st.text_input("保存名默认折线图", "折线图")
if st.button("保存折线图"):
line_base.render(address + "/" + name + ".html")
st.write("保存成功!")
else:
pass
else:
pass
def cloud(cloud):
if cloud == "词云图":
file = os.listdir(address)
line = st.selectbox("选择要可视化饼图的文件", file)
if line.split(".")[1] != "csv":
st.write("选择的文件不是csv的文件格式")
if line.split(".")[1] == "csv":
height = st.text_input("高", "800")
width = st.text_input("宽", "600")
max_font_size = st.text_input("最大字体大小", "100")
min_font_size = st.text_input("最小字体大小", "10")
max_words = st.text_input("最多显示多少词", "100")
background_color = st.text_input("背景颜色", "white")
font_path = st.text_input("字体", "simhei.ttf")
data = pd.read_csv(address + "/" + line)
text = ''
for i in data["评论"]:
text += i
lst = jieba.lcut(text)
stopwords = [line.strip() for line in
open(address +"/"+'stopwords.txt', 'r', encoding='utf-8').readlines()]
t = ''
for word in lst:
if word not in stopwords:
t += word + " "
wc = WordCloud(
font_path=font_path, # 字体路劲
background_color=background_color, # 背景颜色
width=int(width),
height=int(height),
max_font_size=int(max_font_size), # 字体大小
scale=2,
min_font_size=int(min_font_size),
# mask=plt.imread('G\map-background.jpg','#FFFFFF'), #背景图片
max_words=int(max_words)
)
wc.generate(t)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
if st.button("显示云图"):
st.pyplot(plt)
save_c = st.selectbox("是否保存云图到服务器目录", ["否", "是"])
if save_c == "是":
name = st.text_input("保存名默认云图", "云图")
if st.button("保存云图"):
wc.to_file(address+"/"+'云图.png')
st.write("保存成功!")
else:
pass
else:
pass
def text(text):
    """Sentiment-analysis page for a scraped-comment CSV.

    Computes a SnowNLP sentiment score per comment (cached in "缓存.csv"
    on the server so Streamlit reruns are fast), shows the scores, and
    can plot comment count / mean sentiment / mean rating over time.

    Parameters
    ----------
    text : str
        Current sidebar selection; the section renders only when it
        equals "评论情感分析".
    """
    if text != "评论情感分析":
        return
    file = os.listdir(address)
    text = st.selectbox("选择要情感分析的文件", file)
    if not text.endswith(".csv"):
        st.write("选择的文件不是csv的文件格式")
        return
    emotion = st.selectbox("评论语句情感分析", ["否", "是"])
    if emotion != "是":
        return
    cache_path = address + "/" + "缓存.csv"
    if os.path.exists(cache_path):
        st.write("发现服务器中有缓存文件(缓存文件是为了响应时间快)")
        if st.button("删除缓存文件"):
            os.remove(cache_path)
            st.write("删除成功!")
    # Re-check the cache: the delete button above may just have removed it.
    # (The original read the file right after deleting it and crashed
    # with FileNotFoundError.)
    if os.path.exists(cache_path):
        df = pd.read_csv(cache_path)
        list_s = df["评分"].tolist()
        list_t = df["文本"].tolist()
    else:
        data = pd.read_csv(address + "/" + text)
        list_s = []
        list_t = []
        with open(address + '/' + 'stopwords.txt', 'r', encoding='utf-8') as fh:
            stopwords = set(w.strip() for w in fh)
        for i in data["评论"]:
            t = ''.join(word for word in jieba.lcut(i) if word not in stopwords)
            if t == '':
                t = " "  # SnowNLP cannot score an empty string
            s = SnowNLP(t)
            list_s.append(s.sentiments)
            list_t.append(t)
    # Build the (text, score) frame and refresh the cache file.
    df = pd.DataFrame(list_s, list_t)
    df = df.reset_index()
    st.write(df)
    df.columns = ["文本", "评分"]
    df.to_csv(cache_path, index=False)
    if list_s:
        st.write("平均情感分数为:", sum(list_s) / len(list_s))
    else:
        # Guard against ZeroDivisionError on an empty comment column.
        st.write("没有可分析的评论")
        return
    df_n = st.select_slider("显示数量", options=list(range(5, df.shape[0] + 1)))
    st.write(df[0:df_n])
    show_contrast = st.selectbox("情感可视化", ("否", "是"))
    format_emo = st.selectbox("时间格式", ("Y", "M", "D"))
    if show_contrast != "是":
        return
    data = pd.read_csv(address + "/" + text)
    score = pd.DataFrame(list_s)
    # Column-wise concat relies on score and data sharing the default
    # 0..n-1 index (one score per comment row).
    emo = pd.concat([score, data], axis=1)
    emo["购买时间"] = pd.to_datetime(emo["购买时间"])
    emo_time = emo.groupby(pd.Grouper(key='购买时间', axis=0, freq=format_emo)).mean()
    # NOTE(review): assumes mean() yields exactly three numeric columns
    # (sentiment, rating, one extra) in this order — verify against the
    # scraped CSV schema.
    emo_time.columns = ["情感评分", "平均评分", "0"]
    emo_time = emo_time[["情感评分", "平均评分"]]
    emo_time.fillna(0, inplace=True)
    data["购买时间"] = pd.to_datetime(data["购买时间"])
    # 'counts' instead of 'time' — the original shadowed the imported
    # time module inside this function.
    counts = data.groupby(pd.Grouper(key='购买时间', axis=0, freq=format_emo)).count()
    counts = counts["评论"]
    data1 = {"时间": counts.index, '数量': counts.values}
    data2 = {"时间": counts.index, '情感评分': emo_time["情感评分"]}
    data3 = {"时间": counts.index, '平均评分': emo_time["平均评分"]}
    x_dt = data3["时间"].tolist()
    y1 = data1["数量"].tolist()
    y2 = data2["情感评分"].tolist()
    y3 = data3["平均评分"].tolist()
    line_base = Line().add_xaxis(x_dt).add_yaxis("数量", y1, yaxis_index=0)
    line_base.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    line_base.set_global_opts(
        title_opts=opts.TitleOpts(title="情感评分、评论数量、平均分数与时间折线图", pos_top='10%'),
        toolbox_opts=opts.ToolboxOpts(is_show=True, pos_left='left',
                                      pos_top='bottom', orient='vertical'))
    line_base.extend_axis(yaxis=opts.AxisOpts())  # secondary axis for the score series
    line_2 = Line().add_xaxis(x_dt).add_yaxis("情感评分", y2, yaxis_index=1)
    line_2.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    line_3 = Line().add_xaxis(x_dt).add_yaxis("平均评分", y3, yaxis_index=1)
    line_3.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    line_base.overlap(line_2)
    line_base.overlap(line_3)
    ste.st_pyecharts(line_base)
    save_line = st.selectbox("是否保存情感折线图到服务器目录", ["否", "是"])
    if save_line == "是":
        name = st.text_input("保存名默认情感折线图", "情感折线图")
        if st.button("保存情感折线图"):
            line_base.render(address + "/" + name + ".html")
            st.write("保存成功!")
def keywords(keywords):
    """Keyword/topic-extraction page (LDA over the "评论" column).

    Tokenises each comment with jieba, removes stopwords, trains a
    gensim LDA model and prints its topics to the page.

    Parameters
    ----------
    keywords : str
        Current sidebar selection; the section renders only when it
        equals "关键词提取".
    """
    if keywords != "关键词提取":
        return
    file = os.listdir(address)
    line = st.selectbox("选择要关键词的文件", file)
    # endswith() is safe for names without a dot; the original
    # line.split(".")[1] raised IndexError on such names.
    if not line.endswith(".csv"):
        st.write("选择的文件不是csv的文件格式")
        return
    num = st.text_input("提取主题个数", "10")
    Iteration = st.text_input("迭代次数", "10")
    key = st.selectbox("评论关键词分析", ["否", "是"])
    if key != "是":
        return
    data = pd.read_csv(address + "/" + line)
    with open(address + "/" + 'stopwords.txt', 'r', encoding='utf-8') as fh:
        stopwords = set(w.strip() for w in fh)
    # Bug fix: the original ran simple_preprocess twice over one joined
    # string, so every LDA "document" held at most one token and the
    # topics were degenerate. Treat each comment as one document instead;
    # dropna()/str() guard against NaN cells from the scrape.
    docs = []
    for comment in data["评论"].dropna():
        words = [w for w in jieba.lcut(str(comment))
                 if w.strip() and w not in stopwords]
        if words:
            docs.append(words)
    # Bag-of-words corpus for gensim.
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    # Topic modelling with user-chosen topic count and pass count.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary,
        num_topics=int(num), passes=int(Iteration))
    # Show the extracted topics.
    for topic in lda_model.print_topics():
        st.write(topic)
if __name__ == '__main__':
    # Login state is persisted in t.txt between Streamlit reruns:
    # 0 = logged out, 1 = normal user, 2 = administrator.
    # `with` closes the handle (the original leaked it), and the
    # try/except keeps the very first run — no t.txt yet, or an empty
    # file — from crashing instead of showing the login page.
    try:
        with open('t.txt', encoding='gbk') as fh:
            data = int(fh.read().strip())
    except (FileNotFoundError, ValueError):
        data = 0
    if data == 1:
        # Normal-user view: full scraping / file / analysis toolchain.
        if st.sidebar.button("退出登录"):
            with open("t.txt", "w") as f:
                f.write("0")
        a = st.sidebar.selectbox("京东评论数据爬取", ("无", "评论数据爬取"))
        spider(a)
        b = st.sidebar.selectbox("服务器文件管理", ("无", "文件管理"))
        document(b)
        c = st.sidebar.selectbox("简单预处理", ("无", "预处理"))
        dispose(c)
        d = st.sidebar.selectbox("可视化处理", ("无", "聚合饼图", "时间折线图", "词云图"))
        pie(d)
        line(d)
        cloud(d)
        e = st.sidebar.selectbox("文本处理", ("无", "评论情感分析", "关键词提取"))
        text(e)
        keywords(e)
    if data == 2:
        # Administrator view: currently only the logout button.
        if st.sidebar.button("退出登录"):
            with open("t.txt", "w") as f:
                f.write("0")
    if data == 0:
        # Logged out: show the banner and the login form.
        st.title("京东爬虫分析系统交互界面")
        image = Image.open(address + "/" + "京东.png")
        st.image(image, use_column_width=True)
        data = login(data)
        print(data)  # debug trace on the server console
        if data == 1:
            with open("t.txt", "w") as f:
                f.write("1")
            with open("food.txt", "w") as f:
                f.write("1")
        if data == 2:
            with open("t.txt", "w") as f:
                f.write("2")
            with open("food.txt", "w") as f:
                f.write("1")