A项目之五:关于标题文本的分析

绘制词云

准备相关库

import jieba
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image  # 图片处理
import numpy as np
import pymysql
from sqlalchemy import create_engine

准备词云所需的文字

# Connect to the local MySQL database `dt` and fetch every product title
# from the `lips` table.
# NOTE(review): the credentials are hard-coded in the URL; consider moving
# them to configuration before sharing this script.
con = create_engine('mysql+pymysql://root:1356130369@localhost:3306/dt', encoding='utf8')
sql = "select 标题 from lips "
result = con.execute(sql)
result = result.fetchall()
# Build the corpus with a single join instead of growing the string with `+`
# inside a loop (which is quadratic), and skip NULL titles (returned as None),
# which would raise TypeError when concatenated to a str.
text = "".join(item[0] for item in result if item[0] is not None)

开始分词

# Segment the concatenated titles into words with jieba.
cut=jieba.cut(text)
# Join the segments with single spaces to form the text used for the word cloud.
string=" ".join(cut)

准备图片,开始画词云

# Load the mask picture and convert it to an array for WordCloud's `mask` option.
img = Image.open(r'C:\Users\zzz\Desktop\数据可视化部分前端开发素材\前端素材\tree.jpg')
img_array = np.array(img)

# Gather the WordCloud settings in one place before constructing the object.
wc_options = {
    'background_color': 'white',
    'mask': img_array,
    # Font file name; the original note points at C:\Windows\Fonts.
    'font_path': "simsun.ttc",
}
wc = WordCloud(**wc_options)
wc.generate_from_text(string)

输出词云图片到文件

plt.imshow(wc) # draw the generated word cloud as an image
plt.axis('off') # hide the coordinate axes
plt.savefig(r'C:\Users\zzz\Desktop\tree.jpg',dpi=500) # write the figure to a file at 500 dpi
关于词的分析

先利用python处理文本数据

import jieba        #分词
import numpy as np
import pandas as pd
import pymysql
from sqlalchemy import create_engine
# Read the lipstick spreadsheet and collect the title column as a plain list.
data = pd.read_excel('./口红.xlsx')
text = list(data['标题'].values)

# Segment every title; `alldata` holds one list of words per title.
alldata = [jieba.lcut(title) for title in text]



# Remove stop words from every segmented title.
stop_word = [' ', '/', '【', '】', '色']
clean_alldata = [
    [word for word in words if word not in stop_word]
    for words in alldata
]


# De-duplicate the words inside each title so that a word appears at most
# once per title. dict.fromkeys keeps the first occurrence of every word and
# preserves the original order.
alldata_clean = [list(dict.fromkeys(words)) for words in clean_alldata]



# Flatten the per-title word lists into one flat list of words.
dfdata = [word for words in alldata_clean for word in words]

# Wrap the flat list in a single-column DataFrame.
df = pd.DataFrame(dfdata)
df.columns = ['allword']

# Count the occurrences of each word, then turn the counts back into a
# two-column table.
word_counts = df.value_counts()
count_df = word_counts.reset_index()
count_df.columns = ['word', 'count']

# For each of the first 15 words in `count_df`, add up the sales figure
# (column 付款人数) of every title that contains the word.
#
# Fix: the original advanced its row counter with `i = +1`, which assigns the
# value 1 instead of incrementing, so matches were credited with the sales
# figure of the wrong row. Pairing each tokenized title with its sales figure
# through zip removes the manual counter entirely.
# NOTE(review): assumes `alldata_clean` has one entry per row of `data`, in
# the same order — confirm if rows are filtered anywhere upstream.
sales = list(data['付款人数'])
sum_number = []
for item in count_df['word'][0:15]:
    sale_list = [
        sale
        for words, sale in zip(alldata_clean, sales)
        if item in words
    ]
    sum_number.append(sum(sale_list))

# Put the sales totals and their words side by side in one two-column table.
xiaoliang = pd.DataFrame(sum_number)
gaopinci = pd.DataFrame(count_df['word'][0:15].values)
columns_side_by_side = [xiaoliang, gaopinci]
df_sum = pd.concat(columns_side_by_side, axis=1, ignore_index=True)
df_sum.columns = ['xiaoliang', 'gaopinci']

# Open the database connection; user name, password and database name are
# all embedded in the URL.
conn = create_engine('mysql+pymysql://root:1356130369@localhost:3306/dt',encoding='utf8')
# Write the table as `gpcxlqk`; 'replace' overwrites a table of the same name
# if one already exists.
df_sum.to_sql("gpcxlqk", conn, if_exists='replace')


# Sanity check: read the table back and print the data the chart will use.
xiaoliang = []
gaopinci = []
con = create_engine('mysql+pymysql://root:1356130369@localhost:3306/dt', encoding='utf8')
sql = "select xiaoliang,gaopinci from gpcxlqk "
result = con.execute(sql)
result = result.fetchall()
for item in result:
    xiaoliang.append(item[0])
    gaopinci.append(item[1])
renliang = [i * 5 for i in range(1, 16)]

# Build [score, word, sales] rows. zip stops at the shortest input, so a
# table with fewer than 15 rows no longer raises IndexError, and the rows are
# built without rebinding the builtin name `list`.
alldata = [
    [score, word, sales]
    for score, word, sales in zip(renliang, gaopinci, xiaoliang)
]

print(alldata)
print(gaopinci)
print(xiaoliang)

搭建Flask框架,使得图形能在网页显示

@app.route('/')
def gaopinci():
    """Render the high-frequency-word sales chart page.

    Reads the `xiaoliang` and `gaopinci` columns of table `gpcxlqk` and passes
    them, together with a descending list of scores (75, 70, ..., 5), to the
    template as `xiaoliang`, `gaopinci` and `renliang`.
    """
    engine = create_engine('mysql+pymysql://root:1356130369@localhost:3306/dt', encoding='utf8')
    rows = engine.execute("select xiaoliang,gaopinci from gpcxlqk ").fetchall()

    # Split the (sales, word) rows into two parallel lists.
    sales = [row[0] for row in rows]
    words = [row[1] for row in rows]

    # Same values as 5 * 15, 5 * 14, ..., 5 * 1.
    scores = list(range(75, 0, -5))

    return render_template(
        '高频词销量情况.html',
        renliang=scores,
        xiaoliang=sales,
        gaopinci=words,
    )

编写名为 高频词销量情况.html 的模板文件

<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>ECharts</title>
    <!-- Load echarts.js -->
    <script src="../static/echarts.min.js"></script>
</head>
<body>
    <!-- Container with an explicit width and height for ECharts to draw into -->
    <div id="main" style="width: 80%;height:600px;"></div>
    <script type="text/javascript">
        // Initialise the ECharts instance on the prepared container.
        var myChart = echarts.init(document.getElementById('main'));

        // Chart options and data.
        var option = {
            dataset: {
                source: [
                    ['score', 'amount', 'product'],
                    {# Emit at most 15 rows, and only as many as there is data
                       for, so a short result set does not produce empty rows.
                       The word goes through tojson so quotes and other special
                       characters become a valid JavaScript string literal. #}
                    {% for i in range(15) if i < gaopinci|length %}
                    [{{ renliang[i] }}, {{ xiaoliang[i] }}, {{ gaopinci[i]|tojson }}],
                    {% endfor %}
                ]
            },
            grid: {containLabel: true},
            xAxis: {name: 'amount'},
            yAxis: {type: 'category'},
            visualMap: {
                orient: 'horizontal',
                left: 'center',
                min: 10,
                max: 100,
                text: ['High Score', 'Low Score'],
                // Map the score column to color
                dimension: 0,
                inRange: {
                    color: ['#d7da8b', '#e15457']
                }
            },
            series: [
                {
                    type: 'bar',
                    encode: {
                        // Map the "amount" column to X axis.
                        x: 'amount',
                        // Map the "product" column to Y axis
                        y: 'product'
                    }
                }
            ]
        };

        // Render the chart with the options defined above.
        myChart.setOption(option);
    </script>
</body>
</html>
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值