Classifying document keywords with a category word table and tallying the totals per category

Continuing the earlier Weibo example: for the crawled comments we run a multi-dimensional analysis. First, a word table with five dimensions is built by hand: 出行 (travel), 经济 (economy), 防护 (protection), 政治 (politics) and 宣传 (publicity). [The screenshot is from a data-competition entry only and carries no other implications.]
[Figure: the hand-built five-dimension word table, 程序用词典.xlsx]
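To make the expected layout concrete: read_excel() below assumes each of the five spreadsheet columns holds the word list of one dimension, so the dictionary it returns has roughly the following shape. The entries shown here are only illustrative placeholders, not the actual competition word list.

# Illustrative shape only -- the real words come from 程序用词典.xlsx.
word_dic = {
    "出行": ["出行", "公交", "航班"],
    "经济": ["经济", "复工", "消费"],
    "防护": ["口罩", "消毒", "隔离"],
    "政治": ["政策", "政府"],
    "宣传": ["宣传", "报道"],
}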

import xlrd
import jieba
import math
import os


def fun(path):
    # Walk the directory tree and collect the full path of every file under `path`.
    file_array = []
    for root, dirs, files in os.walk(path):
        for fn in files:
            eachpath = str(root + '/' + fn)
            file_array.append(eachpath)
    return file_array


def read_txt(path):
    # Read one comment file (one comment per line, UTF-8 encoded).
    with open(path, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
    print("文本文件已完成读取 path:" + path)
    return lines


def read_excel(path):
    # Load the hand-built word table: columns 0-4 of the first sheet hold the
    # word lists for the five dimensions. Note that xlrd >= 2.0 no longer opens
    # .xlsx files, so an older xlrd (e.g. 1.2.0) is required for this to work.
    book = xlrd.open_workbook(path)
    sheet = book.sheets()[0]
    word_dic = {}
    word_dic["出行"] = sheet.col_values(0)
    word_dic["经济"] = sheet.col_values(1)
    word_dic["防护"] = sheet.col_values(2)
    word_dic["政治"] = sheet.col_values(3)
    word_dic["宣传"] = sheet.col_values(4)
    return word_dic


def count(data, word_dic):
    # Segment every comment with jieba and, for each token, add 1 to the score
    # of every dimension whose word list contains that token.
    score = {"出行": 0, "经济": 0, "防护": 0, "政治": 0, "宣传": 0}
    for sen in data:
        seg_list = jieba.cut(sen)
        for word in seg_list:
            for key in word_dic.keys():
                if word in word_dic[key]:
                    score[key] += 1
    return score


def normalization(score):
    # L2-normalize the five scores so documents of different lengths are comparable.
    sum_of_squares = 0
    new_score = {}
    for key in score.keys():
        sum_of_squares += score[key] * score[key]
    square_root = math.sqrt(sum_of_squares)
    if square_root == 0:
        return score  # no keyword was hit at all; avoid dividing by zero
    for key in score.keys():
        new_score[key] = score[key] / square_root
    return new_score


def main():
    # 1. Load the five-dimension word table.
    word_dic = read_excel("程序用词典.xlsx")
    print(word_dic)
    # 2. Collect every comment file under ./data.
    txt_list = fun("./data")
    # 3. Score each file and write one tab-separated row per document.
    f = open("统计.txt", 'w+', encoding='utf-8')
    f.write("文档名\t出行\t经济\t防护\t政治\t宣传\n")
    for txt in txt_list:
        data = read_txt(txt)
        score = count(data, word_dic)
        # score = normalization(score)  # optionally L2-normalize before writing
        print(score)
        f.write(txt + "\t")
        f.write(str(score["出行"]) + "\t" + str(score["经济"]) + "\t" + str(score["防护"]))
        f.write("\t" + str(score["政治"]) + "\t" + str(score["宣传"]) + "\n")
    f.close()


if __name__ == '__main__':
    main()
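Since xlrd 2.x dropped .xlsx support, a drop-in replacement for read_excel() based on openpyxl may be more convenient on a newer environment. This is only a sketch; it assumes the same spreadsheet layout as above (five columns on the first sheet, one dimension per column) and returns the same kind of dictionary, so count() keeps working unchanged.

from openpyxl import load_workbook

def read_excel_openpyxl(path):
    # Assumed layout: columns 1-5 of the first sheet hold the word lists
    # for 出行 / 经济 / 防护 / 政治 / 宣传, listed top to bottom.
    sheet = load_workbook(path).worksheets[0]
    keys = ["出行", "经济", "防护", "政治", "宣传"]
    word_dic = {}
    for key, col in zip(keys, sheet.iter_cols(min_col=1, max_col=5, values_only=True)):
        word_dic[key] = [cell for cell in col if cell is not None]
    return word_dic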

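A small, behavior-preserving speed-up is also possible: count() tests every token against plain lists, so converting the word lists to sets makes each membership check O(1). A sketch (not part of the original script; the suggested call site in main() is hypothetical):

def to_sets(word_dic):
    # Set membership tests are O(1) versus O(n) on the column lists, which
    # matters when many comment files are scored against long word lists.
    return {key: set(words) for key, words in word_dic.items()}

# hypothetical placement in main(), before the scoring loop:
# word_dic = to_sets(read_excel("程序用词典.xlsx"))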
Finally, every comment is segmented with jieba and the keyword hits are aggregated per dimension.
The results are as follows:
[Figure: the per-document keyword counts written to 统计.txt]
Word-frequency statistics for a single txt file:

import codecs
import jieba
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import imageio

def get_words(txt):
    # Segment the whole text with jieba.
    words = jieba.cut(txt)
    c = Counter()

    # Keep only tokens longer than one character (this drops punctuation,
    # single characters and line breaks), then count their frequencies.
    for x in words:
        if len(x) > 1 and x != '\r\n':
            c[x] += 1

    # Write the frequencies, most common first, to a text file.
    with open(r'C:\Users\13284\Desktop\去除标签\词频统计\第四阶段4.8-6.17cipin.txt', 'w', encoding='gbk') as fw:
        for (k, v) in c.most_common():
            fw.write(k + ' ' + str(v) + '\n')

    # Optional: draw a word cloud from the same counter (left commented out).
    # pac_mask = imageio.imread(r'e:\1000.png')
    # wc = WordCloud(font_path='simhei.ttf', background_color='white', max_words=2000, mask=pac_mask).fit_words(c)
    # plt.imshow(wc)
    # plt.axis('off')
    # plt.show()
    # wc.to_file('e:\\26.png')

if __name__ == '__main__':
    # Read the text to be segmented.
    with codecs.open(r'C:\Users\13284\Desktop\去除标签\第四阶段4.8-6.17.txt', 'r', 'utf-8') as f:
        txt = f.read()
    get_words(txt)
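If the word-cloud output is wanted, a minimal sketch without a mask image looks like this. It takes the Counter built in get_words(); font_path must point to a font with Chinese glyphs (simhei.ttf here, as in the commented-out code) and the output path is only an example.

import matplotlib.pyplot as plt
from wordcloud import WordCloud

def draw_wordcloud(freqs, out_png):
    # freqs: the Counter of word frequencies produced above.
    wc = WordCloud(font_path='simhei.ttf', background_color='white',
                   max_words=2000).generate_from_frequencies(freqs)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
    wc.to_file(out_png)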