根据词表分类,统计文档关键词分类汇总
仍以之前的微博为例继续说明:对爬取下来的评论进行多维度分析。首先人工构建一个词表(共五个维度)。【本截图仅用于数据比赛,并无其他含义】
import xlrd
import jieba
import math
import os
def fun(path):
    """Recursively collect the paths of all files under *path*.

    Args:
        path: Root directory to walk.

    Returns:
        List of full file paths (str), in os.walk order.
    """
    file_array = []
    for root, dirs, files in os.walk(path):
        for fn in files:
            # os.path.join uses the platform's separator; the original
            # hard-coded '/', which mixes separators on Windows.
            file_array.append(os.path.join(root, fn))
    return file_array
def read_txt(path):
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Path of the file to read.

    Returns:
        List of lines with newline characters preserved (as readlines()).
    """
    # 'with' guarantees the handle is closed even if readlines() raises;
    # the original used an explicit close() with no try/finally.
    with open(path, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
    print("文本文件已完成读取 path:" + path)
    return lines
def read_excel(path):
    """Load the five-category keyword lexicon from the first sheet of a workbook.

    Columns 0-4 of the first sheet hold the keywords for the categories
    出行 / 经济 / 防护 / 政治 / 宣传, in that order.

    NOTE(review): xlrd >= 2.0 dropped .xlsx support — this assumes xlrd 1.x
    is installed; confirm the environment.

    Args:
        path: Path of the Excel lexicon file.

    Returns:
        Dict mapping category name -> list of cell values in that column.
    """
    workbook = xlrd.open_workbook(path)
    first_sheet = workbook.sheets()[0]
    categories = ("出行", "经济", "防护", "政治", "宣传")
    return {name: first_sheet.col_values(col) for col, name in enumerate(categories)}
def count(data, word_dic):
    """Count, per category, how many segmented tokens of *data* hit the lexicon.

    Each sentence is segmented with jieba; a token that appears in a
    category's keyword list increments that category's score (a token
    present in several categories increments each of them, as before).

    Args:
        data: Iterable of sentences (str).
        word_dic: Dict mapping category name -> iterable of keywords.

    Returns:
        Dict mapping each of the five categories to its hit count.
    """
    # Build one set per category up front: membership tests drop from
    # O(len(list)) to O(1), which the original paid for every token.
    word_sets = {key: set(words) for key, words in word_dic.items()}
    score = {"出行": 0, "经济": 0, "防护": 0, "政治": 0, "宣传": 0}
    for sen in data:
        for word in jieba.cut(sen):
            for key, keywords in word_sets.items():
                if word in keywords:
                    score[key] += 1
    return score
def normalization(score):
    """L2-normalize a score dict so its values form a unit vector.

    Args:
        score: Dict mapping category name -> numeric score.

    Returns:
        New dict with each value divided by the Euclidean norm of all
        values. If every score is zero, a copy of the zeros is returned
        (the original raised ZeroDivisionError in that case).
    """
    square_root = math.sqrt(sum(v * v for v in score.values()))
    if square_root == 0:
        # Guard: an all-zero vector cannot be normalized.
        return dict(score)
    return {key: value / square_root for key, value in score.items()}
def main():
    """Score every document under ./data against the lexicon and write a TSV report.

    Reads the keyword workbook, walks ./data for documents, scores each
    one with count(), and appends a tab-separated row per document to
    统计.txt (header row first).
    """
    word_dic = read_excel("程序用词典.xlsx")
    print(word_dic)
    txt_list = fun("./data")
    # 'with' guarantees the report is flushed and closed even if a
    # document fails to read or score; the original closed it manually.
    with open("统计.txt", 'w+', encoding='utf-8') as f:
        f.write("文档名\t出行\t经济\t防护\t政治\t宣传\n")
        for txt in txt_list:
            data = read_txt(txt)
            score = count(data, word_dic)
            # score = normalization(score)  # optional L2 normalization
            print(score)
            f.write(txt + "\t")
            f.write(str(score["出行"]) + "\t" + str(score["经济"]) + "\t" + str(score["防护"]))
            f.write("\t" + str(score["政治"]) + "\t" + str(score["宣传"]) + "\n")
# Run the pipeline only when this file is executed directly (not imported).
if __name__ == '__main__':
    main()
最终利用 jieba 对所有评论进行分词,并汇总关键词。
结果如下:
对于一个txt进行词频统计:
import codecs
import jieba
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import imageio
def get_words(txt):
    """Segment *txt* with jieba, count word frequencies, and write them to disk.

    Tokens of length 1 and literal '\r\n' artifacts are skipped. Counts
    are written one per line as "word count", most frequent first.

    NOTE(review): the output is GBK-encoded while the corpus is read as
    UTF-8 — any character outside GBK would raise UnicodeEncodeError;
    confirm the encoding choice is intentional.

    Args:
        txt: Full corpus text to segment and count.
    """
    counter = Counter()
    # Keep only multi-character tokens: single characters and stray
    # '\r\n' remnants are noise for frequency analysis.
    # (Original bound the generator to the name 'list', shadowing the builtin.)
    for token in jieba.cut(txt):
        if len(token) > 1 and token != '\r\n':
            counter[token] += 1
    # Write the frequency table; 'with' closes the handle, so the
    # original's extra fw.close() is dropped.
    with open(r'C:\Users\13284\Desktop\去除标签\词频统计\第四阶段4.8-6.17cipin.txt', 'w', encoding='gbk') as fw:
        for word, freq in counter.most_common():
            fw.write(word + ' ' + str(freq) + '\n')
    # Word-cloud rendering, kept disabled as in the original:
    # pac_mask = imageio.imread(r'e:\1000.png')
    # wc = WordCloud(font_path='simhei.ttf', background_color='white', max_words=2000, mask=pac_mask).fit_words(counter)
    # plt.imshow(wc)
    # plt.axis('off')
    # plt.show()
    # wc.to_file('e:\\26.png')
if __name__ == '__main__':
    # Read the raw corpus (UTF-8), then segment and count it.
    source_path = r'C:\Users\13284\Desktop\去除标签\第四阶段4.8-6.17.txt'
    with codecs.open(source_path, 'r', 'utf-8') as source_file:
        raw_text = source_file.read()
    get_words(raw_text)