根据词表分类,统计文档关键词分类汇总
仍以之前的微博为例继续说明:对爬取下来的评论进行多维度分析。首先人工构建一个词表(共五个维度)。【本截图仅用于数据比赛,并无其他含义】
import xlrd
import jieba
import math
import os
def fun(path):
    """Recursively collect the paths of all files under *path*.

    Args:
        path: Root directory to walk.

    Returns:
        List of full file paths (str), in os.walk order.
    """
    file_array = []
    for root, dirs, files in os.walk(path):
        for fn in files:
            # os.path.join uses the platform's separator; the original
            # hard-coded '/', which mixes separators on Windows.
            file_array.append(os.path.join(root, fn))
    return file_array
def read_txt(path):
    """Read a UTF-8 text file and return its lines.

    Args:
        path: Path of the file to read.

    Returns:
        List of lines with newline characters preserved (as readlines()).
    """
    # 'with' guarantees the handle is closed even if readlines() raises;
    # the original used an explicit close() with no try/finally.
    with open(path, 'r', encoding='UTF-8') as f:
        lines = f.readlines()
    print("文本文件已完成读取 path:" + path)
    return lines
def read_excel(path):
    """Load the five-category keyword lexicon from the first sheet of a workbook.

    Columns 0-4 of the first sheet hold the keywords for the categories
    出行 / 经济 / 防护 / 政治 / 宣传, in that order.

    NOTE(review): xlrd >= 2.0 dropped .xlsx support — this assumes xlrd 1.x
    is installed; confirm the environment.

    Args:
        path: Path of the Excel lexicon file.

    Returns:
        Dict mapping category name -> list of cell values in that column.
    """
    workbook = xlrd.open_workbook(path)
    first_sheet = workbook.sheets()[0]
    categories = ("出行", "经济", "防护", "政治", "宣传")
    return {name: first_sheet.col_values(col) for col, name in enumerate(categories)}
def count(data, word_dic):
    """Count, per category, how many segmented tokens of *data* hit the lexicon.

    Each sentence is segmented with jieba; a token that appears in a
    category's keyword list increments that category's score (a token
    present in several categories increments each of them, as before).

    Args:
        data: Iterable of sentences (str).
        word_dic: Dict mapping category name -> iterable of keywords.

    Returns:
        Dict mapping each of the five categories to its hit count.
    """
    # Build one set per category up front: membership tests drop from
    # O(len(list)) to O(1), which the original paid for every token.
    word_sets = {key: set(words) for key, words in word_dic.items()}
    score = {"出行": 0, "经济": 0, "防护": 0, "政治": 0, "宣传": 0}
    for sen in data:
        for word in jieba.cut(sen):
            for key, keywords in word_sets.items():
                if word in keywords:
                    score[key] += 1
    return score
def normalization(score):
    """L2-normalize a score dict so its values form a unit vector.

    Args:
        score: Dict mapping category name -> numeric score.

    Returns:
        New dict with each value divided by the Euclidean norm of all
        values. If every score is zero, a copy of the zeros is returned
        (the original raised ZeroDivisionError in that case).
    """
    square_root = math.sqrt(sum(v * v for v in score.values()))
    if square_root == 0:
        # Guard: an all-zero vector cannot be normalized.
        return dict(score)
    return {key: value / square_root for key, value in score.items()}
def main():
    """Score every document under ./data against the lexicon and write a TSV report.

    Reads the keyword workbook, walks ./data for documents, scores each
    one with count(), and appends a tab-separated row per document to
    统计.txt (header row first).
    """
    word_dic = read_excel("程序用词典.xlsx")
    print(word_dic)
    txt_list = fun("./data")
    # 'with' guarantees the report is flushed and closed even if a
    # document fails to read or score; the original closed it manually.
    with open("统计.txt", 'w+', encoding='utf-8') as f:
        f.write("文档名\t出行\t经济\t防护\t政治\t宣传\n")
        for txt in txt_list:
            data = read_txt(txt)
            score = count(data, word_dic)
            # score = normalization(score)  # optional L2 normalization
            print(score)
            f.write(txt + "\t")
            f.write(str(score["出行"]) + "\t" + str(score["经济"]) + "\t" + str(score["防护"]))
            f.write("\t" + str(score["政治"]) + "\t" + str(score["宣传"]) + "\n")
# Run the pipeline only when this file is executed directly (not imported).
if __name__ == '__main__':
    main()
最终利用 jieba 对所有评论进行分词,并汇总关键词。
结果如下:
对于一个txt进行词频统计:
import codecs
import jieba
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import imageio
def get_words(txt):
    """Segment *txt* with jieba, count word frequencies, and write them to disk.

    Tokens of length 1 and literal '\r\n' artifacts are skipped. Counts
    are written one per line as "word count", most frequent first.

    NOTE(review): the output is GBK-encoded while the corpus is read as
    UTF-8 — any character outside GBK would raise UnicodeEncodeError;
    confirm the encoding choice is intentional.

    Args:
        txt: Full corpus text to segment and count.
    """
    counter = Counter()
    # Keep only multi-character tokens: single characters and stray
    # '\r\n' remnants are noise for frequency analysis.
    # (Original bound the generator to the name 'list', shadowing the builtin.)
    for token in jieba.cut(txt):
        if len(token) > 1 and token != '\r\n':
            counter[token] += 1
    # Write the frequency table; 'with' closes the handle, so the
    # original's extra fw.close() is dropped.
    with open(r'C:\Users\13284\Desktop\去除标签\词频统计\第四阶段4.8-6.17cipin.txt', 'w', encoding='gbk') as fw:
        for word, freq in counter.most_common():
            fw.write(word + ' ' + str(freq) + '\n')
    # Word-cloud rendering, kept disabled as in the original:
    # pac_mask = imageio.imread(r'e:\1000.png')
    # wc = WordCloud(font_path='simhei.ttf', background_color='white', max_words=2000, mask=pac_mask).fit_words(counter)
    # plt.imshow(wc)
    # plt.axis('off')
    # plt.show()
    # wc.to_file('e:\\26.png')
if __name__ == '__main__':
    # Read the raw corpus (UTF-8), then segment and count it.
    source_path = r'C:\Users\13284\Desktop\去除标签\第四阶段4.8-6.17.txt'
    with codecs.open(source_path, 'r', 'utf-8') as source_file:
        raw_text = source_file.read()
    get_words(raw_text)