"""Tf-idf computation.

Main contents:
- Read the .xls document containing all comments and their word frequencies.
- Read the pre-segmented comment file and the stop-word .txt file.
- Output each comment's words and frequencies in dict form, with each comment
  as one list element (convenient for building the sparse matrix later).
- Compute each word's tf-idf value and write the results to an .xls file.

Concrete steps:
- Read the comment document, segment it into words, and count word frequencies.
- Output the segmented words and their counts in dict format.
"""
import xlrd
import thulac
import xlwt
import numpy as np
from sklearn.feature_extraction import DictVectorizer
# Load the workbook of all comments with their word counts.
data = xlrd.open_workbook('E:\\wholeComm_wordCount_book1_570.xls')
table = data.sheet_by_name('wholeComm_wordCount_book1_570')
# First column of the sheet (the comment texts, presumably).
# NOTE(review): `table1` is not used anywhere in this file — confirm whether
# it is needed or dead code before removing.
table1 = table.col_values(0)
# Segment the raw comment document with THULAC (seg_only=True: no POS tags).
thu1 = thulac.thulac(seg_only = True)
thu1.cut_f("E:\\wholeComm_book01_570.txt", "E:\\wholeComm_book01_570cutput2.txt")
# Segmented comments, one comment per line, words separated by spaces.
# NOTE(review): opened without an explicit encoding, so the platform default
# is used — confirm it matches the encoding thulac wrote.
file = open("E:\\wholeComm_book01_570cutput2.txt", 'r') # path of the document to analyse
txt = file.readlines() # read the whole file at once, as a list of lines
file2 = open("E:\\Octopus\\task01_book\\StopWords.txt", 'r', encoding='utf-8') # stop-word file path
stopwords = file2.readlines() # stop words, one per line (each entry keeps its trailing '\n')
# Build one {word: count} dict per comment; each dict becomes one element of
# `vector` (later fed to DictVectorizer to form the document/word matrix).
#
# BUG FIX 1: the original kept a single `count` variable shared across all
# words of a line, so a word repeated non-consecutively was under-counted
# (e.g. "A A B A" gave A -> 2 instead of 3). Counting now uses dict.get.
# BUG FIX 2: `readlines()` keeps the trailing '\n' on every stop word, so the
# original `word in stopwords` test never matched and stop words were never
# filtered. The set below strips each entry once, up front.
stopword_set = {s.strip() for s in stopwords}
vector = []
for line in txt:
    counts = {}
    # The segmented line ends with a separator before the newline, so the
    # last fragment after split(' ') is not a real word — skip it ([:-1]).
    for word in line.split(' ')[:-1]:
        if word in stopword_set:  # skip stop words
            continue
        counts[word] = counts.get(word, 0) + 1
    vector.append(counts)
file.close()
file2.close()
# --- Compute tf-idf values ---
# DictVectorizer turns the list of {word: count} dicts into a document/word
# frequency matrix: one row per comment, one column per distinct word.
# (The original comment described CountVectorizer, which is not what is used.)
vectorizer = DictVectorizer(sparse=False)  # sparse=False -> dense numpy ndarray
matrix = vectorizer.fit_transform(vector)
# Column sums: total occurrences of each word across all comments.
sumword = [sum(col) for col in zip(*matrix)]
# Transpose so each row is one word's counts across comments, then count the
# comments in which the word appears at least once (document frequency).
transpose = list(map(list, zip(*matrix)))
sentence = [len(col) - col.count(0) for col in transpose]
total = sum(sumword)  # hoisted: was recomputed on every tf iteration
tf = [round(c / total, 4) for c in sumword]
# Smoothed idf: +1 in the denominator avoids division by zero.
idf = [round(np.log(len(matrix) / (df + 1)), 4) for df in sentence]
tfidf = [round(t * i, 4) for t, i in zip(tf, idf)]
sort_tfidf = list(np.sort(tfidf))  # tf-idf values in ascending order
# BUG FIX: the indices must come from sorting tfidf itself. The original used
# np.argsort(sentence), which paired the top tf-idf *values* with the indices
# of the highest-document-frequency *words*, so `feature` listed wrong words.
index = list(np.argsort(tfidf))
# Top 80 values/indices, highest first (the original range(1, 81) also yields
# 80 items despite its comment saying 81). min() avoids an IndexError when the
# vocabulary has fewer than 80 words.
top_k = min(80, len(tfidf))
rank = [[sort_tfidf[-i] for i in range(1, top_k + 1)],
        [index[-i] for i in range(1, top_k + 1)]]
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out() — keep as-is for the installed version, but
# confirm before upgrading sklearn.
wordname = vectorizer.get_feature_names()  # the dict keys: the vocabulary
feature = [wordname[i] for i in rank[1]]  # words with the top tf-idf values
print('wordname:', wordname)
print('feature:', feature)
# Write the top-ranked words and their tf-idf values to an .xls file:
# column 0 = word, column 1 = that word's tf-idf value.
wbk3 = xlwt.Workbook(encoding='utf8')
sheet = wbk3.add_sheet("nodisComm_wordCount_book1_570")
for i in range(len(feature)):
    # BUG FIX: the original wrote tfidf[i] — the first N tf-idf values in
    # vocabulary order — next to the i-th *ranked* word, so values and words
    # did not correspond. rank[1][i] is the vocabulary index of feature[i],
    # so tfidf[rank[1][i]] is that word's own tf-idf value.
    sheet.write(i, 1, label=tfidf[rank[1][i]])
    sheet.write(i, 0, label=feature[i])
wbk3.save('E:\\wholeComm_word_80tfidf.xls')