# De-tokenization: join each tokenized Chinese document back into one
# space-separated string (the form CountVectorizer/TfidfVectorizer expect).
# NOTE(review): the loop length comes from news_df but the column is set on
# news_cn_df — confirm these refer to the same DataFrame.
detokenized_cn_doc = [' '.join(tokenized_cn_doc[idx]) for idx in range(len(news_df))]
news_cn_df['token_cn_doc'] = detokenized_cn_doc
# detokenized_cn_doc format is: ['崔宥莉 成为 了 中国女足 又 一名 强劲 的 对手', '本文 为 作者 原创 未经 授权 不得 转载']
import numpy as np
import xlwt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Term frequency: build the raw count matrix over the detokenized documents.
count_vectorizer = CountVectorizer(min_df=1)
term_freq_matrix = count_vectorizer.fit_transform(detokenized_cn_doc)
s = count_vectorizer.vocabulary_  # dict mapping term -> column index in the matrix
print('type(s)==', type(s))
# BUG FIX: dict iteration order is insertion (first-seen) order, which does NOT
# match the matrix column order. Sort the terms by their column index so that
# Vocabulary_list[i] is the label of column/row i in the downstream matrices.
Vocabulary_list = sorted(s, key=s.get)
# out:
# type(s)== <class 'dict'>
# TF-IDF weighting over the same detokenized documents.
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(detokenized_cn_doc)
K = tfidf_matrix.toarray()  # K: document-term matrix (n_docs x n_terms) as ndarray
print('K==', len(K))  # number of documents (rows)
KT = K.T  # transpose of K: term-document matrix
# FIX: the original comment said "K*K.T", but this is K.T @ K — a
# term-by-term matrix of size n_terms x n_terms, not doc-by-doc.
K_dot = np.dot(KT, K)
print('K_dot:', len(K_dot))  # number of terms (rows of the term-term matrix)
# out:
# K== 15
# K_dot: 555
# Write the term-document matrix to an Excel file with xlwt.
workbook = xlwt.Workbook(encoding='ascii')
worksheet1 = workbook.add_sheet('My Worksheet1')
# Header row: one column per document.
worksheet1.write(0, 0, 'doc编号')
for col in range(1, len(K) + 1):
    worksheet1.write(0, col, 'doc{}'.format(col))
# First column: one term label per row.
for row in range(1, len(K_dot) + 1):
    worksheet1.write(row, 0, Vocabulary_list[row - 1])
# Body: cell (row, col) holds the weight of term `row` in document `col`
# (K is doc-term, so index document first, then term).
for col in range(1, len(K) + 1):
    for row in range(1, len(K_dot) + 1):
        worksheet1.write(row, col, K[col - 1][row - 1])
workbook.save(r'J:\2020\实验数据_结果\词项-文档矩阵_cn.xls')  # save the file