#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import codecs

import jieba
from gensim import corpora, models, similarities


def cut_words(file):
    """Read a UTF-8 text file and segment its whole content with jieba.

    Args:
        file: Path of the text file to read.

    Returns:
        The list of tokens returned by ``jieba.lcut`` for the file's text.
    """
    with open(file, 'r', encoding="utf-8") as f:
        text = f.read()
    return jieba.lcut(text)


def drop_Disable_Words(cut_res, stopwords):
    """Remove stop words and layout whitespace from a token list.

    Args:
        cut_res: Iterable of tokens, e.g. the result of ``cut_words``.
        stopwords: Collection of stop-word strings to discard.

    Returns:
        A new list holding the surviving tokens in their original order.
    """
    # Build the lookup set once so each membership test is O(1) instead of
    # scanning the stop-word list for every token.
    excluded = set(stopwords)
    # Newlines and the full-width space (U+3000) are always dropped, whether
    # or not the stop-word list mentions them.
    excluded.update(("\n", "\u3000"))
    return [word for word in cut_res if word not in excluded]


def read_stop_word(file_path):
    """Load a stop-word list from a UTF-8 file with one entry per line.

    Args:
        file_path: Path of the stop-word file.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace, in file order (blank lines yield empty strings).
    """
    # The context manager closes the handle; the previous version left it open.
    with codecs.open(file_path, 'r', encoding='utf8') as f:
        return [line.strip() for line in f]


# Load the raw corpus and the stop-word list
files = ['file1.txt', 'file2.txt', 'file3.txt']
stopwords = read_stop_word("stop_word.txt")

# Tokenize each document, then remove its stop words
corpus = []
for file in files:
    # Word segmentation
    cut_res = cut_words(file)
    # Stop-word removal
    res = drop_Disable_Words(cut_res, stopwords)
    corpus.append(res)

# Build the bag-of-words model
dictionary = corpora.Dictionary(corpus)
doc_vectors = [dictionary.doc2bow(text) for text in corpus]

# Dictionary attributes that are handy to print while debugging:
#   dictionary.num_docs  - number of documents
#   dictionary.num_pos   - total number of words
#   dictionary.dfs       - document frequencies, {word id: number of documents containing it}
#   dictionary.id2token  - {word id: word}
#   dictionary.token2id  - {word: word id}
#   dictionary.num_nnz   - sum over all files of the number of distinct words per file

# Weight the bag-of-words vectors with TF-IDF
tfidf = models.TfidfModel(doc_vectors)
tfidf_vectors = tfidf[doc_vectors]
print(len(tfidf_vectors))
print(len(tfidf_vectors[0]))
print(tfidf_vectors[0])


# Build the TF-IDF model
def TF_IDF(tfidf_vectors, doc_vectors):
    """Print how similar the first document is to every indexed document.

    A ``similarities.MatrixSimilarity`` index is built over
    ``tfidf_vectors`` and queried with ``doc_vectors[0]``; the scores are
    printed as a list of ``(document index, score)`` pairs.

    Args:
        tfidf_vectors: Corpus the similarity index is built from.
        doc_vectors: Indexable collection of document vectors; only
            element 0 is used, as the query.

    Returns:
        None. The result is written to standard output.
    """
    index = similarities.MatrixSimilarity(tfidf_vectors)
    # NOTE(review): the index is built from tfidf_vectors but the query is
    # taken from doc_vectors. If doc_vectors holds un-weighted bag-of-words
    # vectors, query and index live in different vector spaces -- confirm
    # this is intended before relying on the scores.
    sims = index[doc_vectors[0]]
    print(list(enumerate(sims)))


# Build the LSI model
def LSI(tfidf_vectors, dictionary, doc_vectors, theme_num):
    """Print LSI-space similarities of the first document to all documents.

    An ``models.LsiModel`` is trained on ``tfidf_vectors``, the corpus is
    projected into the LSI space and indexed with
    ``similarities.MatrixSimilarity``; ``doc_vectors[0]`` is projected with
    the same model and used as the query. The scores are printed as a list
    of ``(document index, score)`` pairs.

    Args:
        tfidf_vectors: Corpus used both to train the LSI model and to build
            the similarity index.
        dictionary: Passed to the model as ``id2word``.
        doc_vectors: Indexable collection of document vectors; only
            element 0 is used, as the query.
        theme_num: Passed to the model as ``num_topics``.

    Returns:
        None. The result is written to standard output.
    """
    lsi = models.LsiModel(tfidf_vectors, id2word=dictionary, num_topics=theme_num)
    lsi_vector = lsi[tfidf_vectors]
    # NOTE(review): the model is trained on tfidf_vectors but the query is
    # projected from doc_vectors[0]. If doc_vectors holds un-weighted
    # bag-of-words vectors the query skips the TF-IDF step -- confirm this
    # is intended.
    query_lsi = lsi[doc_vectors[0]]
    index = similarities.MatrixSimilarity(lsi_vector)
    sims = index[query_lsi]
    print(list(enumerate(sims)))


# Compute similarities with the LSI model
LSI(tfidf_vectors, dictionary, doc_vectors, theme_num=2)

# Compute similarities with the TF-IDF model
TF_IDF(tfidf_vectors, doc_vectors)