# -*- coding:utf-8 -*-
"""Score a query sentence against a small Chinese corpus using TF-IDF.

Pipeline: segment every sentence with jieba, build a gensim dictionary and
bag-of-words corpus from the segmented sentences, fit a TF-IDF model on that
corpus, index the TF-IDF-weighted corpus, and query the index with the
TF-IDF vector of the test sentence. The result holds one similarity score
per corpus sentence, in the order the sentences appear in ``all_doc``.
"""
import jieba
from gensim import corpora, models, similarities

# Corpus sentences and the query sentence (runtime data; keep verbatim).
doc0 = '我爱他'
doc1 = '我不知道他爱不爱我'
doc2 = '他爱我但是我不知道'
doc3 = '谁管他爱谁谁'
doc_test = '我觉得我爱他'

# Call form of print: with a single argument it behaves the same on Python 2
# and is the only form that parses on Python 3.
print(doc0)

all_doc = [doc0, doc1, doc2, doc3]

# Segment each sentence into a list of tokens.
all_doc_list = [list(jieba.cut(doc)) for doc in all_doc]
print(all_doc_list)
doc_test_list = list(jieba.cut(doc_test))

# Token -> integer id mapping built from the corpus sentences only.
dictionary = corpora.Dictionary(all_doc_list)

# Bag-of-words vectors for the corpus and for the query.
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
doc_test_vec = dictionary.doc2bow(doc_test_list)

# TF-IDF weights are fitted on the corpus, then applied to corpus and query.
tfidf = models.TfidfModel(corpus)

# The index needs the vector-space width, i.e. the dictionary size.
index = similarities.SparseMatrixSimilarity(
    tfidf[corpus], num_features=len(dictionary)
)
sim = index[tfidf[doc_test_vec]]
print(sim)
# 可用的文本相似度
# 最新推荐文章于 2024-01-09 01:18:06 发布