# Reference: http://www.52nlp.cn/ — "How to compute the similarity of two documents (part 3)"
#encoding=utf-8
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
# Load the corpus: one course per line, tab-separated fields, with the
# course name in the first field.
# NOTE: the original used the deprecated `file()` builtin and never closed
# the handle; `with open(...)` closes it deterministically.
with open('coursera_corpus') as corpus_file:
    courses = [line.strip() for line in corpus_file]
courses_name = [course.split('\t')[0] for course in courses]
# Tokenize each course description (decoded from UTF-8 bytes) and
# lowercase every token so later filtering is case-insensitive.
texts_tokenized = [[word.lower() for word in word_tokenize(document.decode('utf-8'))]
                   for document in courses]
# Remove English stopwords (NLTK's list) and punctuation tokens.
# Both filters are converted to sets: `word not in english_stopwords` on a
# ~180-element list is an O(n) scan per token; set membership is O(1).
english_stopwords = set(stopwords.words('english'))
texts_filtered_stopwords = [[word for word in document if word not in english_stopwords]
                            for document in texts_tokenized]
# Filter out punctuation that word_tokenize emits as standalone tokens.
english_punctuations = set([',', '.', ':', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%'])
texts_filted = [[word for word in document if word not in english_punctuations]
                for document in texts_filtered_stopwords]
# Stem every token so inflected variants ("learning", "learned", "learns")
# collapse onto a single stem before frequency counting.
stemmer = LancasterStemmer()
texts_stemmed = [
    [stemmer.stem(token) for token in document]
    for document in texts_filted
]
st = stemmer  # keep the original alias in case later code refers to it
# Drop hapax legomena: tokens that occur only once in the whole corpus
# carry no similarity signal and would bloat the dictionary.
# `collections.Counter` replaces the hand-rolled defaultdict counting loop.
from collections import Counter
frequency = Counter(token for text in texts_stemmed for token in text)
texts = [[token for token in text if frequency[token] > 1]
         for text in texts_stemmed]
from gensim import corpora, models, similarities
import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Map each token to an integer id, then represent every document as a
# bag-of-words vector of (token_id, count) pairs.
dictionary=corpora.Dictionary(texts)
corpus=[dictionary.doc2bow(text) for text in texts]
# Reweight raw counts with TF-IDF so corpus-wide common terms count less.
tfidf=models.TfidfModel(corpus)
corpus_tfidf=tfidf[corpus]
# Latent Semantic Indexing: fold the TF-IDF vectors into 10 latent topics.
lsi=models.LsiModel(corpus_tfidf,id2word=dictionary,num_topics=10)
# Precompute a cosine-similarity index over every document in LSI space.
index=similarities.MatrixSimilarity(lsi[corpus])
print "the query course is:",courses_name[174]
ml_course=texts[174]
ml_bow=dictionary.doc2bow(ml_course)
ml_lsi=lsi[ml_bow]
sims=index[ml_lsi]
sort_sims=sorted(enumerate(sims),key=lambda item:-item[1])
courses_nameTop=[tup[0]for tup in sort_sims[0:10]]
courses_sim=[courses_name[num] for num in courses_nameTop]
print "the similarity courses are:"
for doc in courses_sim:
print doc