# -*- coding: utf-8 -*-
#本代码是在jupyter notebook上实现,author:huzhifei, create time:2018/8/14
#本脚本主要实现了基于python的gensim包里的similarities接口对文本做相似度的项目目的
#导入gensim与jieba包
from gensim import corpora, models, similarities
import jieba
# Load the stop-word list (Chinese and English) used to filter tokens.
def get_custom_stopwords(stop_words_file):
    """Read a stop-word file and return its entries as a list of strings.

    The file is expected to hold one stop word per line.  The text is split
    on ``'\\n'`` only, so blank lines (and a trailing newline at the end of
    the file) produce empty-string entries in the result.

    Args:
        stop_words_file: Path of the UTF-8 encoded stop-word file.

    Returns:
        A list with one string per line of the file, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but also swallows a leading
    # byte-order mark, which would otherwise stay glued to the first stop
    # word and keep it from ever matching a token.
    with open(stop_words_file, encoding='utf-8-sig') as f:
        return f.read().split('\n')
# Load the stop-word list once, at module level; cut() reads the resulting
# `stopwords` global when filtering tokens.
stop_words_file="stopwordsHIT.txt"  # relative path, resolved against the current working directory
stopwords=get_custom_stopwords(stop_words_file)
# Print the number of loaded entries.
print(len(stopwords))
# Tokenise a sentence with jieba and drop stop words.
def cut(sentence):
    """Segment *sentence* with jieba and return the tokens that are not stop words.

    Tokens are kept in the order jieba yields them; any token found in the
    module-level ``stopwords`` collection is left out.
    """
    kept_tokens = []
    for token in jieba.cut(sentence):
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens
#连接数据库
i