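# Semantic similarity measures how close two sentences are in meaning: vectorize both sentences, then compute the cosine similarity (equivalently, the cosine distance) between the two vectors.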
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 21 20:18:38 2019
@author: lcl
"""
from sklearn.feature_extraction.text import CountVectorizer
import math
import jieba
from setting import logger
# Build the stop-word list
def stop_word_list(path):
    """Read the stop-word file at *path* and return its lines as a list.

    The file is decoded as UTF-8 and every line is stripped of leading and
    trailing whitespace (including the newline). Blank lines are kept as
    empty strings, and the order of the file is preserved.

    Args:
        path: Location of the stop-word file, one entry per line.

    Returns:
        A list of stripped lines, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # The context manager closes the handle even if reading or decoding fails;
    # iterating the file object avoids the extra list built by readlines().
    with open(path, 'r', encoding='utf-8') as stop_file:
        return [line.strip() for line in stop_file]
# Preprocess text
def preprocess(text, stop_words_path="data/stop_words.txt"):
    """Segment *text* with jieba and drop stop words and tab tokens.

    The input is stripped, cut into tokens with ``jieba.cut``, and every
    token that is a stop word or a literal tab character is discarded. The
    surviving tokens are emitted in order, each followed by a single space
    (so a non-empty result ends with a trailing space).

    Args:
        text: The sentence to preprocess; must be a ``str``.
        stop_words_path: Stop-word file passed to ``stop_word_list``.
            Defaults to the path the function has always used.

    Returns:
        The space-separated tokens as one string, or ``""`` if no token
        survives filtering.

    Raises:
        TypeError: If *text* is not a ``str``.
    """
    if not isinstance(text, str):
        raise TypeError('text should be str')

    # A set makes each membership test O(1) instead of a linear scan of the
    # whole stop-word list for every token.
    stopwords = set(stop_word_list(stop_words_path))

    # Each kept token is followed by one space, including the last one,
    # which matches the historical output format exactly.
    return "".join(
        word + " "
        for word in jieba.cut(text.strip())
        if word != '\t' and word not in stopwords
    )
def norm_vector_nonzero(ori_vec):
ori