import numpy as np
from scipy.linalg import norm
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def jaccard_similarity(s1, s2):
    """Return the count-weighted Jaccard similarity of two strings.

    Both strings are split on whitespace and turned into term-count
    vectors over their joint vocabulary. The score is the sum of the
    element-wise minima divided by the sum of the element-wise maxima,
    i.e. multiset intersection size over multiset union size.

    Args:
        s1: First whitespace-separated string.
        s2: Second whitespace-separated string.

    Returns:
        The ratio described above.
    """
    counts = CountVectorizer(tokenizer=str.split).fit_transform([s1, s2]).toarray()
    # Per-term minimum = occurrences shared by both strings;
    # per-term maximum = occurrences present in either string.
    shared = counts.min(axis=0).sum()
    combined = counts.max(axis=0).sum()
    return shared / combined
def cosine_similarity_tf(s1, s2):
    """Return the cosine similarity of the term-count vectors of two strings.

    Both strings are split on whitespace and vectorised together with
    ``CountVectorizer`` so that they share one vocabulary.

    Args:
        s1: First whitespace-separated string.
        s2: Second whitespace-separated string.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` for the two count vectors, or
        ``0.0`` when either string contributes no tokens (its vector is
        all zeros, so the cosine is undefined).

    Note:
        scikit-learn raises ``ValueError`` (empty vocabulary) when neither
        string contains any token; that behaviour is left unchanged.
    """
    vectorizer = CountVectorizer(tokenizer=lambda s: s.split())
    first, second = vectorizer.fit_transform([s1, s2]).toarray()
    scale = norm(first) * norm(second)
    if scale == 0:
        # A string without tokens yields a zero vector; dividing would
        # produce nan plus a RuntimeWarning, so report "no similarity".
        return 0.0
    return np.dot(first, second) / scale
def cosine_similarity_tfidf(s1, s2):
    """Return the cosine similarity of the TF-IDF vectors of two strings.

    Both strings are split on whitespace and vectorised together with
    ``TfidfVectorizer``; the vectorizer is fitted on just these two
    strings, so the idf weights come from this two-document corpus.

    Args:
        s1: First whitespace-separated string.
        s2: Second whitespace-separated string.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` for the two TF-IDF vectors, or
        ``0.0`` when either string contributes no tokens (its vector is
        all zeros, so the cosine is undefined).

    Note:
        scikit-learn raises ``ValueError`` (empty vocabulary) when neither
        string contains any token; that behaviour is left unchanged.
    """
    vectorizer = TfidfVectorizer(tokenizer=lambda s: s.split())
    first, second = vectorizer.fit_transform([s1, s2]).toarray()
    scale = norm(first) * norm(second)
    if scale == 0:
        # A string without tokens yields a zero vector; dividing would
        # produce nan plus a RuntimeWarning, so report "no similarity".
        return 0.0
    return np.dot(first, second) / scale
# Demo input: two strings whose characters are separated by spaces, so the
# whitespace tokenizer treats every character as one term.
params = ['新 疆 生 产 建 设 兵 团 阿 克 苏 垦 区 人 民 检 察 院', '新 疆 产 建 设 兵 团 阿 克 苏 垦 区 ']
# Print each metric for the demo pair, in the order the functions are defined.
for similarity in (jaccard_similarity, cosine_similarity_tf, cosine_similarity_tfidf):
    print(similarity(*params))