import editdistance


def normalized_levenshtein(str_a, str_b):
    '''
    Edit distance between two strings, normalized by the length of str_b
    and clamped to [0, 1].
    '''
    return min(editdistance.eval(str_a, str_b) / (len(str_b) + 1e-16), 1.0)


def jaccard_set(set_a, set_b):
    '''
    Jaccard similarity between two sets: |A ∩ B| / |A ∪ B|.
    '''
    set_c = set_a.intersection(set_b)
    return float(len(set_c)) / (len(set_a) + len(set_b) - len(set_c) + 1e-16)
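
A quick usage sketch for the two measures above (hypothetical inputs; assumes the editdistance package is installed):

print(normalized_levenshtein('kitten', 'sitting'))    # 3 edits / 7 reference characters ~ 0.4286
print(jaccard_set({'a', 'b', 'c'}, {'b', 'c', 'd'}))  # |{b, c}| / |{a, b, c, d}| = 0.5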
Word vector operations
from .measure import normalized_levenshtein, jaccard_word, jaccard_char

import jieba
import numpy as np
from gensim.models import KeyedVectors

EMBEDDING_PATH = 'distance_module/zh.300.vec.gz'
EMBEDDING_DIM = 300
# Load only the first 50,000 vectors from the embedding file at import time.
DEFAULT_KEYVEC = KeyedVectors.load_word2vec_format(EMBEDDING_PATH, limit=50000)

def tokenize(text):
    # Segment Chinese text with jieba and join the tokens with spaces.
    return ' '.join(jieba.cut(text))
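
For example, jieba segments a Chinese sentence into space-joined tokens (the exact split is illustrative and depends on the jieba dictionary):

print(tokenize('我爱自然语言处理'))    # e.g. '我 爱 自然语言 处理'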

def doc2vec(tokenized):
    '''
    Average the embedding vectors of a whitespace-tokenized string.
    Tokens missing from the vocabulary are skipped.
    '''
    tokens = tokenized.split(' ')
    vec = np.full(EMBEDDING_DIM, 1e-10)
    weight = 1e-8
    for _token in tokens:
        try:
            vec += DEFAULT_KEYVEC.get_vector(_token)
            weight += 1.0
        except KeyError:
            # Out-of-vocabulary token; ignore it.
            pass
    return vec / weight

def batch_doc2vec(list_of_tokenized_text):
    return [doc2vec(_text) for _text in list_of_tokenized_text]
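
A minimal sketch of turning raw text into a single document vector (assumes the embedding file above is present and most tokens are in its vocabulary):

sentence_vec = doc2vec(tokenize('我爱自然语言处理'))
print(sentence_vec.shape)    # (300,): the averaged word vector for the sentence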

def batch_embedding_cosine_distance(text_list_a, text_list_b):
    '''
    Compute element-wise embedding cosine distances between two equal-length
    lists of tokenized texts.
    '''
    embedding_array_a = np.array(batch_doc2vec(text_list_a))
    embedding_array_b = np.array(batch_doc2vec(text_list_b))
    norm_a = np.linalg.norm(embedding_array_a, axis=1)
    norm_b = np.linalg.norm(embedding_array_b, axis=1)
    cosine_numer = np.multiply(embedding_array_a, embedding_array_b).sum(axis=1)
    cosine_denom = np.multiply(norm_a, norm_b)
    cosine_dist = 1.0 - np.divide(cosine_numer, cosine_denom)
    return cosine_dist.tolist()
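
An end-to-end usage sketch with hypothetical sentence pairs; both lists must have the same length, since distances are computed element-wise:

texts_a = [tokenize('今天天气很好'), tokenize('我爱自然语言处理')]
texts_b = [tokenize('今天天气不错'), tokenize('机器学习很有趣')]
print(batch_embedding_cosine_distance(texts_a, texts_b))    # two floats; smaller means more similar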