String distance measures (measure.py)

import editdistance

def normalized_levenshtein(str_a, str_b):
    '''
    Edit distance normalized to [0, 1].
    Note: the raw distance is divided by len(str_b), so the measure is
    asymmetric; the min() clamp keeps the result within [0, 1].
    '''
    return min(editdistance.eval(str_a, str_b) / (len(str_b) + 1e-16), 1.0)

def jaccard_set(set_a, set_b):
    '''
    Jaccard SIMILARITY between sets.
    '''
    set_c = set_a.intersection(set_b)
    return float(len(set_c)) / (len(set_a) + len(set_b) - len(set_c) + 1e-16)
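
For reference, a quick sanity check of both measures (the sample strings below are arbitrary):

# One substitution against 'kitten', normalized by len('kitten'): 1/6 ≈ 0.1667
print(normalized_levenshtein('sitten', 'kitten'))

# |{'b', 'c'}| / |{'a', 'b', 'c', 'd'}| = 2/4 = 0.5
print(jaccard_set({'a', 'b', 'c'}, {'b', 'c', 'd'}))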

Word vector operations

from .measure import normalized_levenshtein, jaccard_word, jaccard_char
from gensim.models import KeyedVectors
import jieba
import numpy as np

EMBEDDING_PATH = 'distance_module/zh.300.vec.gz'
EMBEDDING_DIM = 300
# Load the 50,000 most frequent word vectors once, at import time.
DEFAULT_KEYVEC = KeyedVectors.load_word2vec_format(EMBEDDING_PATH, limit=50000)

def tokenize(text):
    # Segment Chinese text with jieba; tokens are joined with single spaces
    # so that doc2vec() can split them back out.
    return ' '.join(jieba.cut(text))

def doc2vec(tokenized):
    '''
    Average the word vectors of all in-vocabulary tokens.
    The tiny initial values keep the result finite when no token is found.
    '''
    tokens = tokenized.split(' ')
    vec = np.full(EMBEDDING_DIM, 1e-10)
    weight = 1e-8
    for _token in tokens:
        try:
            vec += DEFAULT_KEYVEC.get_vector(_token)
            weight += 1.0
        except KeyError:
            # Out-of-vocabulary token: skip it.
            pass
    return vec / weight
    
def batch_doc2vec(list_of_tokenized_text):
    return [doc2vec(_text) for _text in list_of_tokenized_text]
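
A minimal sketch of the tokenize-then-embed pipeline (the sentences are made-up examples; actual values depend on the loaded vectors):

docs = ['今天天气很好', '明天可能下雨']
tokenized_docs = [tokenize(_doc) for _doc in docs]
vectors = batch_doc2vec(tokenized_docs)
print(len(vectors), vectors[0].shape)  # 2 (300,)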

def batch_embedding_cosine_distance(text_list_a, text_list_b):
    '''
    Compute element-wise embedding cosine distances between two
    equal-length lists of tokenized texts.
    '''
    embedding_array_a = np.array(batch_doc2vec(text_list_a))
    embedding_array_b = np.array(batch_doc2vec(text_list_b))
    norm_a = np.linalg.norm(embedding_array_a, axis=1)
    norm_b = np.linalg.norm(embedding_array_b, axis=1)
    cosine_numer = np.multiply(embedding_array_a, embedding_array_b).sum(axis=1)
    cosine_denom = np.multiply(norm_a, norm_b)
    cosine_dist = 1.0 - np.divide(cosine_numer, cosine_denom)
    return cosine_dist.tolist()
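
Putting it together, a hedged usage sketch (the sentence pairs are arbitrary; a distance near 0 means the averaged embeddings point in almost the same direction):

list_a = [tokenize('今天天气很好'), tokenize('我喜欢机器学习')]
list_b = [tokenize('今天天气不错'), tokenize('我不喜欢下雨天')]
# Element-wise: distance between list_a[i] and list_b[i].
print(batch_embedding_cosine_distance(list_a, list_b))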
    