余弦相似度 - Python 实现

参考知乎上一二三冲鸭同学的博客,根据自己理解对他的方法再实现一遍~

"""
计算两句话的余弦相似度

原文: 使用余弦相似度算法计算文本相似度 - 一二三冲鸭的文章 - 知乎
https://zhuanlan.zhihu.com/p/43396514
"""
import math
from typing import List, Dict, AnyStr

import jieba


def separate(text_data: AnyStr) -> List:
    """Segment a sentence into tokens with jieba, dropping punctuation.

    Args:
        text_data: The text to segment.

    Returns:
        The tokens yielded by ``jieba.cut(text_data, cut_all=False)``, in
        order, with the tokens ',' and '。' removed.
    """
    tokens = []
    for token in jieba.cut(text_data, cut_all=False):
        # Skip the two punctuation tokens so they do not count as words.
        if token in (',', '。'):
            continue
        tokens.append(token)
    return tokens


def text_to_num(all_data: Dict, sep: List) -> List:
    """Replace every token in ``sep`` with its index from ``all_data``.

    The list is rewritten in place, element by element, and the very same
    list object is returned.

    Args:
        all_data: Mapping from token to integer index.
        sep: Tokens to convert; mutated by this call.

    Returns:
        ``sep`` itself, now holding indices instead of tokens.

    Raises:
        KeyError: If a token is missing from ``all_data``.
    """
    for position, token in enumerate(sep):
        sep[position] = all_data[token]
    return sep


def summary(sep1: List, sep2: List) -> Dict:
    """Build the shared vocabulary index for two token lists.

    Every distinct token of ``sep1 + sep2`` is mapped to a unique integer in
    ``range(number_of_distinct_tokens)``.

    Indices are assigned in first-occurrence order. Iterating a ``set`` of
    strings instead would give an order that can change from one interpreter
    run to the next (string hashing is randomized), making the mapping and
    anything printed from it non-reproducible.

    Args:
        sep1: Tokens of the first text.
        sep2: Tokens of the second text.

    Returns:
        Dict mapping each distinct token to its index.
    """
    # dict.fromkeys de-duplicates while keeping insertion order.
    distinct_words = dict.fromkeys(sep1 + sep2)
    return {word: index for index, word in enumerate(distinct_words)}


def to_feq(all_data: Dict, sep: List) -> List:
    """Count how often each vocabulary index occurs in ``sep``.

    Args:
        all_data: The vocabulary mapping; only its length is used, to size
            the result.
        sep: Sequence of integer indices into the result list.

    Returns:
        A list of ``len(all_data)`` counters where position ``i`` holds the
        number of times ``i`` appears in ``sep``.

    Raises:
        IndexError: If an index in ``sep`` is out of range for the result.
    """
    counts = [0] * len(all_data)
    for position in sep:
        counts[position] = counts[position] + 1
    return counts


def compute_cosine_similarity(params1: List, params2: List) -> float:
    """Compute the cosine similarity of two numeric vectors.

    The result is ``dot(p1, p2) / (|p1| * |p2|)``. When either vector has
    zero magnitude (all zeros, or empty) the quotient is undefined; in that
    case 0.0 is returned instead of raising ``ZeroDivisionError``.

    Args:
        params1: First vector; its length determines how many components
            are read from both vectors.
        params2: Second vector; entries beyond ``len(params1)`` are not read.

    Returns:
        The cosine similarity, or 0.0 if either magnitude is zero.

    Raises:
        IndexError: If ``params2`` is shorter than ``params1``.
    """
    dot_product = 0
    squared_norm_l = 0
    squared_norm_r = 0
    for index, left in enumerate(params1):
        right = params2[index]
        dot_product += left * right
        squared_norm_l += left ** 2
        squared_norm_r += right ** 2

    denominator = math.sqrt(squared_norm_l) * math.sqrt(squared_norm_r)
    if denominator == 0:
        # A zero vector has no direction, so treat it as "not similar".
        return 0.0
    return dot_product / denominator


def run():
    """Demo: print each step of the cosine similarity of two sentences.

    Steps, in order: segment both sentences, build the shared vocabulary,
    convert tokens to vocabulary indices, count index frequencies, and
    compute the cosine similarity of the two frequency vectors. Every
    intermediate result is printed.
    """
    first_text = "飓风跳跃星球"
    second_text = "飓风跳过星球"

    first_words = separate(first_text)
    print('seg_list1分词: ' + '/'.join(first_words))
    second_words = separate(second_text)
    print('seg_list2分词: ' + '/'.join(second_words))

    vocabulary = summary(first_words, second_words)
    print(f'all_list: {vocabulary}')

    # text_to_num rewrites the token lists in place; the word lists must
    # not be used as words after this point.
    first_ids = text_to_num(vocabulary, first_words)
    print(f'seg_list1转为数字: {first_ids}')
    second_ids = text_to_num(vocabulary, second_words)
    print(f'seg_list2转为数字: {second_ids}')

    first_freq = to_feq(vocabulary, first_ids)
    print(f'seg_list1词频统计: {first_freq}')
    second_freq = to_feq(vocabulary, second_ids)
    print(f'seg_list2词频统计: {second_freq}')

    similarity = compute_cosine_similarity(first_freq, second_freq)
    print(f'余弦相似度为: {similarity}')


# Script entry point: run the demo only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    run()


参考博客:
使用余弦相似度算法计算文本相似度 - 一二三冲鸭的文章 - 知乎
https://zhuanlan.zhihu.com/p/43396514

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值