相似度与相异度章节头歌实训_头歌用smc相似度计算文本之间的相似度-CSDN博客

本文链接：https://blog.csdn.net/tgmhh/article/details/136646772

本文介绍了欧几里得距离、余弦相似度、皮尔逊相关系数以及广义Jaccard、Cosine和SMC等方法在计算文本相似度和用户兴趣匹配上的应用，以支持推荐系统的个性化推荐策略。

摘要由CSDN通过智能技术生成

一、相似度与相异度

第一关

# 基于欧几里得距离的相似度
def euclidean(p, q):
    """Return a similarity score in (0, 1] based on the Euclidean distance.

    Elements of ``p`` and ``q`` are paired by position. When the lengths
    differ, only the leading positions present in both sequences are
    compared. The distance ``d`` is mapped to ``1 / (1 + d)``, so identical
    inputs score 1.0 and the score shrinks towards 0 as the distance grows.

    Args:
        p: Sequence of numbers.
        q: Sequence of numbers.

    Returns:
        ``1 / (1 + distance)`` as a float; 1.0 when nothing can be compared.
    """
    # zip() stops at the shorter input, so every compared position exists in
    # both sequences. The number of comparable positions must come from the
    # lengths, not from how many values of p happen to occur somewhere in q.
    squared_distance = sum((a - b) ** 2 for a, b in zip(p, q))
    return 1 / (1 + squared_distance ** 0.5)


# Demo: score a sample vector against the vector holding twice its values.
print(
    "欧几里得计算出的相似度为",
    euclidean([1, 2, 3, 4, 5], [2, 4, 6, 8, 10]),
)

# 余弦相似度
def cosine_similarity(x, y):
    """Return the cosine of the angle between numeric vectors ``x`` and ``y``.

    Args:
        x: Sequence of numbers; its length decides how many positions are used.
        y: Sequence of numbers with at least ``len(x)`` elements.

    Returns:
        ``dot(x, y) / (|x| * |y|)``, or 0.0 when either vector has zero
        magnitude. The cosine is undefined in that case, so "no similarity"
        is reported instead of raising ZeroDivisionError.

    Raises:
        IndexError: If ``y`` is shorter than ``x``.
    """
    xx = 0.0
    yy = 0.0
    xy = 0.0
    for i, a in enumerate(x):
        b = y[i]
        xx += a * a
        yy += b * b
        xy += a * b

    norm_product = (xx ** 0.5) * (yy ** 0.5)
    if norm_product == 0:
        # At least one vector is all zeros; avoid dividing by zero.
        return 0.0
    return xy / norm_product


# Demo: cosine similarity of two sample 2-D vectors.
print(
    '余弦相关系数计算出的相似度为',
    cosine_similarity([5, 3], [5, 8]),
)


# 皮尔逊相关系数（Pearson correlation coefficient）
import scipy
import numpy
from scipy.stats import pearsonr

########## Begin ##########

########## End ##########
# Paired observations for the two users named in the report line below
# (which series belongs to which user is not stated here).
y = numpy.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3])
x = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1])

# pearsonr returns the correlation coefficient together with its p-value.
r_row, p_value = pearsonr(x, y)

# Report the coefficient exactly as computed. Overwriting r_row with a
# hard-coded constant would make the printed value unrelated to x and y.
print("用户(UID)84001033与用户(UID)84001003从2004/1/20到4/20/20这段日期的相似度为", r_row)

第二关

# Part 1: find the users whose interests overlap with the target user's.

# Target user: A likes items a, b and d.
target_user = {'A': ['a', 'b', 'd']}
print(f'目标用户：{target_user}')
# Candidate neighbours and the items each of them likes.
alike_user = {'B': ['a', 'c'], 'C': ['b', 'e'], 'D': ['c', 'd', 'e']}
print(f'相似用户：{alike_user}')

# Catalogue of every distinct item in first-seen order: the target user's
# items first, then any new items contributed by the other users.
key_value = []
for liked_items in list(target_user.values()) + list(alike_user.values()):
    for good in liked_items:
        if good not in key_value:
            key_value.append(good)
print(f'总共商品类型：{key_value}')

# Inverted table: one single-entry dict per item, mapping the item to the
# users who like it, e.g. {'a': ['A', 'B']}.
new_table = []
for good in key_value:
    user_list = []
    # The target user is scanned first so it leads each user list. The loop
    # variables must not reuse the name ``key_value``: doing so rebinds the
    # catalogue to a (user, items) tuple once the loop has run.
    for group in (target_user, alike_user):
        for user, liked_items in group.items():
            if good in liked_items and user not in user_list:
                user_list.append(user)
    new_table.append({good: user_list})
print(new_table)
########## Begin ##########
# User-based collaborative filtering: build a user-user co-occurrence matrix
# from the inverted table, normalise it to cosine similarity, then score the
# candidate items c and e for user A.
import itertools

import pandas as pd
import numpy as np

# Item -> users lookup flattened from the inverted table (each entry of
# new_table is a single-item dict).
item_users = {}
for entry in new_table:
    item_users.update(entry)

# Every user that appears in the table, in first-seen order.
users = []
for liked_by in item_users.values():
    for user in liked_by:
        if user not in users:
            users.append(user)

df = pd.DataFrame(
    data=np.zeros((len(users), len(users))), columns=users, index=users
)
print(df)

# Count shared items. Every pair of users who like the same item gets +1 in
# both directions; combinations() covers items liked by any number of users,
# not just exactly two.
for entry in new_table:
    for liked_by in entry.values():
        print(liked_by)
        for first, second in itertools.combinations(liked_by, 2):
            df.loc[first, second] += 1
            df.loc[second, first] += 1
print(df)
########## End ##########

# Number of items each user likes (the |N(u)| term of the cosine formula),
# kept as floats to match the float-valued matrix. Counting straight from the
# inverted table stays correct even when an item is shared by more than two
# users, unlike summing a row of the co-occurrence matrix.
count_list = {user: 0.0 for user in users}
for liked_by in item_users.values():
    for user in set(liked_by):
        count_list[user] += 1
print(count_list)

# Cosine similarity: shared-item count divided by sqrt(|N(u)| * |N(v)|).
########## Begin ##########
for i in users:
    for j in users:
        df.loc[i, j] = df.loc[i, j] / np.sqrt(count_list[i] * count_list[j])
print(df)
########## End ##########

########## Begin ##########
# p(A, item): sum of A's similarity to every other user who likes the item.
p_Ac = sum(df.loc['A', user] for user in item_users.get('c', []) if user != 'A')
print(f'p(A,c):{p_Ac}')
p_Ae = sum(df.loc['A', user] for user in item_users.get('e', []) if user != 'A')
print(f'p(A,e):{p_Ae}')

########## End ##########
# Recommend whichever candidate item scores higher for user A.
if p_Ac > p_Ae:
    print("用户A对c商品更感兴趣，将c商品推荐给A")
elif p_Ac < p_Ae:
    print("用户A对e商品更感兴趣，将e商品推荐给A")
else:
    print("用户A对c商品和e商品同样感兴趣！")

二、用广义Jaccard系数计算相似度

import numpy as np
###-----------编写代码实现广义Jaccard系数的计算----------
def sim_tonimoto(user1, user2):
    """Return the Jaccard (Tanimoto) similarity of two collections.

    Both inputs are reduced to sets of distinct values, so order and
    duplicates are ignored. The score is ``|A & B| / |A | B|``.

    Args:
        user1: Iterable of hashable values.
        user2: Iterable of hashable values.

    Returns:
        A float in [0, 1]; 0.0 when the inputs share no value, which also
        covers two empty inputs.
    """
    items1 = set(user1)
    items2 = set(user2)
    shared = items1 & items2

    # Bail out before dividing: with nothing in common the score is 0, and
    # for two empty inputs the union would be empty as well. Overlap is judged
    # on set membership, never by comparing against one fixed position of
    # user2.
    if not shared:
        return 0.0

    return len(shared) / len(items1 | items2)

# Two sample inputs to compare.
a = [10, 20, 30]
b = [10, 0, 30]

# Score the pair with sim_tonimoto and print the similarity.
result = sim_tonimoto(a, b)
print(result)

三、用Cosine计算相似度


import numpy as np
import jieba
jieba.setLogLevel(jieba.logging.INFO)

def cosine_similarity(sentence1: str, sentence2: str) -> float:
    """Return the cosine similarity of two sentences' word-count vectors.

    Each sentence is segmented with ``jieba.cut``. The vocabulary is the
    union of both token lists, and each sentence becomes a vector of
    per-word occurrence counts over that vocabulary.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or 0.0 when either sentence yields
        no tokens. A zero vector would otherwise produce a 0/0 NaN.
    """
    # 1. Segment both sentences into word lists.
    seg1 = list(jieba.cut(sentence1))
    seg2 = list(jieba.cut(sentence2))

    # 2. Vocabulary: every distinct word from either sentence. Iteration order
    # of the set is arbitrary but identical for both vectors below.
    vocabulary = set(seg1) | set(seg2)

    # 3. Occurrence count of each vocabulary word in each sentence.
    vec_1 = np.array([seg1.count(word) for word in vocabulary])
    vec_2 = np.array([seg2.count(word) for word in vocabulary])

    # 4. Cosine formula, guarded against zero-magnitude vectors.
    norm_product = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    if norm_product == 0:
        return 0.0
    return float(vec_1.dot(vec_2) / norm_product)


# Sample sentences to compare.
str1 = "湖南是一个好地方"
str2 = "湖南好吃的在哪里"

# Compute their cosine similarity and print it.
sim1 = cosine_similarity(str1, str2)
print(sim1)

四、用Jaccard系数计算文本之间的相似度

#import numpy as np
#from scipy.spatial.distance import pdist#直接调包可以计算JC值 ,需要两个句子长度一样；
import jieba
jieba.setLogLevel(jieba.logging.INFO)

def Jaccrad(model, reference):
    """Return the Jaccard coefficient between the word sets of two sentences.

    Both sentences are segmented with ``jieba.cut`` and reduced to sets of
    distinct words. The score is ``|A & B| / |A | B|``.

    Args:
        model: Candidate sentence.
        reference: Source sentence.

    Returns:
        A float in [0, 1]; 0.0 when neither sentence yields any token, which
        would otherwise divide by zero.
    """
    # 1. Segment and deduplicate.
    grams_reference = set(jieba.cut(reference))
    grams_model = set(jieba.cut(model))

    # 2. Size of the union; an empty union means there is nothing to compare.
    union_size = len(grams_reference | grams_model)
    if union_size == 0:
        return 0.0

    # 3. Jaccard coefficient = shared words / all distinct words.
    jaccard_coefficient = len(grams_reference & grams_model) / union_size
    return jaccard_coefficient


# Sample sentences to compare.
str1 = "我爱北京天安门"
str2 = "天安门雄伟壮阔让人不得不爱"

# Compute their Jaccard coefficient and print it.
jaccard_coefficient = Jaccrad(str1, str2)
print(jaccard_coefficient)

五、用SMC相似度计算文本之间的相似度

import numpy as np  
import jieba  
jieba.setLogLevel(jieba.logging.INFO)  
  
def smc_similarity(sentence1: str, sentence2: str) -> float:
    """Return the simple matching coefficient (SMC) of two sentences.

    Each sentence is segmented with ``jieba.cut`` and treated as a binary
    presence vector over the combined vocabulary. The score is
    ``(f11 + f00) / (f00 + f01 + f10 + f11)``, where ``fab`` counts the
    vocabulary words with presence ``a`` in sentence1 and ``b`` in sentence2.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        A float in [0, 1]; 0.0 when neither sentence yields any token, which
        would otherwise divide by zero.
    """
    # 1. Segment both sentences; only word presence matters for SMC, so
    # repeated words collapse into one set member.
    words1 = set(jieba.cut(sentence1))
    words2 = set(jieba.cut(sentence2))

    # 2. Vocabulary: every distinct word from either sentence.
    vocabulary = words1 | words2
    if not vocabulary:
        return 0.0

    # 3. Presence/absence counts. A word repeated in one sentence but missing
    # from the other is a mismatch, not a match.
    f11 = len(words1 & words2)
    f10 = len(words1 - words2)
    f01 = len(words2 - words1)
    # The vocabulary is built from these two sentences only, so no word can
    # be absent from both.
    f00 = 0

    # 4. Simple matching coefficient.
    smc = (f11 + f00) / (f01 + f10 + f00 + f11)
    return smc
  
# Sample sentences to compare.
str1 = "我爱北京天安门"
str2 = "天安门雄伟壮阔让人不得不爱"

# Compute their SMC similarity and print it.
sim1 = smc_similarity(str1, str2)
print(sim1)