一、相似度与相异度
第一关
# Euclidean-distance-based similarity
def euclidean(p, q):
    """Return a similarity score in (0, 1] derived from the Euclidean distance.

    The distance is taken over the positions that both sequences have, so
    when the lengths differ the extra trailing elements of the longer
    sequence are ignored.  The distance ``d`` is then mapped to
    ``1 / (1 + d)``: identical inputs score 1.0 and the score falls towards
    0 as the inputs move apart.

    Args:
        p: Sequence of numbers.
        q: Sequence of numbers.

    Returns:
        The similarity as a float.
    """
    # zip() stops at the shorter input, i.e. it pairs up exactly the
    # positions that exist in both sequences.
    squared_distance = sum((a - b) ** 2 for a, b in zip(p, q))
    return 1 / (1 + squared_distance ** 0.5)
# Demo: score two sample vectors and print the similarity.
print(
    "欧几里得计算出的相似度为",
    euclidean([1, 2, 3, 4, 5], [2, 4, 6, 8, 10]),
)
# Cosine similarity
def cosine_similarity(x, y):
    """Return the cosine of the angle between two numeric vectors.

    Args:
        x: Sequence of numbers.
        y: Sequence of numbers with at least ``len(x)`` elements; elements
            beyond ``len(x)`` are ignored.

    Returns:
        ``dot(x, y) / (|x| * |y|)`` as a float, or 0.0 when either vector
        has zero norm (the angle is undefined there).

    Raises:
        IndexError: If ``y`` is shorter than ``x``.
    """
    xx = 0.0  # sum of squares of x
    yy = 0.0  # sum of squares of y (over the positions x has)
    xy = 0.0  # dot product
    for i, a in enumerate(x):
        b = y[i]
        xx += a * a
        yy += b * b
        xy += a * b
    norm_product = (xx ** 0.5) * (yy ** 0.5)
    # A zero vector has no direction; report "no similarity" rather than
    # dividing by zero.
    if norm_product == 0:
        return 0.0
    return xy / norm_product
# Demo: print the cosine similarity of two sample 2-D vectors.
print(
    '余弦相关系数计算出的相似度为',
    cosine_similarity([5, 3], [5, 8]),
)
# 皮尔逊相关系数 (Pearson correlation coefficient)
import scipy
import numpy
from scipy.stats import pearsonr
########## Begin ##########
########## End ##########
# Two equally long observation series, aligned by position.
# NOTE(review): presumably per-day values for the two users named in the
# message below; where the numbers come from is not shown here.
y = numpy.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3])
x = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1])
# pearsonr returns the correlation coefficient and its p-value.
r_row, p_value = pearsonr(x, y)
# Print the coefficient computed from the data above (no hard-coded value).
print("用户(UID)84001033与用户(UID)84001003从2004/1/20到4/20/20这段日期的相似度为", r_row)
第二关
# Stage 2: user-based collaborative filtering on a toy data set.
import itertools

import numpy as np
import pandas as pd

# (1) Find the users whose tastes are similar to the target user.
# Target user: A likes items a, b and d.
target_user = {'A': ['a', 'b', 'd']}
print(f'目标用户:{target_user}')
# Candidate neighbours and the items each of them likes.
alike_user = {'B': ['a', 'c'], 'C': ['b', 'e'], 'D': ['c', 'd', 'e']}
print(f'相似用户:{alike_user}')

# Every user in one mapping, target user first.  Assumes the target user is
# not also listed among the candidates.
all_users = {**target_user, **alike_user}
user_names = list(all_users)
target_name = next(iter(target_user))

# Distinct items, in the order they are first seen.
key_value = []
for liked_items in all_users.values():
    for good in liked_items:
        if good not in key_value:
            key_value.append(good)
print(f'总共商品类型:{key_value}')

# Inverted index: one {item: [users who like it]} dict per item,
# e.g. {'a': ['A', 'B']}.
new_table = []
for good in key_value:
    user_list = [
        name for name, liked_items in all_users.items() if good in liked_items
    ]
    new_table.append({good: user_list})
print(new_table)

# Co-occurrence matrix: cell (u, v) counts the items liked by both u and v.
df = pd.DataFrame(
    data=np.zeros((len(user_names), len(user_names))),
    columns=user_names,
    index=user_names,
)
print(df)
for item in new_table:
    label = list(item.values())[0]
    print(label)
    # Every unordered pair of users sharing this item co-occurs once; this
    # also copes with items liked by one user or by more than two.
    for first, second in itertools.combinations(label, 2):
        df.loc[first, second] = df.loc[first, second] + 1
        df.loc[second, first] = df.loc[second, first] + 1
print(df)

# Number of distinct items each user likes, |N(u)| in the formula below.
# Stored as floats, like the cells of the co-occurrence matrix.
count_list = {
    name: float(len(set(liked_items)))
    for name, liked_items in all_users.items()
}
print(count_list)

# Turn the co-occurrence counts into cosine similarities:
#   w(u, v) = |N(u) & N(v)| / sqrt(|N(u)| * |N(v)|)
for i in user_names:
    for j in user_names:
        df.loc[i, j] = df.loc[i, j] / np.sqrt(count_list[i] * count_list[j])
print(df)

# Interest of the target user in an item = sum of the similarities to the
# other users who like that item (looked up in the inverted index).
item_users = {
    good: users for entry in new_table for good, users in entry.items()
}
p_Ac = sum(
    df.loc[target_name, name]
    for name in item_users['c']
    if name != target_name
)
print(f'p(A,c):{p_Ac}')
p_Ae = sum(
    df.loc[target_name, name]
    for name in item_users['e']
    if name != target_name
)
print(f'p(A,e):{p_Ae}')

# Recommend whichever item scores higher.
if p_Ac > p_Ae:
    print("用户A对c商品更感兴趣,将c商品推荐给A")
elif p_Ac < p_Ae:
    print("用户A对e商品更感兴趣,将e商品推荐给A")
else:
    print("用户A对c商品和e商品同样感兴趣!")
二、用广义Jaccard系数计算相似度
import numpy as np
### Generalized Jaccard coefficient
def sim_tonimoto(user1, user2):
    """Return the Jaccard similarity of the values in two collections.

    Both inputs are reduced to sets of distinct values and the score is
    ``|intersection| / |union|``.

    NOTE(review): the name suggests the vector (dot-product) form of the
    Tanimoto coefficient; this keeps the set-based formula the function
    already returned -- confirm which one is wanted.

    Args:
        user1: Iterable of hashable values.
        user2: Iterable of hashable values.

    Returns:
        A float in [0, 1]; 0.0 when the inputs share no value, which
        includes the case where either input is empty.
    """
    first, second = set(user1), set(user2)
    shared = first & second
    # 1. Nothing in common: the similarity is zero.  Returning here also
    #    keeps an empty union from reaching the division below.
    if not shared:
        return 0.0
    # 2. Jaccard coefficient.
    return len(shared) / len(first | second)
# Demo: two sample inputs, scored with sim_tonimoto; the result is printed.
a, b = [10, 20, 30], [10, 0, 30]
result = sim_tonimoto(a, b)
print(result)
三、用Cosine计算相似度
import numpy as np
import jieba
jieba.setLogLevel(jieba.logging.INFO)
def cosine_similarity(sentence1: str, sentence2: str) -> float:
    """Return the cosine similarity of two sentences' word-count vectors.

    Each sentence is segmented with ``jieba.cut``.  The vocabulary is the
    set of distinct tokens from both sentences, and each sentence becomes a
    vector of per-token occurrence counts over that vocabulary.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The cosine of the angle between the two count vectors, or 0.0 when
        either sentence yields no tokens (zero-length vector).
    """
    # 1. Tokenise both sentences.
    seg1 = list(jieba.cut(sentence1))
    seg2 = list(jieba.cut(sentence2))

    # 2. Vocabulary: every distinct token from either sentence.
    word_list = list(set(seg1 + seg2))

    # 3. Occurrence count of each vocabulary token in each sentence.
    vec_1 = np.array([seg1.count(word) for word in word_list])
    vec_2 = np.array([seg2.count(word) for word in word_list])

    # 4. Cosine formula.  A zero norm means one sentence had no tokens;
    #    return 0.0 instead of dividing by zero (which would give NaN).
    norm_product = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    if norm_product == 0:
        return 0.0
    return float(vec_1.dot(vec_2) / norm_product)
# Demo: compare two sample sentences and print their cosine similarity.
str1, str2 = "湖南是一个好地方", "湖南好吃的在哪里"
sim1 = cosine_similarity(str1, str2)
print(sim1)
四、用Jaccard系数计算文本之间的相似度
#import numpy as np
#from scipy.spatial.distance import pdist#直接调包可以计算JC值 ,需要两个句子长度一样;
import jieba
jieba.setLogLevel(jieba.logging.INFO)
def Jaccrad(model, reference):
    """Return the Jaccard coefficient between two sentences' token sets.

    Both sentences are segmented with ``jieba.cut`` and reduced to their
    sets of distinct tokens.

    Args:
        model: Candidate sentence.
        reference: Source sentence.

    Returns:
        ``|A & B| / |A | B|`` for the two token sets, or 0.0 when neither
        sentence yields any token (empty union).
    """
    # 1. Tokenise and keep the distinct tokens of each sentence.
    grams_reference = set(jieba.cut(reference))
    grams_model = set(jieba.cut(model))

    # 2. Sizes of the intersection and of the union.
    shared_count = len(grams_reference & grams_model)
    union_count = len(grams_reference | grams_model)

    # 3. Jaccard coefficient; guard the empty-union case against a
    #    division by zero.
    if union_count == 0:
        return 0.0
    return shared_count / union_count
# Demo: print the Jaccard coefficient of two sample sentences.
str1, str2 = "我爱北京天安门", "天安门雄伟壮阔让人不得不爱"
jaccard_coefficient = Jaccrad(str1, str2)
print(jaccard_coefficient)
五、用SMC相似度计算文本之间的相似度
import numpy as np
import jieba
jieba.setLogLevel(jieba.logging.INFO)
def smc_similarity(sentence1: str, sentence2: str) -> float:
    """Return the simple matching coefficient (SMC) of two sentences.

    Each sentence is segmented with ``jieba.cut`` and turned into a binary
    presence vector over the vocabulary (the distinct tokens of both
    sentences).  With f11/f00 the number of vocabulary tokens present in
    both/neither sentence and f10/f01 the number present in only the
    first/second, the result is ``(f11 + f00) / (f00 + f01 + f10 + f11)``.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.

    Returns:
        The SMC as a float, or 0.0 when neither sentence yields any token.
    """
    # 1. Tokenise; only presence matters for SMC, so keep distinct tokens.
    #    (Comparing raw counts against 0/1 would file a token that occurs
    #    twice in one sentence and never in the other as a match.)
    tokens_1 = set(jieba.cut(sentence1))
    tokens_2 = set(jieba.cut(sentence2))

    # 2. Vocabulary: every distinct token from either sentence.
    word_list = tokens_1 | tokens_2
    if not word_list:
        return 0.0

    # 3. Agreement / disagreement counts over the vocabulary.
    f11 = len(tokens_1 & tokens_2)
    f10 = len(tokens_1 - tokens_2)
    f01 = len(tokens_2 - tokens_1)
    # Every vocabulary token comes from at least one of the sentences, so
    # no token can be absent from both.
    f00 = 0

    # 4. SMC formula.
    return (f11 + f00) / (f01 + f10 + f00 + f11)
# Demo: print the SMC similarity of two sample sentences.
str1, str2 = "我爱北京天安门", "天安门雄伟壮阔让人不得不爱"
sim1 = smc_similarity(str1, str2)
print(sim1)