数据:
数据及代码资源
1 相似度
俗话说,物以类聚,人以群分:越相似的个体,在特征空间中相距越近。
因此可以用距离或夹角来度量相似度:两个物体的距离越近越相似,两个评分向量的夹角越小越相似。
1.1 Correlation-based Similarity
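公式:
$$sim(i,j)=\frac{\sum_{u\in U}(R_{u,i}-\bar{R}_i)(R_{u,j}-\bar{R}_j)}{\sqrt{\sum_{u\in U}(R_{u,i}-\bar{R}_i)^2}\,\sqrt{\sum_{u\in U}(R_{u,j}-\bar{R}_j)^2}}$$
其中 $U$ 为同时评价过物品 $i$ 和物品 $j$ 的用户集合,$\bar{R}_i$ 为这些用户对物品 $i$ 的平均评分。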
1.2 Adjusted Cosine Similarity
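公式:
$$sim(i,j)=\frac{\sum_{u\in U}(R_{u,i}-\bar{R}_u)(R_{u,j}-\bar{R}_u)}{\sqrt{\sum_{u\in U}(R_{u,i}-\bar{R}_u)^2}\,\sqrt{\sum_{u\in U}(R_{u,j}-\bar{R}_u)^2}}$$
其中 $U$ 为同时评价过物品 $i$ 和物品 $j$ 的用户集合,$\bar{R}_u$ 为用户 $u$ 对其所有已评分物品的平均评分。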
2 评分预测
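根据物品相似度矩阵和用户-物品评分矩阵,预测用户对物品的评分(加权求和)。
公式:
$$P_{u,i}=\frac{\sum_{j\in N(u,i)} s_{i,j}\,R_{u,j}}{\sum_{j\in N(u,i)} |s_{i,j}|}$$
其中 $N(u,i)$ 为用户 $u$ 已评分且与物品 $i$ 的相似度为正的物品集合,$s_{i,j}$ 为物品 $i$ 与物品 $j$ 的相似度。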
3 代码
import numpy as np
import torch
from sklearn.model_selection import KFold
# -------------记录运行时间 -------------------
import time
from time import strftime
from time import gmtime
'''
nu = 943
ni = 1682
'''
def data_process(data, nu, ni):
    """Build a dense user-item rating matrix from (user, item, rating) rows.

    Args:
        data: 2-D array-like whose first three columns are a 1-based user id,
            a 1-based item id and the rating. A single 1-D row is accepted
            as well (``np.loadtxt`` returns one for a one-line file).
        nu: Number of users, i.e. rows of the result.
        ni: Number of items, i.e. columns of the result.

    Returns:
        A ``(nu, ni)`` float32 NumPy array. Cells without a rating stay 0.
        If a (user, item) pair occurs more than once, the last row wins.
    """
    user_item = np.zeros((nu, ni), dtype='f4')
    rows = np.asarray(data)
    if rows.size == 0:
        # Nothing to fill in; return the all-zero matrix.
        return user_item
    rows = np.atleast_2d(rows)
    # Ids are 1-based in the data, matrix indices are 0-based.
    users = rows[:, 0].astype(np.intp) - 1
    items = rows[:, 1].astype(np.intp) - 1
    user_item[users, items] = rows[:, 2]
    return user_item
def pearsonSimilarity(data):
    """Compute the correlation-based (Pearson) item-item similarity matrix.

    For every pair of items only the users who rated both items (non-zero
    entry in both columns) are considered, and each item's mean rating is
    taken over exactly those co-rating users.

    Args:
        data: ``(n_users, n_items)`` rating matrix where 0 means "not rated".

    Returns:
        A symmetric ``(n_items, n_items)`` float64 NumPy array with a zero
        diagonal. Pairs without co-rating users, or where either item has
        zero variance among the co-rating users, keep similarity 0.
    """
    data_torch = torch.tensor(data)
    if not data_torch.is_floating_point():
        # torch.mean is not defined for integer tensors.
        data_torch = data_torch.to(torch.float32)
    ni = data_torch.shape[1]
    item_similarity = np.zeros((ni, ni))
    for i in range(ni):
        i_column = data_torch[:, i]
        # Only the upper triangle is computed; the result is mirrored.
        for j in range(i + 1, ni):
            j_column = data_torch[:, j]
            # A non-zero product marks a user who rated both items.
            co_index = torch.nonzero(i_column * j_column, as_tuple=True)[0]
            if co_index.numel() == 0:
                continue
            i_rating = i_column[co_index]
            j_rating = j_column[co_index]
            i_sub_rating = i_rating - torch.mean(i_rating)
            j_sub_rating = j_rating - torch.mean(j_rating)
            denominator = (torch.sqrt(torch.dot(i_sub_rating, i_sub_rating))
                           * torch.sqrt(torch.dot(j_sub_rating, j_sub_rating)))
            if denominator == 0:
                # Zero variance: the correlation is undefined, leave it at 0.
                continue
            molecule = torch.dot(i_sub_rating, j_sub_rating)
            item_similarity[i, j] = item_similarity[j, i] = (molecule / denominator).item()
    return item_similarity
def adjustedCosineSimilarity(data):
    """Compute the adjusted-cosine item-item similarity matrix.

    Each rating is centred on the rating user's own mean (taken over the
    items that user rated, i.e. entries > 0) before the cosine between two
    item columns is computed over the users who rated both items.

    Args:
        data: ``(n_users, n_items)`` rating matrix where 0 means "not rated".

    Returns:
        A symmetric ``(n_items, n_items)`` float64 NumPy array with a zero
        diagonal. Pairs without co-rating users, or with a zero norm on
        either side, keep similarity 0.
    """
    data_torch = torch.tensor(data)
    if not data_torch.is_floating_point():
        data_torch = data_torch.to(torch.float32)
    ni = data_torch.shape[1]
    # Clamp the count so users without any rating get mean 0 instead of
    # 0/0 = NaN; such users never appear among co-rating users anyway.
    rated_counts = torch.sum(data_torch > 0, dim=1).clamp(min=1)
    u_average = torch.sum(data_torch, dim=1) / rated_counts
    item_similarity = np.zeros((ni, ni))
    for i in range(ni):
        i_column = data_torch[:, i]
        # Only the upper triangle is computed; the result is mirrored.
        for j in range(i + 1, ni):
            j_column = data_torch[:, j]
            # A non-zero product marks a user who rated both items.
            co_index = torch.nonzero(i_column * j_column, as_tuple=True)[0]
            if co_index.numel() == 0:
                continue
            aver_rating = u_average[co_index]
            i_sub_rating = i_column[co_index] - aver_rating
            j_sub_rating = j_column[co_index] - aver_rating
            denominator = (torch.sqrt(torch.dot(i_sub_rating, i_sub_rating))
                           * torch.sqrt(torch.dot(j_sub_rating, j_sub_rating)))
            if denominator == 0:
                continue
            molecule = torch.dot(i_sub_rating, j_sub_rating)
            item_similarity[i, j] = item_similarity[j, i] = (molecule / denominator).item()
    return item_similarity
def weighted_sum(data, similarity):
    """Predict every user-item rating as a similarity-weighted average.

    The prediction for user ``u`` and item ``i`` is the sum of ``u``'s
    ratings weighted by the positive similarities between ``i`` and the
    items ``u`` rated, divided by the sum of those similarities.
    Non-positive similarities are ignored.

    Args:
        data: ``(n_users, n_items)`` rating matrix where 0 means "not rated".
        similarity: ``(n_items, n_items)`` item-item similarity matrix.

    Returns:
        A ``(n_users, n_items)`` float64 NumPy array. Entries for which the
        user rated no positively similar item are left at 0.
    """
    ratings = np.asarray(data, dtype=np.float64)
    sim = np.asarray(similarity, dtype=np.float64)
    # Keep positive similarities only (this also drops NaN entries).
    positive_sim = np.where(sim > 0, sim, 0.0)
    rated = (ratings != 0).astype(np.float64)
    # Column i of each product sums over the items j the user rated:
    #   molecule[u, i]    = sum_j ratings[u, j] * positive_sim[j, i]
    #   denominator[u, i] = sum_j rated[u, j]   * positive_sim[j, i]
    molecule = ratings @ positive_sim
    denominator = rated @ positive_sim
    predict_data = np.zeros_like(molecule)
    # Divide only where a prediction exists; other cells stay 0.
    np.divide(molecule, denominator, out=predict_data, where=denominator > 0)
    return predict_data
if __name__ == '__main__':
    # Run 5-fold cross-validation of item-based collaborative filtering on
    # a rating file and print the per-fold MAE / RMSE plus the run time.
    time_begin = time.time()  # start of the run

    # Rows of the rating file are: user_id, item_id, rating.
    # NOTE(review): the original comment listed the pairs 943/1682,
    # 1408/2000, 943/50 and 943/198 - presumably (nu, ni) for the available
    # data sets; confirm before switching files.
    nu = 943
    ni = 50
    raw_data = np.loadtxt('F:/课题组学习/数据/50item.csv', delimiter=',', dtype=int)
    n_fold = 5
    MAEs = []
    RMSEs = []
    kf = KFold(n_splits=n_fold, shuffle=True)  # shuffled K-fold split
    for train_index, test_index in kf.split(raw_data):
        # Training fold; pearsonSimilarity(train) may be used instead of
        # the adjusted cosine similarity.
        train = data_process(raw_data[train_index, 0: 3], nu, ni)
        similary = adjustedCosineSimilarity(train)
        # Held-out fold
        test = data_process(raw_data[test_index, 0: 3], nu, ni)
        train_mean = np.mean(train[train > 0])
        predict_data = weighted_sum(train, similary)
        # Fall back to the training mean where no prediction was produced.
        predict_data[predict_data == 0] = train_mean
        # Errors are measured only on the ratings present in the test fold.
        held_out = test > 0
        errors = predict_data[held_out] - test[held_out]
        if errors.size == 0:
            # No rating to evaluate in this fold; report NaN rather than
            # dividing by zero.
            MAEs.append(float('nan'))
            RMSEs.append(float('nan'))
            continue
        MAEs.append(float(np.mean(np.abs(errors))))
        RMSEs.append(float(np.sqrt(np.mean(errors ** 2))))
    # ---------------------------- output formatting: start ----------------------------
    for i in range(0, n_fold):
        print(" " * 8, "Fold", i + 1, end="")
    print("\n", "-" * (16 * n_fold), "\n MAE ", end="")
    for MAE in MAEs:
        print(format(MAE, '.6f'), 6 * " ", end="")
    print("\n RMSE ", end="")
    for RMSE in RMSEs:
        print(format(RMSE, '.6f'), 6 * " ", end="")
    print("\n", "-" * (16 * n_fold))
    # ---------------------------- output formatting: end ------------------------------
    time_end = time.time()  # end of the run
    run_time = time_end - time_begin  # elapsed seconds
    print(' time:', strftime("%H:%M:%S", gmtime(run_time)))
4 效果
5 总结
- 基于物品的协同过滤适合实时系统,因为可以提前计算相似度矩阵
- 基于物品的协同过滤适合稀疏的user-item矩阵