基于物品的协同过滤算法

数据:数据及代码资源

1 相似度

俗话说,物以类聚,人以群分:彼此越"接近"的个体越相似。
例如,两个物品之间的距离越小越相似,两个物品评分向量的夹角越小越相似。
在这里插入图片描述

1.1 Correlation-based Similarity

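公式(其中 $U$ 为同时评价过物品 $i$ 和 $j$ 的用户集合,$\bar{R}_i$、$\bar{R}_j$ 分别为这些用户对物品 $i$、$j$ 评分的均值):

$$sim(i,j)=\frac{\sum_{u\in U}(R_{u,i}-\bar{R}_i)(R_{u,j}-\bar{R}_j)}{\sqrt{\sum_{u\in U}(R_{u,i}-\bar{R}_i)^2}\,\sqrt{\sum_{u\in U}(R_{u,j}-\bar{R}_j)^2}}$$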
在这里插入图片描述

1.2 Adjusted Cosine Similarity

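公式(其中 $U$ 为同时评价过物品 $i$ 和 $j$ 的用户集合,$\bar{R}_u$ 为用户 $u$ 对其所有已评分物品的平均评分):

$$sim(i,j)=\frac{\sum_{u\in U}(R_{u,i}-\bar{R}_u)(R_{u,j}-\bar{R}_u)}{\sqrt{\sum_{u\in U}(R_{u,i}-\bar{R}_u)^2}\,\sqrt{\sum_{u\in U}(R_{u,j}-\bar{R}_u)^2}}$$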
在这里插入图片描述

2 评分预测

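根据物品相似度矩阵和已有的评分矩阵,用加权求和(Weighted Sum)预测用户对物品的评分。
公式(其中 $N$ 为用户 $u$ 已评分且与物品 $i$ 的相似度为正的物品集合,$s_{i,j}$ 为物品 $i$ 与 $j$ 的相似度):

$$P_{u,i}=\frac{\sum_{j\in N}s_{i,j}\,R_{u,j}}{\sum_{j\in N}\left|s_{i,j}\right|}$$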
在这里插入图片描述

3 代码

import numpy as np
import torch
from sklearn.model_selection import KFold
# -------------记录运行时间 -------------------
import time
from time import strftime
from time import gmtime
'''
nu = 943
ni = 1682
'''
def data_process(data, nu, ni):
    """Build a dense user-item rating matrix from (user, item, rating) rows.

    Args:
        data: 2-D array whose columns 0, 1 and 2 hold a 1-based user id,
            a 1-based item id and the rating.
        nu: Number of users (rows of the result).
        ni: Number of items (columns of the result).

    Returns:
        A ``(nu, ni)`` float32 array; cells that receive no rating stay 0.
        When a (user, item) pair occurs more than once, the last row wins.
    """
    ratings = np.zeros((nu, ni), dtype=np.float32)
    for row in data:
        # Ids are 1-based in the input; shift them to 0-based indices.
        ratings[row[0] - 1, row[1] - 1] = row[2]
    return ratings

def pearsonSimilarity(data):
    """Compute item-item Pearson correlation over co-rating users.

    For every pair of items only the users with a non-zero rating for both
    items are considered; each item's mean is taken over those users only.

    Args:
        data: ``(n_users, n_items)`` rating array where 0 means "not rated".
            Integer arrays are accepted and promoted to floating point.

    Returns:
        A symmetric ``(n_items, n_items)`` float64 numpy array. The diagonal,
        pairs without co-rating users, and pairs where either item has zero
        variance over the co-rating users are left at 0.
    """
    ratings = torch.tensor(data)
    if not ratings.is_floating_point():
        # torch.mean rejects integer tensors, so promote integer ratings.
        ratings = ratings.float()
    _, ni = data.shape

    item_similarity = np.zeros((ni, ni))
    for i in range(ni):
        col_i = ratings[:, i]  # invariant for the whole inner loop
        for j in range(i + 1, ni):
            col_j = ratings[:, j]
            # A user rated both items iff the product of the ratings is non-zero.
            co_users = torch.nonzero(col_i * col_j, as_tuple=True)[0]
            if co_users.numel() == 0:
                continue

            co_i = col_i[co_users]
            co_j = col_j[co_users]
            dev_i = co_i - co_i.mean()
            dev_j = co_j - co_j.mean()

            # Stay inside torch: np.sqrt on a tensor only works through the
            # tensor's numpy-compatibility hooks.
            norm = torch.sqrt(torch.dot(dev_i, dev_i)) * torch.sqrt(torch.dot(dev_j, dev_j))
            if norm == 0:
                continue
            similarity = float(torch.dot(dev_i, dev_j) / norm)
            item_similarity[i, j] = item_similarity[j, i] = similarity

    return item_similarity

def adjustedCosineSimilarity(data):
    """Compute item-item adjusted cosine similarity.

    Each rating is centred on the mean of its user's positive ratings, and
    the cosine between two items is taken over the users with a non-zero
    rating for both items.

    Args:
        data: ``(n_users, n_items)`` rating array where 0 means "not rated".

    Returns:
        A symmetric ``(n_items, n_items)`` float64 numpy array. The diagonal,
        pairs without co-rating users, and pairs whose centred ratings have a
        zero norm are left at 0.
    """
    ratings = torch.tensor(data)
    if not ratings.is_floating_point():
        ratings = ratings.float()
    _, ni = data.shape

    # Mean of each user's positive ratings. Users without any positive rating
    # get NaN here, which is harmless for all-zero rows: such users are never
    # selected as co-raters below.
    user_mean = torch.sum(ratings, dim=1) / torch.sum(ratings > 0, dim=1)
    # Centre every rating once instead of once per item pair.
    centred = ratings - user_mean.unsqueeze(1)

    item_similarity = np.zeros((ni, ni))
    for i in range(ni):
        col_i = ratings[:, i]
        for j in range(i + 1, ni):
            # A user rated both items iff the product of the ratings is non-zero.
            co_users = torch.nonzero(col_i * ratings[:, j], as_tuple=True)[0]
            if co_users.numel() == 0:
                continue

            dev_i = centred[co_users, i]
            dev_j = centred[co_users, j]

            # Stay inside torch: np.sqrt on a tensor only works through the
            # tensor's numpy-compatibility hooks.
            norm = torch.sqrt(torch.dot(dev_i, dev_i)) * torch.sqrt(torch.dot(dev_j, dev_j))
            if norm == 0:
                continue
            similarity = float(torch.dot(dev_i, dev_j) / norm)
            item_similarity[i, j] = item_similarity[j, i] = similarity

    return item_similarity

def weighted_sum(data, similarity):
    """Predict every user-item rating as a similarity-weighted average.

    The prediction for user ``u`` and item ``i`` is::

        sum_j(data[u, j] * s[j, i]) / sum_j(s[j, i])

    taken over the items ``j`` that ``u`` has rated (non-zero) and whose
    similarity ``s[j, i]`` to item ``i`` is positive.

    Args:
        data: ``(n_users, n_items)`` rating array where 0 means "not rated".
        similarity: ``(n_items, n_items)`` item-item similarity array.
            Non-positive (and NaN) similarities are ignored.

    Returns:
        A ``(n_users, n_items)`` float64 numpy array. Cells with no usable
        neighbour item are left at 0.
    """
    ratings = np.asarray(data, dtype=np.float64)
    sims = np.asarray(similarity, dtype=np.float64)

    # Only positively similar items may vote.
    positive_sims = np.where(sims > 0, sims, 0.0)
    rated = (ratings != 0).astype(np.float64)

    # Two matrix products replace the per-(user, item) Python loop:
    # the weighted rating sums and the matching sums of weights.
    weighted_ratings = ratings @ positive_sims
    weight_totals = rated @ positive_sims

    predict_data = np.zeros(ratings.shape)
    np.divide(weighted_ratings, weight_totals, out=predict_data,
              where=weight_totals > 0)
    return predict_data

if __name__ == '__main__':
    # Evaluate item-based collaborative filtering with K-fold cross-validation
    # and print per-fold MAE / RMSE plus the total running time.
    time_begin = time.time()  # wall-clock start of the run

    # Columns of the input are user_id, item_id, rating.
    # (n_users, n_items) noted by the author for the available datasets:
    # 943 x 1682; 1408 x 2000; 943 x 50; 943 x 198.
    nu = 943
    ni = 50
    # Alternative dataset: np.loadtxt('F:/课题组学习/数据/198item.data', dtype=int)
    raw_data = np.loadtxt('F:/课题组学习/数据/50item.csv', delimiter=',', dtype=int)
    n_fold = 5
    MAEs = []
    RMSEs = []

    kf = KFold(n_splits=n_fold, shuffle=True)  # shuffled 5-fold cross-validation

    for train_index, test_index in kf.split(raw_data):
        # Training fold: rating matrix and the item similarity derived from it.
        # pearsonSimilarity(train) can be swapped in here instead.
        train = data_process(raw_data[train_index, 0: 3], nu, ni)
        similary = adjustedCosineSimilarity(train)
        # Held-out fold, in the same dense layout.
        test = data_process(raw_data[test_index, 0: 3], nu, ni)

        train_mean = np.mean(train[train > 0])
        predict_data = weighted_sum(train, similary)
        # Cells that could not be predicted fall back to the training mean.
        predict_data[predict_data == 0] = train_mean

        # Score only the cells that carry a held-out rating.
        held_out = test > 0
        errors = predict_data[held_out] - test[held_out]
        MAEs.append(np.mean(np.abs(errors)))
        RMSEs.append(np.sqrt(np.mean(errors ** 2)))
    # ---------------------------- output formatting: start ----------------------
    for i in range(0, n_fold):
        print(" " * 8, "Fold", i + 1, end="")
    print("\n", "-" * (16 * n_fold), "\n  MAE   ", end="")
    for MAE in MAEs:
        print(format(MAE, '.6f'), 6 * " ", end="")
    print("\n  RMSE  ", end="")
    for RMSE in RMSEs:
        print(format(RMSE, '.6f'), 6 * " ", end="")
    print("\n", "-" * (16 * n_fold))
    # ---------------------------- output formatting: end ------------------------
    time_end = time.time()  # wall-clock end of the run
    run_time = time_end - time_begin  # elapsed seconds
    print('  time:', strftime("%H:%M:%S", gmtime(run_time)))

4 效果

在这里插入图片描述

5 总结

  1. 基于物品的协同过滤适合实时系统,因为物品相似度矩阵可以提前离线计算
  2. 基于物品的协同过滤适合稀疏的user-item矩阵
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值