用python实现协同过滤的推荐系统逻辑

import numpy as np
from scipy.optimize import minimize
import pandas as pd

In [2]:

def getRecommender(Y, R, params=None, n=10, theLambda=10, maxIter=100):
    """
    Args:
        Y - 用户对影片的评分矩阵
        R - 用户j是否对影片i评分的矩阵 (0/1)
        params - 若有初始化参数,可在此传入(Theta, X)
        n - 电影的特征数
        theLambda - 正则化参数
        maxIter - 最大迭代次数
    Returns:
        train - 训练函数
        predict - 预测函数
        getTopRecommends - 获取特定影片的最相似推荐
    """

    # 影片数,用户数
    nm, nu = Y.shape

    # 标准化影片的评分
    mu = np.zeros((Y.shape[0], 1), dtype=np.float)
    for i in range(nm):
        totalRates = np.sum(Y[i])
        validCount = len(np.nonzero(R[i])[0])
        mu[i] = totalRates / validCount
    Y = Y - mu

    def roll(Theta, X):
        """
        对于模型而言,Theta和X都是待学习的参数,需要放在一起直接优化
        Args:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        Returns:
            vector - 折叠后的参数
        """

        #return np.hstack((X.A.T.flatten(), Theta.A.T.flatten()))
        return np.hstack((X.A.T.flatten(), Theta.A.T.flatten()))

    def unroll(vector):
        """
        Args:
            vector 参数向量
        Returns:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        """
        X = np.mat(vector[:nm * n].reshape(n, nm).T)
        Theta = np.mat(vector[nm * n:].reshape(n, nu).T)
        return Theta, X

    def initParams():
        """初始化参数

        Returns:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        """
        Theta = np.mat(np.random.rand(nu, n))
        X = np.mat(np.random.rand(nm, n))
        return Theta, X

    def regularize(param):
        """对参数进行正则化
        Args:
            param - 参数
        Return:
            regParam - 正规化后的参数
        """
        return theLambda * 0.5 * np.sum(np.power(param, 2))

    def J(params):
        """代价函数

        Args:
            params - 参数向量
            nu - 用户数
            nm - 电影数
            n - 特征数
        Return:
            J - 预测代价
        """
        # 参数展开
        Theta, X = unroll(params)
        # 计算误差
        rows, cols = np.nonzero(R)
        # 预测
        h = predict(Theta, X)
        diff = h - Y
        diff[R != 1] = 0
        error = 0.5 * np.sum(np.power(diff, 2))
        
        #  正则化 Theta
        regTheta = regularize(Theta)
        #  正规化 x
        regX = regularize(X)
        
        return error + regTheta + regX

    def gradient(params):
        """计算梯度

        Args:
            params - 参数向量
        Returns:
            grad - 梯度向量
        """
        Theta, X = unroll(params)
        
        # 当前梯度初始化成0
        ThetaGrad = np.mat(np.zeros(Theta.shape))
        XGrad = np.mat(np.zeros(X.shape))
        
        error = predict(Theta, X) - Y
        error[R != 1] = 0
        
        # 这里只需要计算梯度
        ThetaGrad = error.T * X + theLambda * Theta
        XGrad =  error * Theta + theLambda * X
        
        return roll(ThetaGrad, XGrad)

    def train():
        """训练

        Returns:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        """
        # 初始化参数
        if not params:
            Theta, X = initParams()
        else:
            Theta = params['Theta']
            X = params['X']
            
        # 最小化目标函数
        res = minimize(J, x0=roll(Theta, X), jac=gradient,
                       method='CG', options={'disp': True, 'maxiter': maxIter})
        Theta, X = unroll(res.x)
        return Theta, X

    def predict(Theta, X):
        """预测
        Args:
            Theta - 用户偏好矩阵
            X - 电影特征矩阵
        Return:
            h 预测
        """
        return X * Theta.T + mu

    def getTopRecommends(Theta, X, i, count, rated, items):
        """获得推荐

        Args:
            Theta - 用户偏好矩阵
            X - 影片特征矩阵
            i - 用户索引
            count - 目标推荐数量
            rated - 已经评价的影片id
            items - 影片库
        Returns:
            topRecommends - 推荐项目
        """
        predictions = predict(Theta, X)[:, i]
        
        # 实用pandas的DataFrame可以将不同类型数据放在一个Frame中,方便排序等操作
        # 相较而言,numpy的多维数组要求内部类型完全一致
        df = pd.DataFrame(data=predictions, columns=['prediction',])
        df['movie'] = items
        df.sort_values(by='prediction', ascending=False,inplace=True)
        # 不推荐已经评过分的影片
        df.drop(rated, inplace=True)
        
        return df[0:count]

    return train, predict, getTopRecommends

In [ ]:

 

In [3]:

from scipy.io import loadmat

In [4]:

data = loadmat('data/ex8_movies.mat')
Y = data['Y']
R = data['R']

In [5]:

movieParams = loadmat('data/ex8_movieParams.mat')
nm = movieParams['num_movies'][0,0]
n = movieParams['num_features'][0,0]

In [6]:

def getMovie(line):
    return ' '.join(line.split()[1:])
movieList = []
with open('data/movie_ids.txt') as f:
    for line in f:
        movieList.append(getMovie(line.strip()))

In [7]:

myRatings = np.mat(np.zeros((nm,1)))

myRatings[0] = 4
myRatings[97] = 2
myRatings[6] = 3
myRatings[11] = 5
myRatings[53] = 4
myRatings[63] = 5
myRatings[65] = 3
myRatings[68] = 5
myRatings[182] = 4
myRatings[225] = 5
myRatings[354] = 5

print u'我的评分:'
for i in range(nm):
    if myRatings[i] > 0:
        print u'{:<50} {}'.format( movieList[i], myRatings[i].A[0,0])
我的评分:
Toy Story (1995)                                   4.0
Twelve Monkeys (1995)                              3.0
Usual Suspects, The (1995)                         5.0
Outbreak (1995)                                    4.0
Shawshank Redemption, The (1994)                   5.0
While You Were Sleeping (1995)                     3.0
Forrest Gump (1994)                                5.0
Silence of the Lambs, The (1991)                   2.0
Alien (1979)                                       4.0
Die Hard 2 (1990)                                  5.0
Sphere (1998)                                      5.0

In [8]:

# 将我们的新用户数据加入
Y = np.column_stack((myRatings, Y))
R = np.column_stack((myRatings, R)).astype(bool)

In [9]:

train, predict, getTopRecommends = getRecommender(
    Y, R, n=n, theLambda=10.0)

In [10]:

Theta, X = train()
Warning: Maximum number of iterations has been exceeded.
         Current function value: 71147.130011
         Iterations: 100
         Function evaluations: 167
         Gradient evaluations: 167

In [11]:

rated = np.nonzero(myRatings)[0].tolist()
# -1 就是我们刚才加入的最新用户
topRecommends = getTopRecommends(Theta, X, -1, 10, rated, movieList)
topRecommends

Out[11]:

 predictionmovie
8134.659971Great Day in Harlem, A (1994)
15984.131668Someone Else's America (1995)
16523.854358Entertaining Angels: The Dorothy Day Story (1996)
11213.640033They Made Me a Criminal (1939)
15353.633231Aiqing wansui (1994)
12003.619692Marlene Dietrich: Shadow and Light (1996)
12923.317189Star Kid (1997)
14993.303447Santa with Muscles (1996)
16123.221008Tokyo Fist (1995)
14973.137354Farmer & Chase (1995)

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值