协同过滤
推荐系统
In [1]:
import numpy as np from scipy.optimize import minimize import pandas as pd
In [2]:
def getRecommender(Y, R, params=None, n=10, theLambda=10, maxIter=100): """ Args: Y - 用户对影片的评分矩阵 R - 用户j是否对影片i评分的矩阵 (0/1) params - 若有初始化参数,可在此传入(Theta, X) n - 电影的特征数 theLambda - 正则化参数 maxIter - 最大迭代次数 Returns: train - 训练函数 predict - 预测函数 getTopRecommends - 获取特定影片的最相似推荐 """ # 影片数,用户数 nm, nu = Y.shape # 标准化影片的评分 mu = np.zeros((Y.shape[0], 1), dtype=np.float) for i in range(nm): totalRates = np.sum(Y[i]) validCount = len(np.nonzero(R[i])[0]) mu[i] = totalRates / validCount Y = Y - mu def roll(Theta, X): """ 对于模型而言,Theta和X都是待学习的参数,需要放在一起直接优化 Args: Theta - 用户偏好矩阵 X - 电影特征矩阵 Returns: vector - 折叠后的参数 """ #return np.hstack((X.A.T.flatten(), Theta.A.T.flatten())) return np.hstack((X.A.T.flatten(), Theta.A.T.flatten())) def unroll(vector): """ Args: vector 参数向量 Returns: Theta - 用户偏好矩阵 X - 电影特征矩阵 """ X = np.mat(vector[:nm * n].reshape(n, nm).T) Theta = np.mat(vector[nm * n:].reshape(n, nu).T) return Theta, X def initParams(): """初始化参数 Returns: Theta - 用户偏好矩阵 X - 电影特征矩阵 """ Theta = np.mat(np.random.rand(nu, n)) X = np.mat(np.random.rand(nm, n)) return Theta, X def regularize(param): """对参数进行正则化 Args: param - 参数 Return: regParam - 正规化后的参数 """ return theLambda * 0.5 * np.sum(np.power(param, 2)) def J(params): """代价函数 Args: params - 参数向量 nu - 用户数 nm - 电影数 n - 特征数 Return: J - 预测代价 """ # 参数展开 Theta, X = unroll(params) # 计算误差 rows, cols = np.nonzero(R) # 预测 h = predict(Theta, X) diff = h - Y diff[R != 1] = 0 error = 0.5 * np.sum(np.power(diff, 2)) # 正则化 Theta regTheta = regularize(Theta) # 正规化 x regX = regularize(X) return error + regTheta + regX def gradient(params): """计算梯度 Args: params - 参数向量 Returns: grad - 梯度向量 """ Theta, X = unroll(params) # 当前梯度初始化成0 ThetaGrad = np.mat(np.zeros(Theta.shape)) XGrad = np.mat(np.zeros(X.shape)) error = predict(Theta, X) - Y error[R != 1] = 0 # 这里只需要计算梯度 ThetaGrad = error.T * X + theLambda * Theta XGrad = error * Theta + theLambda * X return roll(ThetaGrad, XGrad) def train(): """训练 Returns: Theta - 用户偏好矩阵 X - 电影特征矩阵 """ # 初始化参数 if not params: Theta, X = initParams() else: Theta = params['Theta'] X = params['X'] # 最小化目标函数 res = minimize(J, x0=roll(Theta, X), jac=gradient, method='CG', options={'disp': True, 'maxiter': maxIter}) Theta, X = unroll(res.x) return Theta, X def predict(Theta, X): """预测 Args: Theta - 用户偏好矩阵 X - 电影特征矩阵 Return: h 预测 """ return X * Theta.T + mu def getTopRecommends(Theta, X, i, count, rated, items): """获得推荐 Args: Theta - 用户偏好矩阵 X - 影片特征矩阵 i - 用户索引 count - 目标推荐数量 rated - 已经评价的影片id items - 影片库 Returns: topRecommends - 推荐项目 """ predictions = predict(Theta, X)[:, i] # 实用pandas的DataFrame可以将不同类型数据放在一个Frame中,方便排序等操作 # 相较而言,numpy的多维数组要求内部类型完全一致 df = pd.DataFrame(data=predictions, columns=['prediction',]) df['movie'] = items df.sort_values(by='prediction', ascending=False,inplace=True) # 不推荐已经评过分的影片 df.drop(rated, inplace=True) return df[0:count] return train, predict, getTopRecommends
In [3]:
from scipy.io import loadmat
In [4]:
data = loadmat('data/ex8_movies.mat') Y = data['Y'] R = data['R']
In [5]:
movieParams = loadmat('data/ex8_movieParams.mat') nm = movieParams['num_movies'][0,0] n = movieParams['num_features'][0,0]
In [6]:
def getMovie(line): return b' '.join(line.split()[1:]) movieList = [] with open('data/movie_ids.txt', 'rb') as f: for line in f: movieList.append(getMovie(line.strip()))
In [7]:
myRatings = np.mat(np.zeros((nm,1))) myRatings[0] = 4 myRatings[97] = 2 myRatings[6] = 3 myRatings[11] = 5 myRatings[53] = 4 myRatings[63] = 5 myRatings[65] = 3 myRatings[68] = 5 myRatings[182] = 4 myRatings[225] = 5 myRatings[354] = 5 print(u'我的评分:') for i in range(nm): if myRatings[i] > 0: print('{:<50} {}'.format( movieList[i].decode('utf-8'), myRatings[i].A[0,0]))
我的评分: Toy Story (1995) 4.0 Twelve Monkeys (1995) 3.0 Usual Suspects, The (1995) 5.0 Outbreak (1995) 4.0 Shawshank Redemption, The (1994) 5.0 While You Were Sleeping (1995) 3.0 Forrest Gump (1994) 5.0 Silence of the Lambs, The (1991) 2.0 Alien (1979) 4.0 Die Hard 2 (1990) 5.0 Sphere (1998) 5.0
In [8]:
# 将我们的新用户数据加入 Y = np.column_stack((myRatings, Y)) R = np.column_stack((myRatings, R)).astype(bool)
In [9]:
train, predict, getTopRecommends = getRecommender( Y, R, n=n, theLambda=10.0)
In [10]:
Theta, X = train()
Warning: Maximum number of iterations has been exceeded. Current function value: 71136.873769 Iterations: 100 Function evaluations: 156 Gradient evaluations: 156
In [11]:
rated = np.nonzero(myRatings)[0].tolist() # -1 就是我们刚才加入的最新用户 topRecommends = getTopRecommends(Theta, X, -1, 10, rated, movieList) topRecommends
Out[11]:
prediction | movie | |
---|---|---|
813 | 4.760439 | b'Great Day in Harlem, A (1994)' |
1598 | 4.249605 | b"Someone Else's America (1995)" |
1652 | 3.786093 | b'Entertaining Angels: The Dorothy Day Story (... |
1535 | 3.749902 | b'Aiqing wansui (1994)' |
1499 | 3.680961 | b'Santa with Muscles (1996)' |
1200 | 3.648143 | b'Marlene Dietrich: Shadow and Light (1996)' |
1121 | 3.638303 | b'They Made Me a Criminal (1939)' |
1497 | 3.398012 | b'Farmer & Chase (1995)' |
1491 | 3.396144 | b'Window to Paris (1994)' |
1612 | 3.246720 | b'Tokyo Fist (1995)' |