# -*- coding: utf-8 -*-
"""
Created on Sat Jul 4 16:49:40 2020
@author: cheetah023
"""
import numpy as np
import scipy.io as sci
import matplotlib.pyplot as plt
import scipy.optimize as opt
#函数定义
def cofiCostFunc(params, Y, R, num_users,
                 num_movies, num_features, lamda):
    """Return the regularized collaborative-filtering cost.

    Parameters
    ----------
    params : 1-D array
        Flattened parameters: the first ``num_movies * num_features``
        entries are reshaped to ``X`` (num_movies, num_features) and the
        remainder to ``Theta`` (num_users, num_features).
    Y : array
        Ratings; must broadcast against ``X @ Theta.T``, i.e.
        (num_movies, num_users).
    R : array
        Mask multiplied element-wise into the squared error, so only
        entries where ``R`` is non-zero contribute.
    num_users, num_movies, num_features : int
        Dimensions used to unpack ``params``.
    lamda : float
        Regularization strength applied to both ``X`` and ``Theta``.

    Returns
    -------
    float
        ``sum(((X Theta^T - Y)^2) * R) / 2
        + lamda / 2 * (sum(Theta^2) + sum(X^2))``.
    """
    # Index at which the flattened X ends and the flattened Theta begins.
    split = num_movies * num_features
    X = np.reshape(params[:split], [num_movies, num_features])
    Theta = np.reshape(params[split:], [num_users, num_features])

    # Squared prediction error, zeroed wherever R is zero.
    masked_sq_error = ((np.dot(X, Theta.T) - Y) ** 2) * R
    reg = (lamda / 2) * (np.sum(Theta ** 2) + np.sum(X ** 2))
    return np.sum(masked_sq_error) / 2 + reg
def cofiGradient(params, Y, R, num_users,
                 num_movies, num_features, lamda):
    """Return the gradient of the regularized collaborative-filtering cost.

    Parameters
    ----------
    params : 1-D array
        Flattened parameters: the first ``num_movies * num_features``
        entries are reshaped to ``X`` (num_movies, num_features) and the
        remainder to ``Theta`` (num_users, num_features).
    Y : array
        Ratings; must broadcast against ``X @ Theta.T``.
    R : array
        Mask multiplied element-wise into the prediction error.
    num_users, num_movies, num_features : int
        Dimensions used to unpack ``params``.
    lamda : float
        Regularization strength.

    Returns
    -------
    1-D array
        ``X_grad`` flattened, followed by ``Theta_grad`` flattened, i.e.
        the same layout as ``params``.
    """
    split = num_movies * num_features
    X = np.reshape(params[:split], [num_movies, num_features])
    Theta = np.reshape(params[split:], [num_users, num_features])

    # Both partial derivatives share this term, so compute it only once.
    masked_error = (X @ Theta.T - Y) * R
    X_grad = masked_error @ Theta + lamda * X
    Theta_grad = masked_error.T @ X + lamda * Theta
    return np.r_[X_grad.flatten(), Theta_grad.flatten()]
def loadMovieList(path='movie_ids.txt'):
    """Read movie titles from a Latin-1 text file, one title per line.

    Everything up to and including the first space on a line is dropped
    (a line without a space is kept whole), and surrounding whitespace,
    including the line terminator, is stripped from the remainder.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``'movie_ids.txt'``.

    Returns
    -------
    list of str
        One entry per line of the file, in file order, so that list
        position matches line position.
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(path, 'r', encoding='Latin-1') as fid:
        return [line.split(' ', 1)[-1].strip() for line in fid]
def normalizeRatings(Y, R):
    """Subtract each row's mean rating from its rated entries.

    Only entries where ``R`` is non-zero count towards a row's mean. A row
    with no rated entries gets a mean of 0 rather than a division by zero,
    which would otherwise put NaN into the whole normalized row.

    Parameters
    ----------
    Y : 2-D array
        Ratings, one row per item.
    R : 2-D array
        Same shape as ``Y``; non-zero where the rating is present.

    Returns
    -------
    y_norm : 2-D float array
        ``Y`` minus the row mean where rated, 0 elsewhere.
    y_mean : 2-D float array of shape (rows, 1)
        Mean of the rated entries of each row.
    """
    Y = np.asarray(Y, dtype=float)
    rated = np.asarray(R) != 0

    counts = rated.sum(axis=1)
    # Sum under the mask so stray values in unrated cells cannot skew the mean.
    totals = np.where(rated, Y, 0.0).sum(axis=1)

    # Rows without any rating keep the pre-filled mean of 0.
    y_mean = np.zeros(Y.shape[0])
    np.divide(totals, counts, out=y_mean, where=counts > 0)
    y_mean = y_mean.reshape(-1, 1)

    # Broadcasting the (rows, 1) mean replaces the explicit np.tile.
    y_norm = (Y - y_mean) * rated
    return y_norm, y_mean
# Part 1: Loading movie ratings dataset
# Load the ratings file and pull out the two matrices used below.
data = sci.loadmat('ex8_movies.mat')
# print('data.keys', data.keys())
Y, R = data['Y'], data['R']
print('Y:', Y.shape)
print('R:', R.shape)

# Mean of the first row of Y over the entries counted by R.
score1 = Y[0].sum() / R[0].sum()
print('Average rating for movie 1 (Toy Story):', score1)

# Show the matrix as an image: rows are movies, columns are users.
plt.figure(0)
plt.imshow(Y)
plt.xlabel('Users')
plt.ylabel('Movies')
# Part 2: Collaborative Filtering Cost Function
# Load the pre-trained parameters shipped with the exercise.
data = sci.loadmat('ex8_movieParams.mat')
# print('data', data.keys())
X = data['X']
Theta = data['Theta']

# Shrink the problem so the check runs fast. The sizes stored in the .mat
# file are deliberately not read: they would be overwritten here anyway.
num_users = 4
num_movies = 5
num_features = 3
X = X[:num_movies, :num_features]
Theta = Theta[:num_users, :num_features]
Y = Y[:num_movies, :num_users]
R = R[:num_movies, :num_users]

# Cost without regularization (lamda = 0) on the reduced problem.
params = np.r_[X.flatten(), Theta.flatten()]
J = cofiCostFunc(params, Y, R, num_users,
                 num_movies, num_features, 0)
print('Cost at loaded parameters:',J)
print('(this value should be about 22.22)')
# Part 3: Collaborative Filtering Gradient
# (the gradient-checking function has not been written)

# Part 4: Collaborative Filtering Cost Regularization
# Same reduced problem as above, now with a non-zero regularization term.
J = cofiCostFunc(
    params, Y, R,
    num_users, num_movies, num_features,
    lamda=1.5,
)
print('Cost at loaded parameters (lambda = 1.5):', J)
print('(this value should be about 31.34)')

# Part 5: Collaborative Filtering Gradient Regularization
# (the gradient-checking function has not been written)
# Part 6: Entering ratings for a new user
movieList = loadMovieList()

# Column vector of the new user's ratings; 0 means "not rated".
my_ratings = np.zeros([1682, 1])
for movie_idx, rating in (
    (0, 4),
    (97, 2),
    (6, 3),
    (11, 5),
    (53, 4),
    (63, 5),
    (65, 3),
    (68, 5),
    (182, 4),
    (225, 5),
    (354, 5),
):
    my_ratings[movie_idx] = rating

# Index the scalar explicitly: formatting a one-element array with %d relies
# on an array-to-scalar conversion that NumPy has deprecated.
for i in np.flatnonzero(my_ratings[:, 0] > 0):
    print('Rated %d for %s'%(my_ratings[i, 0],movieList[i]))
# Part 7: Learning Movie Ratings
# Reload the full ratings (Part 2 sliced Y and R down) and prepend the new
# user's ratings as column 0.
data = sci.loadmat('ex8_movies.mat')
R_myrating = my_ratings != 0
Y = np.column_stack([my_ratings, data['Y']])
R = np.column_stack([R_myrating, data['R']])

Ynorm, Ymean = normalizeRatings(Y, R)

num_movies, num_users = Y.shape
num_features = 10

# Random starting point; X is drawn before theta.
X = np.random.random([num_movies, num_features])
theta = np.random.random([num_users, num_features])
initial_params = np.concatenate([X.ravel(), theta.ravel()])

lamda = 10
params = opt.fmin_cg(
    cofiCostFunc,
    initial_params,
    fprime=cofiGradient,
    args=(Ynorm, R, num_users, num_movies, num_features, lamda),
    maxiter=100,
    disp=True,
)

# Unpack the optimized vector back into the two factor matrices.
split = num_movies * num_features
X = params[:split].reshape(num_movies, num_features)
theta = params[split:].reshape(num_users, num_features)

# Column 0 belongs to the new user; add the row means back.
P = X @ theta.T
my_predictions = P[:, 0] + Ymean.flatten()
print('P', P.shape)
print('Ymean', Ymean.shape)
print('my_predictions', my_predictions.shape)

# Movie indices ordered from highest to lowest predicted rating.
idx = np.argsort(my_predictions)[::-1]
print('idx', idx.shape)
for rank in range(10):
    best = idx[rank]
    print('Predicting rating ', my_predictions[best])
    print('for ', movieList[best])
运行结果:
Y: (1682, 943)
R: (1682, 943)
Average rating for movie 1 (Toy Story): 3.8783185840707963
Cost at loaded parameters: 22.224603725685675
(this value should be about 22.22)
Cost at loaded parameters (lambda = 1.5): 31.34405624427422
(this value should be about 31.34)
Rated 4 for Toy Story (1995)
Rated 3 for Twelve Monkeys (1995)
Rated 5 for Usual Suspects, The (1995)
Rated 4 for Outbreak (1995)
Rated 5 for Shawshank Redemption, The (1994)
Rated 3 for While You Were Sleeping (1995)
Rated 5 for Forrest Gump (1994)
Rated 2 for Silence of the Lambs, The (1991)
Rated 4 for Alien (1979)
Rated 5 for Die Hard 2 (1990)
Rated 5 for Sphere (1998)
Warning: Maximum number of iterations has been exceeded.
Current function value: 38956.345281
Iterations: 100
Function evaluations: 153
Gradient evaluations: 153
P (1682, 944)
Ymean (1682, 1)
my_predictions (1682,)
idx (1682,)
Predicting rating 5.00000042235092
for Marlene Dietrich: Shadow and Light (1996)
Predicting rating 5.000000293775594
for Santa with Muscles (1996)
Predicting rating 5.000000254944607
for They Made Me a Criminal (1939)
Predicting rating 5.000000222955008
for Entertaining Angels: The Dorothy Day Story (1996)
Predicting rating 5.000000130674626
for Saint of Fort Washington, The (1993)
Predicting rating 5.000000102308117
for Great Day in Harlem, A (1994)
Predicting rating 4.999999978328779
for Someone Else's America (1995)
Predicting rating 4.999995528723899
for Star Kid (1997)
Predicting rating 4.9999934984042165
for Aiqing wansui (1994)
Predicting rating 4.99999223680014
for Prefontaine (1997)
总结:
1、检查梯度的函数checkCostFunction没有实现
2、矩阵乘法运算符@用起来比np.dot简洁