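# Reference: "吴恩达|机器学习作业8.1.推荐系统(协同过滤算法)" (Andrew Ng Machine Learning, exercise 8.1: recommender system with collaborative filtering), from the CSDN blog 学吧学吧终成学霸的博客.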
import numpy as np
from scipy import io
from scipy import optimize as opt
# 1. Load the data set.
# The Windows paths are written as raw strings so that the backslashes stay
# literal: sequences such as "\h", "\d" and "\e" are invalid escapes in a
# normal string literal and trigger a SyntaxWarning on recent Python versions.
dt0 = io.loadmat(r"E:\homework\作业8-异常检测与推荐系统\data\ex8_movies.mat")
y = dt0["Y"]  # ratings matrix; (1682, 943) per the original author's note
R = dt0["R"]  # rated/not-rated indicator; (1682, 943) per the original note
dt1 = io.loadmat(r"E:\homework\作业8-异常检测与推荐系统\data\ex8_movieParams.mat")
x = dt1["X"]  # (1682, 10) per the original note: feature values of 1682 movies
theta = dt1["Theta"]  # (943, 10) per the original note: feature values of 943 users
num_movies = y.shape[0]  # 1682
num_users = y.shape[1]  # 943
num_features = x.shape[1]
# 2. Define the cost function and test it.
def cofiCostFunc(xAndTheta,y,r,numMovies,numUsers,numFeatures,lamda):
    """Return the regularised collaborative-filtering cost.

    Args:
        xAndTheta: Flat parameter vector. The first ``numMovies * numFeatures``
            entries are the movie-feature matrix (row-major), the remainder
            is the user-parameter matrix (row-major).
        y: Ratings matrix of shape ``(numMovies, numUsers)``.
        r: Indicator matrix with the same shape as ``y``; entries where it is
            0 contribute nothing to the squared-error term.
        numMovies: Number of rows of the movie-feature matrix.
        numUsers: Number of rows of the user-parameter matrix.
        numFeatures: Number of columns of both matrices.
        lamda: Regularisation strength.

    Returns:
        ``0.5 * sum(r * (X @ Theta.T - y)**2)
        + 0.5 * lamda * (sum(Theta**2) + sum(X**2))``.
    """
    boundary = numMovies * numFeatures
    movie_feats = np.reshape(xAndTheta[:boundary], (numMovies, numFeatures))
    user_params = np.reshape(xAndTheta[boundary:], (numUsers, numFeatures))

    # Squared prediction error, kept only where a rating exists (r == 1).
    residual = movie_feats @ user_params.T - y
    masked_sq_error = np.power(residual, 2) * r

    squared_norms = np.sum(np.power(user_params, 2)) + np.sum(np.power(movie_feats, 2))
    penalty = 0.5 * lamda * squared_norms
    return 0.5 * np.sum(masked_sq_error) + penalty
"""#测试小数据集上的误差
small_users = 4
small_movies = 5
small_features = 3
small_x = x[0:small_movies, 0:small_features]
small_theta = theta[0:small_users, 0:small_features]
small_y = y[0:small_movies, 0:small_users]
small_r = R[0:small_movies, 0:small_users]
x_and_theta = np.concatenate((small_x.ravel(),small_theta.ravel()),0)
result = cofiCostFunc(x_and_theta,small_y,small_r,small_movies,small_users,small_features,1.5)
# Without regularization the cost is 22.224603725685675
# With regularization parameter lamda = 1.5 the cost is 31.34405624427422
"""
# 3. Define the gradient function.
def cofiGradient(xAndTheta,y,r,numMovies,numUsers,numFeatures,lamda):
    """Return the gradient of the regularised collaborative-filtering cost.

    Args:
        xAndTheta: Flat parameter vector. The first ``numMovies * numFeatures``
            entries are the movie-feature matrix (row-major), the remainder
            is the user-parameter matrix (row-major).
        y: Ratings matrix of shape ``(numMovies, numUsers)``.
        r: Indicator matrix with the same shape as ``y``; entries where it is
            0 are excluded from the error term.
        numMovies: Number of rows of the movie-feature matrix.
        numUsers: Number of rows of the user-parameter matrix.
        numFeatures: Number of columns of both matrices.
        lamda: Regularisation strength.

    Returns:
        A flat vector laid out like ``xAndTheta``: the gradient with respect
        to the movie features followed by the gradient with respect to the
        user parameters.
    """
    boundary = numMovies * numFeatures
    movie_feats = np.reshape(xAndTheta[:boundary], (numMovies, numFeatures))
    user_params = np.reshape(xAndTheta[boundary:], (numUsers, numFeatures))

    # Prediction error, zeroed wherever no rating exists (r == 0).
    masked_error = (movie_feats @ user_params.T - y) * r

    grad_movies = masked_error @ user_params + lamda * movie_feats
    grad_users = masked_error.T @ movie_feats + lamda * user_params
    return np.concatenate((grad_movies.ravel(), grad_users.ravel()), 0)
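# 4. Enter ratings for a new user.
# Some titles in the movie-name file appear to be French, which causes encoding problems (UnicodeDecodeError), hence the following steps.
# Check which characters in the file fail to decode.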
"""f = open('E:\homework\作业8-异常检测与推荐系统\data\movie_ids.txt', "rb") # 二进制格式读文件
lines = f.readlines()
for line in lines:
try:
line.decode('utf8') # 解码
except:
# Print the lines whose encoding is problematic
print(str(line))
f.close"""
# The characters that looked French were edited by hand and the file was saved
# as movie_ids_mod.txt; the movie list is then read in.
# NOTE(review): the path below opens movieList.txt, not movie_ids_mod.txt --
# confirm which file name is intended.
# NOTE(review): no encoding is passed, so the platform default is used, as in
# the original code -- confirm the file's actual encoding.
# The `with` block guarantees the file is closed (the previous `t.close`
# lacked call parentheses and therefore never closed it). The path is a raw
# string so its backslashes are not read as escape sequences.
with open(r'E:\homework\作业8-异常检测与推荐系统\data\movieList.txt') as t:
    # Each line is "<id> <title>": strip surrounding whitespace and keep
    # everything after the first space. Yields a list of 1682 titles per the
    # original author's note.
    movie_list = [line.strip().partition(' ')[2] for line in t]
# Initialise my own ratings: one row per movie, 0 meaning "not rated".
my_ratings = np.zeros((1682, 1))
# Add movie ratings. Note that the indices here are one less than in the
# assignment because they start from 0.
_my_rated_movies = {
    0: 4,
    97: 2,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    182: 4,
    225: 5,
    354: 5,
}
my_ratings[list(_my_rated_movies), 0] = list(_my_rated_movies.values())
# 6. Train the collaborative filtering algorithm.
# My ratings are prepended as a new first column, so the new user sits at
# column index 0 of both widened matrices.
y_new = np.concatenate([my_ratings, y], axis=1)  # (1682, 944)
_rated_by_me = my_ratings != 0  # indicator column matching my_ratings
R_new = np.concatenate([_rated_by_me, R], axis=1)  # (1682, 944)
def normalizeRatings(y, r):
    """Subtract each row's mean rating from its rated entries.

    Args:
        y: Ratings matrix, one row per movie and one column per user.
        r: Indicator matrix with the same shape as ``y``; non-zero where a
            rating exists.

    Returns:
        A tuple ``(y_norm, mu)``. ``mu`` has shape ``(len(y), 1)`` and holds
        the mean of the rated entries of each row (0 for a row without any
        rating). ``y_norm`` is ``(y - mu) * r``, so unrated entries stay 0.
    """
    y = np.asarray(y)
    r = np.asarray(r)
    # Sum only the entries flagged as rated, so stray values in unrated
    # cells cannot distort the mean.
    totals = np.sum(y * r, axis=1)
    counts = np.sum(r, axis=1)
    # Rows without any rating would give 0/0 (NaN) and poison the whole
    # normalised row; leave their mean at 0 instead.
    mu = np.divide(
        totals,
        counts,
        out=np.zeros(len(y), dtype=float),
        where=counts != 0,
    )
    # Reshape (n,) -> (n, 1) so the subtraction broadcasts across columns.
    mu = mu.reshape((len(y), 1))
    y_norm = (y - mu) * r  # unrated entries remain 0
    return y_norm, mu
# Mean-normalise the widened ratings matrix.
y_norm, y_mean = normalizeRatings(y_new, R_new)
new_movies, new_users = y_new.shape
new_features = 10
# Randomly initialise the parameters (movie features first, then user
# parameters) and flatten them into the single vector the optimiser expects.
new_x = np.random.randn(new_movies, new_features)
new_theta = np.random.randn(new_users, new_features)
initial_parameters = np.concatenate([new_x.ravel(), new_theta.ravel()])
# Train the model.
lmd = 10
# Fit on the mean-normalised ratings (y_norm): y_mean is added back to the
# predictions afterwards, so training on the raw ratings would make every
# prediction too large by the movie's mean rating.
# NOTE(review): recent SciPy releases may expect 'maxfun' rather than
# 'maxiter' for method='TNC' -- confirm against the installed version.
res = opt.minimize(fun=cofiCostFunc,
                   x0=initial_parameters,
                   args=(y_norm, R_new, new_movies, new_users, new_features, lmd),
                   method='TNC',
                   jac=cofiGradient,
                   options={'maxiter': 100})
# Unpack the fitted flat vector: movie features first, then user parameters.
params = res.x
x_result = np.reshape(params[0:new_movies*new_features], (new_movies, new_features))
theta_result = np.reshape(params[new_movies*new_features:], (new_users, new_features))
# 7. Predict ratings and recommend.
# Predicted rating matrix (movies x users).
p = x_result @ theta_result.T
# My predicted ratings: column 0 belongs to the user that was prepended to the
# ratings matrix. It is reshaped to a column so it can be added to y_mean,
# then flattened again for convenient printing.
my_predictions = (p[:, 0].reshape(len(p), 1) + y_mean).flatten()
# The 10 movies recommended for me.
ix = np.argsort(my_predictions)[::-1]  # indices sorted from highest to lowest prediction
print('Top recommendations for you:')
for j in ix[:10]:  # slicing tolerates a list with fewer than 10 movies
    print('Predicting rating %.1f for movie %s' %(my_predictions[j], movie_list[j]))
# My original ratings.
print(" ")
print('Original ratings provided:')
# Iterate over the single column so each rating is a scalar; formatting a
# 1-element array with %d relies on a deprecated array-to-scalar conversion.
for i, rating in enumerate(my_ratings[:, 0]):
    if rating > 0:
        print('Rated %d for %s' %(rating, movie_list[i]))