吴恩达机器学习python作业之推荐系统

最新推荐文章于 2024-05-10 00:37:51 发布

abcd1233463457347

最新推荐文章于 2024-05-10 00:37:51 发布

阅读量312

点赞数

分类专栏：吴恩达机器学习作业　文章标签：python、推荐算法

本文链接：https://blog.csdn.net/Ariya1234/article/details/127641907

版权

吴恩达机器学习作业专栏收录该内容

14 篇文章 0 订阅

订阅专栏

参考链接：吴恩达|机器学习作业8.1.推荐系统（协同过滤算法）_学吧学吧终成学霸的博客-CSDN博客

在这里插入图片描述

import numpy as np
from scipy import io
from scipy import optimize as opt

# 1. Load the data sets.
# The Windows paths are written as raw strings so the backslashes stay
# literal; in a normal string, sequences such as "\h", "\d" and "\e" are
# invalid escapes that newer Python versions warn about.
dt0 = io.loadmat(r"E:\homework\作业8-异常检测与推荐系统\data\ex8_movies.mat")
y = dt0["Y"]  # (1682, 943) per the original author's note
R = dt0["R"]  # (1682, 943); used below as the "has been rated" mask


dt1 = io.loadmat(r"E:\homework\作业8-异常检测与推荐系统\data\ex8_movieParams.mat")
x = dt1["X"]  # (1682, 10): feature values of the 1682 movies
theta = dt1["Theta"]  # (943, 10): feature values of the 943 users
num_movies = y.shape[0]  # 1682
num_users = y.shape[1]  # 943
num_features = x.shape[1]

#2.定义代价函数并测试
def cofiCostFunc(xAndTheta,y,r,numMovies,numUsers,numFeatures,lamda):
    """Return the regularized collaborative-filtering cost.

    xAndTheta is a flat parameter vector: the first numMovies*numFeatures
    entries are the movie features (row-major), the remainder are the user
    parameters.  y holds the ratings and r weights each squared error, so
    entries with r == 0 do not contribute.  lamda scales the L2 penalty on
    both parameter matrices.
    """
    cut = numMovies * numFeatures
    flat_movies, flat_users = np.split(xAndTheta, [cut])
    movie_feats = flat_movies.reshape(numMovies, numFeatures)
    user_params = flat_users.reshape(numUsers, numFeatures)

    # Squared prediction error, weighted by r (only rated entries count).
    weighted_sq_err = np.square(movie_feats @ user_params.T - y) * r

    penalty = 0.5 * lamda * (np.sum(np.square(user_params)) + np.sum(np.square(movie_feats)))
    return 0.5 * np.sum(weighted_sq_err) + penalty


# The string literal below is a disabled sanity check: it evaluates the cost
# on a small slice of the data (5 movies, 4 users, 3 features).  The values
# recorded by the author are 22.224603725685675 without regularization and
# 31.34405624427422 with lamda = 1.5.
"""#测试小数据集上的误差
small_users = 4
small_movies = 5
small_features = 3
small_x = x[0:small_movies, 0:small_features]
small_theta = theta[0:small_users, 0:small_features]
small_y = y[0:small_movies, 0:small_users]
small_r = R[0:small_movies, 0:small_users]


x_and_theta = np.concatenate((small_x.ravel(),small_theta.ravel()),0)
result = cofiCostFunc(x_and_theta,small_y,small_r,small_movies,small_users,small_features,1.5)
#没有正则化时，22.224603725685675
#正则化参数lamda = 1.5时，31.34405624427422
"""

#3.定义梯度函数
def cofiGradient(xAndTheta,y,r,numMovies,numUsers,numFeatures,lamda):
    """Return the gradient of the regularized collaborative-filtering cost.

    Takes the same flat parameter layout as the cost function (movie
    features first, then user parameters) and returns the gradient in that
    same flat layout.  Entries with r == 0 contribute no error.
    """
    cut = numMovies * numFeatures
    flat_movies, flat_users = np.split(xAndTheta, [cut])
    movie_feats = flat_movies.reshape(numMovies, numFeatures)
    user_params = flat_users.reshape(numUsers, numFeatures)

    # Prediction error, weighted by r (only rated entries count).
    weighted_err = (movie_feats @ user_params.T - y) * r

    grad_movies = weighted_err @ user_params + lamda * movie_feats
    grad_users = weighted_err.T @ movie_feats + lamda * user_params
    return np.concatenate([grad_movies.ravel(), grad_users.ravel()])


# 4. Ratings from a new user
# Some titles in the movie-name file appear to be French, which causes a
# UnicodeDecodeError when reading it, so the following steps were taken.
# Disabled one-off check (kept as a string literal, never executed): it reads
# the file in binary mode and prints every line that fails to decode as UTF-8.
# NOTE(review): the last line of the snippet is `f.close` without parentheses,
# which would not close the file if this code were re-enabled.
"""f = open('E:\homework\作业8-异常检测与推荐系统\data\movie_ids.txt', "rb")  # 二进制格式读文件
lines = f.readlines()
for line in lines:
    try:
        line.decode('utf8')  # 解码
    except:
        # 打印编码有问题的行
        print(str(line))
f.close"""

# The characters that looked French were edited by hand and the file was
# saved under a new name; the movie list is read from that cleaned file.
# NOTE(review): the original note names the cleaned file movie_ids_mod.txt,
# but the code opens movieList.txt -- confirm which file is intended.
# The path is a raw string so that "\h", "\d" and "\m" are not parsed as
# (invalid) escape sequences, and `with` guarantees the file is closed (the
# previous `t.close` lacked parentheses and never closed it).
with open(r'E:\homework\作业8-异常检测与推荐系统\data\movieList.txt') as t:
    # Each line is "<id> <title>": strip surrounding whitespace, split on
    # spaces, drop the leading id and re-join the rest into the title.
    # The result is a list of 1682 titles.
    movie_list = [' '.join(line.strip().split(' ')[1:]) for line in t]



# Start with "not rated" (0) for all 1682 movies, stored as a column vector.
my_ratings = np.zeros((1682, 1))

# My ratings, keyed by 0-based movie index (one less than the 1-based ids
# used in the assignment text).
rated = {
    0: 4,
    97: 2,
    6: 3,
    11: 5,
    53: 4,
    63: 5,
    65: 3,
    68: 5,
    182: 4,
    225: 5,
    354: 5,
}
my_ratings[list(rated), 0] = list(rated.values())



# 6. Train the collaborative-filtering model.
# Prepend my own ratings as an extra first column (user index 0) of both the
# rating matrix and the "has been rated" mask.
y_new = np.hstack((my_ratings, y))  # (1682, 944)
R_new = np.hstack((my_ratings != 0, R))  # (1682, 944)


def normalizeRatings(y, r):
    """Subtract each row's mean rating from its rated entries.

    Args:
        y: 2-D rating matrix, one row per movie and one column per user.
        r: 2-D indicator of the same shape; non-zero (1/True) where a rating
            exists.

    Returns:
        A tuple ``(y_norm, mu)``.  ``mu`` has shape ``(len(y), 1)`` and holds
        the mean of the rated entries of each row; rows without any rating
        get a mean of 0 instead of NaN.  ``y_norm`` is ``(y - mu) * r``, so
        unrated entries stay 0.
    """
    # Sum only the rated entries so that stray values at unrated positions
    # cannot distort the mean.
    totals = np.sum(y * r, axis=1, keepdims=True, dtype=float)
    counts = np.sum(r, axis=1, keepdims=True, dtype=float)
    # Divide only where at least one rating exists; other rows keep mean 0.
    mu = np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
    y_norm = (y - mu) * r  # unrated entries remain 0
    return y_norm, mu

# Mean-normalize the extended rating matrix.
y_norm, y_mean = normalizeRatings(y_new, R_new)
new_movies, new_users = y_new.shape
new_features = 10

# Random starting point: movie features first, then user parameters, both
# flattened into a single vector as expected by the cost/gradient functions.
new_x = np.random.randn(new_movies, new_features)
new_theta = np.random.randn(new_users, new_features)
initial_parameters = np.concatenate([new_x.ravel(), new_theta.ravel()])

# Train the model.
# The optimizer is given the mean-normalized ratings (y_norm): the prediction
# step adds y_mean back, so fitting the raw ratings here would count the
# per-movie mean twice.
lmd = 10
res = opt.minimize(fun=cofiCostFunc,
                   x0=initial_parameters,
                   args=(y_norm, R_new, new_movies, new_users, new_features, lmd),
                   method='TNC',
                   jac=cofiGradient,
                   options={'maxiter': 100})

# Unpack the optimized flat vector back into the two parameter matrices
# (movie features first, then user parameters).
params = res.x
x_result = np.reshape(params[0:new_movies*new_features], (new_movies,new_features))
theta_result = np.reshape(params[new_movies*new_features:], (new_users,new_features))


# 7. Predict ratings and recommend.
# Full matrix of predicted ratings (movies x users).
p = x_result @ theta_result.T
# My predictions are column 0; keep it as a column so the per-movie mean can
# be added, then flatten to 1-D for sorting and printing.
my_predictions = (p[:, :1] + y_mean).flatten()

# Movie indices ordered from highest to lowest predicted rating.
ix = np.argsort(my_predictions)[::-1]
print('Top recommendations for you:')
for j in ix[:10]:
    print('Predicting rating %.1f for movie %s' %(my_predictions[j], movie_list[j]))

# The ratings I originally provided.
print(" ")
print('Original ratings provided:')
# my_ratings is a column vector; index its single column explicitly so the
# comparison and the %d formatting operate on scalars rather than relying on
# NumPy's deprecated implicit conversion of one-element arrays.
for i in np.flatnonzero(my_ratings[:, 0] > 0):
    print('Rated %d for %s' %(my_ratings[i, 0], movie_list[i]))