一、SVD
1.直接写
https://blog.csdn.net/qiqi123i/article/details/88855620
简述一下代码过程:
读取数据,划分训练集和测试集
对训练集使用SGD进行训练,得到两个矩阵P,Q,P 的大小是(num(all user),factor),Q的大小是(num(all item),factor)
遍历测试集的每一个user-item对,得到预估的评分,然后与真实评分求RMSE
(仔细看看代码,挺简单的~~~)
import pickle
import numpy as np
import matplotlib.pyplot as plt
class Funk_SVD(object):
    """
    Funk-SVD matrix factorization (popularized during the Netflix Prize).

    Learns a user-factor matrix P (USER_NUM x FACTOR) and an item-factor
    matrix Q (ITEM_NUM x FACTOR) by SGD so that P[u] . Q[i] approximates
    the observed rating r_ui.
    """

    def __init__(self, path, USER_NUM, ITEM_NUM, FACTOR):
        '''
        path- rating file, one 'user<sep>item<sep>rating<sep>timestamp' line per record
        USER_NUM- total number of users (file ids assumed to be 1..USER_NUM)
        ITEM_NUM- total number of items (file ids assumed to be 1..ITEM_NUM)
        FACTOR- number of latent factors
        '''
        super(Funk_SVD, self).__init__()
        self.path = path
        self.USER_NUM = USER_NUM
        self.ITEM_NUM = ITEM_NUM
        self.FACTOR = FACTOR
        self.init_model()

    def load_data(self, flag='train', sep='\t', random_state=0, size=0.8):
        '''
        Yield (user_idx, item_idx, rating) tuples for the requested split.

        flag- 'train' or 'test'
        sep- field separator of the data file
        random_state- seed of the random split; reusing the same seed makes
                      the train and test partitions exactly complementary
        size- fraction of records assigned to the train split
        NOTE(review): the first line of the file is always skipped as a header.
        '''
        np.random.seed(random_state)  # fixed seed -> reproducible split
        keep_train = (flag == 'train')
        with open(self.path, 'r') as f:
            for index, line in enumerate(f):
                if index == 0:
                    continue  # skip header line
                rand_num = np.random.rand()  # one draw per record, whichever split
                if (rand_num < size) == keep_train:
                    u, i, r, t = line.strip('\r\n').split(sep)
                    # ids in the file are 1-based; matrix rows are 0-based
                    yield (int(u) - 1, int(i) - 1, float(r))

    def init_model(self):
        '''Randomly initialize P and Q, scaled by 1/sqrt(FACTOR) so the
        initial dot products stay in a reasonable rating range.'''
        self.P = np.random.rand(self.USER_NUM, self.FACTOR) / (self.FACTOR ** 0.5)
        self.Q = np.random.rand(self.ITEM_NUM, self.FACTOR) / (self.FACTOR ** 0.5)

    def train(self, epochs=5, theta=1e-4, alpha=0.02, beta=0.02):
        '''
        Train P and Q with stochastic gradient descent.

        epochs- maximum number of passes over the training data
        theta- convergence threshold on the change of the epoch cost
        alpha- learning rate (decayed by 0.9 after each epoch)
        beta- L2 regularization coefficient
        '''
        old_e = 0.0
        self.cost_of_epoch = []
        for epoch in range(epochs):
            print("current epoch is {}".format(epoch))
            current_e = 0.0
            # the generator is exhausted after one pass, so rebuild it each epoch
            train_data = self.load_data(flag='train')
            for u, i, r in train_data:
                err = r - np.dot(self.P[u], self.Q[i])
                current_e += pow(err, 2)  # squared-error term
                # BUGFIX: both gradient steps must use the *pre-update* P[u];
                # the original updated P[u] first and then fed the already
                # updated row into the Q[i] gradient.
                p_u = self.P[u].copy()
                self.P[u] += alpha * (err * self.Q[i] - beta * p_u)
                self.Q[i] += alpha * (err * p_u - beta * self.Q[i])
                # regularization term of the cost
                current_e += (beta / 2) * (sum(pow(self.P[u], 2)) + sum(pow(self.Q[i], 2)))
            self.cost_of_epoch.append(current_e)
            print('cost is {}'.format(current_e))
            if abs(current_e - old_e) < theta:  # cost change small enough -> converged
                break
            old_e = current_e
            alpha *= 0.9  # learning-rate decay

    def predict_rating(self, user_id, item_id):
        '''
        Predict the rating of user user_id for item item_id.
        Both ids are 0-based (file id minus 1).
        '''
        return np.dot(self.P[user_id], self.Q[item_id])

    def recommand_list(self, user, k=10):
        '''
        Recommend the top-k unseen items for a 1-based user id.
        Returns a list of (item_id, predicted_rating), highest score first.
        '''
        user_id = user - 1
        # BUGFIX: build the seen-items lookup ONCE (the original rebuilt it for
        # every candidate item, re-reading the whole file each time) and key it
        # with the 0-based user_id (the original looked up the 1-based id,
        # which addressed the wrong user and could raise KeyError).
        seen = self.user_had_look_in_train().get(user_id, {})
        user_items = {}
        for item_id in range(self.ITEM_NUM):
            if item_id in seen:
                continue  # never recommend items already rated in training
            user_items[item_id] = self.predict_rating(user_id, item_id)
        return sorted(user_items.items(), key=lambda x: x[1], reverse=True)[:k]

    def user_had_look_in_train(self):
        '''Return {user_idx: {item_idx: rating}} over the training split.'''
        user_had_look = {}
        for u, i, r in self.load_data(flag='train'):
            user_had_look.setdefault(u, {})[i] = r
        return user_had_look

    def test_rmse(self):
        '''
        Compute RMSE over the test split.
        Returns 0.0 when the test split is empty (the original divided by zero).
        '''
        sq_sum = 0.0
        num = 0
        for u, i, r in self.load_data(flag='test'):
            num += 1
            err = r - np.dot(self.P[u], self.Q[i])
            sq_sum += err * err
        return (sq_sum / num) ** 0.5 if num else 0.0

    def show(self):
        '''Plot the training cost per epoch (requires train() to have run).'''
        nums = range(len(self.cost_of_epoch))
        plt.plot(nums, self.cost_of_epoch, label='cost value')
        plt.xlabel('# of epoch')
        plt.ylabel('cost')
        plt.legend()
        plt.show()

    def save_model(self):
        '''Pickle P and Q to funk-svd.pkl; 'with' guarantees the file is closed.'''
        with open('funk-svd.pkl', 'wb') as f:
            pickle.dump({'P': self.P, 'Q': self.Q}, f)

    def read_model(self):
        '''Restore P and Q from funk-svd.pkl.'''
        with open('funk-svd.pkl', 'rb') as f:
            model = pickle.load(f)
        self.P = model['P']
        self.Q = model['Q']
if __name__ == "__main__":
    # MovieLens-100k: 943 users, 1682 items; train a 50-factor model,
    # persist it, report test RMSE, then show recommendations for user 3.
    model = Funk_SVD('../data/ml-100k/u.data', 943, 1682, 50)
    model.train()
    model.save_model()
    print("rmse:", model.test_rmse())
    print(model.recommand_list(3))
2.调库surprise
# -*- coding: utf-8 -*-
# Library version: run SVD from scikit-surprise (any of its recommender
# algorithms could be dropped in here the same way).
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the built-in MovieLens-100k dataset (downloaded on first use).
data = Dataset.load_builtin('ml-100k')
# data.split(n_folds=3) is deprecated; cross_validate performs the folding.
algo = SVD()
# 3-fold cross-validation reporting RMSE and MAE per fold.
# BUGFIX: print_perf (and evaluate) were removed from surprise >= 1.1, which
# this file already targets; verbose=True prints the per-fold table instead.
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
print(perf)
二、SVD++
主要注意的点是:对 ∑yj 和 N(u) 的求解(N(u) 是用户 u 在训练集中评过分的物品集合)。
做法是把评分矩阵转成一个隐式反馈指示矩阵(此处原配图缺失):
凡是有评分的位置都赋值为 1,再据此统计 |N(u)| 并累加对应的 yj。
import numpy as np
import random
class SVDPP:
    """
    SVD++ rating predictor (Koren, 2008).

    r_hat(u,i) = mu + b_i + b_u + q_i^T (p_u + |N(u)|^(-1/2) * sum_{j in N(u)} y_j)
    where N(u) is the set of items user u rated in the training data.
    """

    def __init__(self, mat, K=20):
        '''
        mat- rating records, one [user_id, item_id, rating] row each
        K- number of latent factors
        '''
        self.mat = np.array(mat)
        self.K = K
        self.bi = {}   # item bias: high for well-liked items, low for poor ones
        self.bu = {}   # user bias: offset of the user's ratings from the global mean
        self.qi = {}   # item factor vectors, shape (K, 1)
        self.pu = {}   # user factor vectors, shape (K, 1)
        self.avg = np.mean(self.mat[:, 2])  # global mean rating mu
        self.y = {}        # implicit-feedback item factors y_j, shape (K, 1)
        self.u_dict = {}   # uid -> list of iids the user rated, i.e. N(u)
        for row in range(self.mat.shape[0]):
            uid = self.mat[row, 0]
            iid = self.mat[row, 1]
            self.u_dict.setdefault(uid, [])
            self.u_dict[uid].append(iid)
            self.bi.setdefault(iid, 0)
            self.bu.setdefault(uid, 0)
            self.qi.setdefault(iid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.pu.setdefault(uid, np.random.random((self.K, 1)) / 10 * np.sqrt(self.K))
            self.y.setdefault(iid, np.zeros((self.K, 1)) + .1)

    def predict(self, uid, iid):
        '''
        Predict the rating of uid for iid, clipped to the valid range [1, 5].
        Users/items never seen in training fall back to zero vectors and zero
        biases (cold start), installed here via setdefault.
        '''
        self.bi.setdefault(iid, 0)
        self.bu.setdefault(uid, 0)
        self.qi.setdefault(iid, np.zeros((self.K, 1)))
        self.pu.setdefault(uid, np.zeros((self.K, 1)))
        # BUGFIX: y is keyed by ITEM id; the original inserted a uid key here,
        # polluting the item-factor table.
        self.y.setdefault(iid, np.zeros((self.K, 1)))
        self.u_dict.setdefault(uid, [])
        u_impl_prf, sqrt_Nu = self.getY(uid, iid)
        rating = self.avg + self.bi[iid] + self.bu[uid] + np.sum(self.qi[iid] * (self.pu[uid] + u_impl_prf))
        # ratings live in [1, 5]; clip out-of-range predictions
        return min(5, max(1, rating))

    def getY(self, uid, iid):
        '''Return (|N(u)|^(-1/2) * sum_{j in N(u)} y_j, sqrt(|N(u)|)).'''
        Nu = self.u_dict[uid]
        sqrt_Nu = np.sqrt(len(Nu))
        y_sum = np.zeros((self.K, 1))
        for j in Nu:
            y_sum += self.y[j]
        # avoid 0/0 for users with no training ratings
        u_impl_prf = y_sum / sqrt_Nu if Nu else y_sum
        return u_impl_prf, sqrt_Nu

    def train(self, steps=3, gamma=0.04, Lambda=0.15):
        '''
        SGD training.
        steps- number of passes over the training data
        gamma- learning rate, decayed by factor 0.93 after each pass
        Lambda- L2 regularization coefficient
        '''
        print('train data size', self.mat.shape)
        for step in range(steps):
            print('step', step + 1, 'is running')
            # visit the samples in a fresh random order each pass
            # (permutation returns a shuffled copy, leaving self.mat untouched)
            order = np.random.permutation(self.mat.shape[0])
            rmse = 0.0
            for idx in range(self.mat.shape[0]):
                j = order[idx]
                uid = self.mat[j, 0]
                iid = self.mat[j, 1]
                rating = self.mat[j, 2]
                eui = rating - self.predict(uid, iid)
                u_impl_prf, sqrt_Nu = self.getY(uid, iid)
                rmse += eui ** 2
                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[iid] += gamma * (eui - Lambda * self.bi[iid])
                # BUGFIX: take the qi gradient w.r.t. the *pre-update* pu; the
                # original updated pu first and reused the new value for qi.
                pu_old = self.pu[uid].copy()
                self.pu[uid] += gamma * (eui * self.qi[iid] - Lambda * pu_old)
                self.qi[iid] += gamma * (eui * (pu_old + u_impl_prf) - Lambda * self.qi[iid])
                # BUGFIX: the gradient of the error w.r.t. y_j involves q_i
                # (the current item's factors), not q_j, per the SVD++ model.
                for j2 in self.u_dict[uid]:
                    self.y[j2] += gamma * (eui * self.qi[iid] / sqrt_Nu - Lambda * self.y[j2])
            gamma *= 0.93  # learning-rate decay
            print('rmse is', np.sqrt(rmse / self.mat.shape[0]))

    def test(self, test_data):
        '''
        Evaluate RMSE on test_data ([[uid, iid, rating], ...]).
        Prints and returns the RMSE (returning it is new but backward-compatible).
        '''
        test_data = np.array(test_data)
        print('test data size', test_data.shape)
        sq_err = 0.0
        for row in range(test_data.shape[0]):
            uid = test_data[row, 0]
            iid = test_data[row, 1]
            rating = test_data[row, 2]
            eui = rating - self.predict(uid, iid)
            sq_err += eui ** 2
        rmse = np.sqrt(sq_err / test_data.shape[0])
        print('rmse of test data is', rmse)
        return rmse
def getMLData():
    '''
    Read the ml-100k u1 split from disk.

    Returns (train_data, test_data), each a list of [user_id, item_id, rating]
    integer triples; records with a zero rating are dropped.
    '''
    def _read_ratings(path):
        # one 'user\titem\trating\ttimestamp' record per line; the original
        # shadowed the builtin `list`, never closed its files, and duplicated
        # this loop for each split.
        records = []
        with open(path, 'r') as f:  # 'with' guarantees the file is closed
            for line in f:
                fields = line.rstrip('\n').split('\t')
                if int(fields[2]) != 0:  # skip zero (missing) ratings
                    records.append([int(v) for v in fields[:3]])
        return records

    train_data = _read_ratings("../data/ml-100k/u1.base")
    test_data = _read_ratings("../data/ml-100k/u1.test")
    return train_data, test_data
# Driver: load the u1 split, fit SVD++ for the default 3 SGD passes,
# then report RMSE on the held-out test ratings.
train_data, test_data = getMLData()
a = SVDPP(train_data, 30)  # 30 latent factors
a.train()
a.test(test_data)
2.调库
由于每个样本都要重新计算 ∑yj 和 |N(u)|,SVD++ 的训练运行得很慢。
# Library version: SVD++ from scikit-surprise. Noticeably slower than plain
# SVD because every sample needs the implicit-feedback sum over N(u).
from surprise import SVDpp
from surprise import Dataset
from surprise.model_selection import cross_validate

# Load the built-in MovieLens-100k dataset (downloaded on first use).
data = Dataset.load_builtin('ml-100k')
# data.split(n_folds=3) is deprecated; cross_validate performs the folding.
algo = SVDpp()
# 3-fold cross-validation reporting RMSE and MAE per fold.
# BUGFIX: print_perf (and evaluate) were removed from surprise >= 1.1, which
# this file already targets; verbose=True prints the per-fold table instead.
perf = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
print(perf)