原 GitHub 代码地址:https://github.com/bestcurry/Autorec
代码主要分为三部分:数据预处理部分、AutoRec 模型部分以及主程序 main 部分。我在代码中标注出了需要注意的地方,并对部分代码与函数做了解释。
- 数据预处理部分(data_preprocessor.py)
import numpy as np
def _scatter_ratings(records, indices, rating_matrix, mask_matrix, user_set, item_set):
    """Write the selected rating records into a matrix, its mask and the id sets."""
    for idx in indices:
        user_idx, item_idx, rating = records[idx]
        rating_matrix[user_idx, item_idx] = rating
        mask_matrix[user_idx, item_idx] = 1
        user_set.add(user_idx)  # users that have at least one rating in this split
        item_set.add(item_idx)  # items that have at least one rating in this split


def read_rating(path, num_users, num_items, num_total_ratings, a, b, train_ratio):
    """Load ``ratings.dat`` and split its ratings randomly into train and test.

    Every non-blank line must look like ``user::item::rating::timestamp`` with
    1-based integer user and item ids and an integer rating.

    Args:
        path: Directory prefix. ``"ratings.dat"`` is appended verbatim, so the
            prefix must already end with a path separator.
        num_users: Number of rows of every returned matrix.
        num_items: Number of columns of every returned matrix.
        num_total_ratings: Number of rating records to shuffle and split;
            expected to equal the number of records in the file.
        a: Value stored in ``C`` at every observed (user, item) position.
        b: Value stored in ``C`` at every unobserved position.
        train_ratio: Fraction of the shuffled records assigned to training.

    Returns:
        A 13-tuple ``(R, mask_R, C, train_R, train_mask_R, test_R,
        test_mask_R, num_train_ratings, num_test_ratings, user_train_set,
        item_train_set, user_test_set, item_test_set)``. ``R`` holds all
        ratings, the ``mask`` matrices hold 1 where a rating is present, and
        the sets hold the 0-based user/item indices seen in each split.

    Raises:
        ValueError: If a non-blank line does not have four ``::`` fields or a
            field is not an integer.
        IndexError: If the file holds fewer than ``num_total_ratings`` records
            or an id falls outside the matrix shape.
    """
    # Parse the file exactly once and close it deterministically.
    records = []
    with open(path + "ratings.dat") as fp:
        for line in fp:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline
            user, item, rating, _ = line.split("::")
            records.append((int(user) - 1, int(item) - 1, int(rating)))

    shape = (num_users, num_items)
    R = np.zeros(shape)          # full rating matrix
    mask_R = np.zeros(shape)     # 1 where a rating is observed
    C = np.ones(shape) * b       # b everywhere, overwritten with a where observed
    train_R = np.zeros(shape)
    test_R = np.zeros(shape)
    train_mask_R = np.zeros(shape)
    test_mask_R = np.zeros(shape)

    user_train_set = set()
    user_test_set = set()
    item_train_set = set()
    item_test_set = set()

    # Shuffle record indices, then cut the permutation into train / test parts.
    random_perm_idx = np.random.permutation(num_total_ratings)
    split_at = int(num_total_ratings * train_ratio)
    train_idx = random_perm_idx[:split_at]
    test_idx = random_perm_idx[split_at:]
    num_train_ratings = len(train_idx)
    num_test_ratings = len(test_idx)

    for user_idx, item_idx, rating in records:
        R[user_idx, item_idx] = rating
        mask_R[user_idx, item_idx] = 1
        C[user_idx, item_idx] = a

    _scatter_ratings(records, train_idx, train_R, train_mask_R,
                     user_train_set, item_train_set)
    _scatter_ratings(records, test_idx, test_R, test_mask_R,
                     user_test_set, item_test_set)

    return (R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R,
            num_train_ratings, num_test_ratings,
            user_train_set, item_train_set, user_test_set, item_test_set)
- AutoRec 模型部分(AutoRec.py)
import tensorflow as tf
import time
import numpy as np
import os
import math
class AutoRec:
    """AutoRec rating autoencoder built on the TensorFlow 1.x graph API.

    Each input row is one user's rating vector over all items. The network
    encodes it with a sigmoid hidden layer and decodes it linearly; only
    positions flagged in the mask contribute to the reconstruction cost.
    """

    def __init__(self, sess, args,
                 num_users, num_items,
                 R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R,
                 num_train_ratings, num_test_ratings,
                 user_train_set, item_train_set, user_test_set, item_test_set,
                 result_path):
        """Store the data split and hyper-parameters and set up the LR schedule.

        Args:
            sess: TensorFlow session used to run every graph operation.
            args: Parsed options; the attributes read here are hidden_neuron,
                train_epoch, batch_size, base_lr, optimizer_method,
                display_step, random_seed, decay_epoch_step, lambda_value
                and grad_clip.
            num_users: Number of users (rows of the rating matrices).
            num_items: Number of items (columns of the rating matrices).
            R, mask_R, C: Full rating matrix, its mask, and the matrix C;
                they are stored but not read by the methods of this class.
            train_R, train_mask_R: Training ratings and their mask.
            test_R, test_mask_R: Test ratings and their mask.
            num_train_ratings, num_test_ratings: Rating counts per split.
            user_train_set, item_train_set: User/item indices seen in training.
            user_test_set, item_test_set: User/item indices seen in testing.
            result_path: Directory into which ``make_records`` writes files.
        """
        self.sess = sess
        self.args = args

        self.num_users = num_users
        self.num_items = num_items

        self.R = R
        self.mask_R = mask_R
        self.C = C
        self.train_R = train_R
        self.train_mask_R = train_mask_R
        self.test_R = test_R
        self.test_mask_R = test_mask_R
        self.num_train_ratings = num_train_ratings
        self.num_test_ratings = num_test_ratings

        self.user_train_set = user_train_set
        self.item_train_set = item_train_set
        self.user_test_set = user_test_set
        self.item_test_set = item_test_set

        self.hidden_neuron = args.hidden_neuron
        self.train_epoch = args.train_epoch
        self.batch_size = args.batch_size
        # Batches are taken over users, so round the user count up.
        self.num_batch = int(math.ceil(self.num_users / float(self.batch_size)))

        self.base_lr = args.base_lr
        self.optimizer_method = args.optimizer_method
        self.display_step = args.display_step  # print every display_step epochs
        self.random_seed = args.random_seed

        # The learning rate is multiplied by 0.96 once every decay_epoch_step
        # epochs (staircase decay counted in optimizer steps).
        self.global_step = tf.Variable(0, trainable=False)
        self.decay_epoch_step = args.decay_epoch_step
        self.decay_step = self.decay_epoch_step * self.num_batch
        self.lr = tf.train.exponential_decay(self.base_lr, self.global_step,
                                             self.decay_step, 0.96, staircase=True)
        self.lambda_value = args.lambda_value

        self.train_cost_list = []
        self.test_cost_list = []
        self.test_rmse_list = []

        self.result_path = result_path
        self.grad_clip = args.grad_clip

    def run(self):
        """Build the graph, train and evaluate every epoch, then write records."""
        self.prepare_model()
        init = tf.global_variables_initializer()
        self.sess.run(init)
        for epoch_itr in range(self.train_epoch):
            self.train_model(epoch_itr)
            self.test_model(epoch_itr)
        self.make_records()

    def prepare_model(self):
        """Create placeholders, weights, the cost and the training operation.

        Raises:
            ValueError: If ``optimizer_method`` is neither "Adam" nor "RMSProp".
        """
        self.input_R = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items], name="input_R")
        self.input_mask_R = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items], name="input_mask_R")

        # Encoder weights V / bias mu and decoder weights W / bias b.
        V = tf.get_variable(name="V", initializer=tf.truncated_normal(shape=[self.num_items, self.hidden_neuron],
                                                                     mean=0, stddev=0.03), dtype=tf.float32)
        W = tf.get_variable(name="W", initializer=tf.truncated_normal(shape=[self.hidden_neuron, self.num_items],
                                                                     mean=0, stddev=0.03), dtype=tf.float32)
        mu = tf.get_variable(name="mu", initializer=tf.zeros(shape=self.hidden_neuron), dtype=tf.float32)
        b = tf.get_variable(name="b", initializer=tf.zeros(shape=self.num_items), dtype=tf.float32)

        pre_Encoder = tf.matmul(self.input_R, V) + mu
        self.Encoder = tf.nn.sigmoid(pre_Encoder)
        pre_Decoder = tf.matmul(self.Encoder, W) + b
        self.Decoder = tf.identity(pre_Decoder)  # linear output layer

        # Element-wise mask: only flagged positions contribute to the error.
        pre_rec_cost = tf.multiply((self.input_R - self.Decoder), self.input_mask_R)
        # NOTE(review): square(sqrt(sum)) equals the plain sum of squares, but
        # the sqrt may give an undefined gradient at exactly zero - confirm.
        rec_cost = tf.square(self.l2_norm(pre_rec_cost))
        # L2 regularisation on both weight matrices (biases are not penalised).
        pre_reg_cost = tf.square(self.l2_norm(W)) + tf.square(self.l2_norm(V))
        reg_cost = self.lambda_value * 0.5 * pre_reg_cost

        self.cost = rec_cost + reg_cost

        if self.optimizer_method == "Adam":
            optimizer = tf.train.AdamOptimizer(self.lr)
        elif self.optimizer_method == "RMSProp":
            optimizer = tf.train.RMSPropOptimizer(self.lr)
        else:
            raise ValueError("Optimizer Key ERROR")

        if self.grad_clip:
            # Clip every gradient element to [-5, 5] before applying it.
            gvs = optimizer.compute_gradients(self.cost)
            capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gvs]
            self.optimizer = optimizer.apply_gradients(capped_gvs, global_step=self.global_step)
        else:
            self.optimizer = optimizer.minimize(self.cost, global_step=self.global_step)

    def train_model(self, itr):
        """Run one epoch: one pass over all users in shuffled mini-batches.

        Args:
            itr: Zero-based epoch index, used only for the progress printout.
        """
        start_time = time.time()
        random_perm_doc_idx = np.random.permutation(self.num_users)  # shuffled user order

        batch_cost = 0
        for i in range(self.num_batch):
            # Slicing past the end is safe, so the final (possibly shorter)
            # batch needs no special case.
            batch_set_idx = random_perm_doc_idx[i * self.batch_size:(i + 1) * self.batch_size]

            _, Cost = self.sess.run(
                [self.optimizer, self.cost],
                feed_dict={self.input_R: self.train_R[batch_set_idx, :],
                           self.input_mask_R: self.train_mask_R[batch_set_idx, :]})

            batch_cost = batch_cost + Cost
        self.train_cost_list.append(batch_cost)

        if (itr + 1) % self.display_step == 0:
            print("Training //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(batch_cost),
                  "Elapsed time : %d sec" % (time.time() - start_time))

    def test_model(self, itr):
        """Evaluate on the test split; on display epochs also compute the RMSE.

        Args:
            itr: Zero-based epoch index; the RMSE is computed and printed only
                when ``(itr + 1)`` is a multiple of ``display_step``.
        """
        start_time = time.time()
        Cost, Decoder = self.sess.run(
            [self.cost, self.Decoder],
            feed_dict={self.input_R: self.test_R,
                       self.input_mask_R: self.test_mask_R})

        self.test_cost_list.append(Cost)

        if (itr + 1) % self.display_step == 0:
            Estimated_R = Decoder.clip(min=1, max=5)  # keep predictions in [1, 5]

            # Users / items that occur in the test split but never in training.
            unseen_user_test_list = list(self.user_test_set - self.user_train_set)
            unseen_item_test_list = list(self.item_test_set - self.item_train_set)

            for user in unseen_user_test_list:
                for item in unseen_item_test_list:
                    if self.test_mask_R[user, item] == 1:  # this pair has a test rating
                        Estimated_R[user, item] = 3  # default prediction for unseen pairs

            pre_numerator = np.multiply((Estimated_R - self.test_R), self.test_mask_R)
            numerator = np.sum(np.square(pre_numerator))
            denominator = self.num_test_ratings
            RMSE = np.sqrt(numerator / float(denominator))

            self.test_rmse_list.append(RMSE)

            print("Testing //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(Cost), " RMSE = {:.5f}".format(RMSE),
                  "Elapsed time : %d sec" % (time.time() - start_time))
            print("=" * 100)

    def make_records(self):
        """Write the cost / RMSE history and the run arguments to ``result_path``.

        Creates the directory if needed and writes ``train_record.txt``,
        ``test_record.txt`` and ``basic_info.txt`` inside it. Values on a line
        are tab-terminated.
        """
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.result_path, exist_ok=True)

        # os.path.join keeps the files inside the directory even when
        # result_path has no trailing separator.
        basic_info = os.path.join(self.result_path, "basic_info.txt")
        train_record = os.path.join(self.result_path, "train_record.txt")
        test_record = os.path.join(self.result_path, "test_record.txt")

        with open(train_record, 'w') as f:
            f.write("Cost:\t")
            f.write("".join(str(cost) + '\t' for cost in self.train_cost_list))
            f.write('\n')

        with open(test_record, 'w') as g:
            g.write("Cost:\t")
            g.write("".join(str(cost) + '\t' for cost in self.test_cost_list))
            g.write('\n')

            g.write("RMSE:")
            g.write("".join(str(rmse) + '\t' for rmse in self.test_rmse_list))
            g.write('\n')

        with open(basic_info, 'w') as h:
            h.write(str(self.args))

    def l2_norm(self, tensor):
        """Return the L2 (Frobenius) norm of ``tensor`` as a scalar tensor."""
        return tf.sqrt(tf.reduce_sum(tf.square(tensor)))
- 主程序 main 部分
from data_preprocessor import read_rating
from AutoRec import AutoRec
import tensorflow as tf
import time
import argparse
import numpy as np
def _str2bool(value):
    """Convert a command-line token such as ``true`` / ``False`` / ``1`` to bool.

    ``type=bool`` must not be used with argparse: it calls ``bool()`` on the
    raw string, so every non-empty value - including "False" - becomes True.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    if lowered in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % value)


# Timestamp that makes the result directory of every run unique.
current_time = time.time()

parser = argparse.ArgumentParser(description='I-AutoRec ')
parser.add_argument('--hidden_neuron', type=int, default=500)
parser.add_argument('--lambda_value', type=float, default=1)

parser.add_argument('--train_epoch', type=int, default=100)
parser.add_argument('--batch_size', type=int, default=100)

parser.add_argument('--optimizer_method', choices=['Adam', 'RMSProp'], default='Adam')
parser.add_argument('--grad_clip', type=_str2bool, default=False)
parser.add_argument('--base_lr', type=float, default=1e-3)
parser.add_argument('--decay_epoch_step', type=int, default=50, help="decay the learning rate for each n epochs")

parser.add_argument('--random_seed', type=int, default=1000)
parser.add_argument('--display_step', type=int, default=1)

args = parser.parse_args()

# Seed both TensorFlow and NumPy so the split and the initial weights repeat.
tf.set_random_seed(args.random_seed)
np.random.seed(args.random_seed)

# Dataset constants for MovieLens-1M.
data_name = 'ml-1m'
num_users = 6040
num_items = 3952
num_total_ratings = 1000209
train_ratio = 0.9

path = "./data/%s" % data_name + "/"
result_path = './results/' + data_name + '/' + str(args.random_seed) + '_' + str(args.optimizer_method) + '_' + str(args.base_lr) + "_" + str(current_time) + "/"

# a=1, b=0: C is 1 at observed positions and 0 elsewhere.
R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings, \
    user_train_set, item_train_set, user_test_set, item_test_set \
    = read_rating(path, num_users, num_items, num_total_ratings, 1, 0, train_ratio)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # allocate GPU memory on demand
with tf.Session(config=config) as sess:
    # Use a separate name so the AutoRec class itself is not shadowed.
    model = AutoRec(sess, args,
                    num_users, num_items,
                    R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings,
                    user_train_set, item_train_set, user_test_set, item_test_set,
                    result_path)
    model.run()
设置epoch为100,训练结果:
代码学习总结:
- argparse 的使用。argparse 是 Python 自带的命令行参数解析包,可以用来方便地读取命令行参数,用法比较简单。
- 使用 TensorFlow 搭建自编码器。
- 新的处理 MovieLens 数据集的代码,更加简单易读。