Autoencoders Meet Collaborative Filtering: Code Walkthrough

Original GitHub repository: https://github.com/bestcurry/Autorec

The code falls into three parts: data preprocessing, the AutoRec model, and the main program. In the code I have marked the points worth attention and added explanations for some of the functions used.

  1. Data preprocessing
import numpy as np

def read_rating(path, num_users, num_items, num_total_ratings, a, b, train_ratio):

    # train_ratio: fraction of the ratings used for training
    # num_total_ratings: total number of ratings in the file

    fp = open(path + "ratings.dat")

    user_train_set = set()  # set() creates an empty set
    user_test_set = set()
    item_train_set = set()
    item_test_set = set()

    R = np.zeros((num_users, num_items))  # R is the raw rating matrix
    mask_R = np.zeros((num_users, num_items))  # mask_R marks observed entries (1 where a rating exists)
    C = np.ones((num_users, num_items)) * b  # confidence matrix: b everywhere, overwritten with a at observed entries

    train_R = np.zeros((num_users, num_items))
    test_R = np.zeros((num_users, num_items))

    train_mask_R = np.zeros((num_users, num_items))
    test_mask_R = np.zeros((num_users, num_items))

    random_perm_idx = np.random.permutation(num_total_ratings)  # np.random.permutation() returns a random permutation of range(num_total_ratings)
    train_idx = random_perm_idx[0:int(num_total_ratings*train_ratio)]
    test_idx = random_perm_idx[int(num_total_ratings*train_ratio):]

    num_train_ratings = len(train_idx)
    num_test_ratings = len(test_idx)

    lines = fp.readlines()
    for line in lines:
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        R[user_idx, item_idx] = int(rating)
        mask_R[user_idx, item_idx] = 1
        C[user_idx, item_idx] = a

    ''' Train '''
    for itr in train_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        train_R[user_idx, item_idx] = int(rating)
        train_mask_R[user_idx, item_idx] = 1

        user_train_set.add(user_idx)  # record users that have at least one training rating
        item_train_set.add(item_idx)  # record items that have at least one training rating

    ''' Test '''
    for itr in test_idx:
        line = lines[itr]
        user, item, rating, _ = line.split("::")
        user_idx = int(user) - 1
        item_idx = int(item) - 1
        test_R[user_idx, item_idx] = int(rating)
        test_mask_R[user_idx, item_idx] = 1

        user_test_set.add(user_idx)
        item_test_set.add(item_idx)

    return R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings, user_train_set, item_train_set, user_test_set, item_test_set
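
read_rating expects the MovieLens 1M ratings.dat format: one rating per line, with four fields joined by "::" as UserID::MovieID::Rating::Timestamp. As a minimal sketch, parsing one line in that format works like this:

line = "1::1193::5::978300760"  # UserID::MovieID::Rating::Timestamp
user, item, rating, _ = line.split("::")
print(int(user) - 1, int(item) - 1, int(rating))  # -> 0 1192 5 (IDs shifted to 0-based indices)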

  2. The AutoRec model
import tensorflow as tf
import time
import numpy as np
import os
import math


class AutoRec:  # the autoencoder model class
    def __init__(self, sess, args,
                       num_users, num_items,
                       R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings,
                       user_train_set, item_train_set, user_test_set, item_test_set,
                       result_path ):

        self.sess = sess  # TensorFlow session
        self.args = args  # parsed command-line arguments

        self.num_users = num_users  # 用户数量
        self.num_items = num_items  # 物品数量

        self.R = R  # full rating matrix
        self.mask_R = mask_R  # observed-entry mask
        self.C = C  # confidence matrix from read_rating; stored but never used by the model
        self.train_R = train_R
        self.train_mask_R = train_mask_R
        self.test_R = test_R
        self.test_mask_R = test_mask_R
        self.num_train_ratings = num_train_ratings
        self.num_test_ratings = num_test_ratings

        self.user_train_set = user_train_set
        self.item_train_set = item_train_set
        self.user_test_set = user_test_set
        self.item_test_set = item_test_set

        self.hidden_neuron = args.hidden_neuron  # size of the hidden layer
        self.train_epoch = args.train_epoch  # number of training epochs
        self.batch_size = args.batch_size  # users per mini-batch
        self.num_batch = int(math.ceil(self.num_users / float(self.batch_size)))  # mini-batches per epoch
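        # e.g. with the ml-1m settings used in main (6040 users, batch_size 100):
        # ceil(6040 / 100) = 61 batches, the last one covering the remaining 40 users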

        self.base_lr = args.base_lr  # base learning rate
        self.optimizer_method = args.optimizer_method  # 'Adam' or 'RMSProp'
        self.display_step = args.display_step  # print the training loss every display_step epochs
        self.random_seed = args.random_seed

        self.global_step = tf.Variable(0, trainable=False)
        self.decay_epoch_step = args.decay_epoch_step
        self.decay_step = self.decay_epoch_step * self.num_batch  # decay period of the exponential schedule, in steps
        self.lr = tf.train.exponential_decay(self.base_lr, self.global_step,
                                                   self.decay_step, 0.96, staircase=True)  # exponential learning-rate decay
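        # With staircase=True the schedule is a step function:
        #   lr = base_lr * 0.96 ** (global_step // decay_step)
        # i.e. the rate drops by 4% once every decay_epoch_step epochs.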
        self.lambda_value = args.lambda_value

        self.train_cost_list = []
        self.test_cost_list = []
        self.test_rmse_list = []

        self.result_path = result_path
        self.grad_clip = args.grad_clip

    def run(self):
        self.prepare_model()
        init = tf.global_variables_initializer()
        self.sess.run(init)
        for epoch_itr in range(self.train_epoch):
            self.train_model(epoch_itr)
            self.test_model(epoch_itr)
        self.make_records()

    def prepare_model(self):
        self.input_R = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items], name="input_R")
        self.input_mask_R = tf.placeholder(dtype=tf.float32, shape=[None, self.num_items], name="input_mask_R")

        V = tf.get_variable(name="V", initializer=tf.truncated_normal(shape=[self.num_items, self.hidden_neuron],
                                         mean=0, stddev=0.03),dtype=tf.float32)
        W = tf.get_variable(name="W", initializer=tf.truncated_normal(shape=[self.hidden_neuron, self.num_items],
                                         mean=0, stddev=0.03),dtype=tf.float32)
        mu = tf.get_variable(name="mu", initializer=tf.zeros(shape=self.hidden_neuron),dtype=tf.float32)
        b = tf.get_variable(name="b", initializer=tf.zeros(shape=self.num_items), dtype=tf.float32)

        pre_Encoder = tf.matmul(self.input_R,V) + mu
        self.Encoder = tf.nn.sigmoid(pre_Encoder)
        pre_Decoder = tf.matmul(self.Encoder,W) + b
        self.Decoder = tf.identity(pre_Decoder)  # tf.identity returns a tensor equal to its input (identity output activation)

        pre_rec_cost = tf.multiply((self.input_R - self.Decoder), self.input_mask_R)  # element-wise product masks out unobserved entries
        rec_cost = tf.square(self.l2_norm(pre_rec_cost))
        pre_reg_cost = tf.square(self.l2_norm(W)) + tf.square(self.l2_norm(V))  # L2 regularization
        reg_cost = self.lambda_value * 0.5 * pre_reg_cost

        self.cost = rec_cost + reg_cost
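        # The two terms implement the AutoRec objective from the paper:
        #   min  sum_r || (r - h(r)) ⊙ mask_r ||^2  +  (lambda/2) * (||W||_F^2 + ||V||_F^2)
        # with h(r) = W · sigmoid(V·r + mu) + b.
        # Note that l2_norm returns sqrt(sum(x^2)), so tf.square(l2_norm(x)) is just
        # the sum of squared entries; the square root and the square cancel.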

        if self.optimizer_method == "Adam":
            optimizer = tf.train.AdamOptimizer(self.lr)
        elif self.optimizer_method == "RMSProp":
            optimizer = tf.train.RMSPropOptimizer(self.lr)
        else:
            raise ValueError("Optimizer Key ERROR")

        if self.grad_clip:
            gvs = optimizer.compute_gradients(self.cost)
            capped_gvs = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gvs]
            self.optimizer = optimizer.apply_gradients(capped_gvs, global_step=self.global_step)
        else:
            self.optimizer = optimizer.minimize(self.cost, global_step=self.global_step)

    # one call to train_model() runs a full pass over all training users
    def train_model(self, itr):
        start_time = time.time()
        random_perm_doc_idx = np.random.permutation(self.num_users)  # shuffle the user indices

        batch_cost = 0
        for i in range(self.num_batch):
            if i == self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size:]
            elif i < self.num_batch - 1:
                batch_set_idx = random_perm_doc_idx[i * self.batch_size : (i+1) * self.batch_size]

            _, Cost = self.sess.run(
                [self.optimizer, self.cost],
                feed_dict={self.input_R: self.train_R[batch_set_idx, :],
                           self.input_mask_R: self.train_mask_R[batch_set_idx, :]})

            batch_cost = batch_cost + Cost
        self.train_cost_list.append(batch_cost)

        if (itr+1) % self.display_step == 0:
            print ("Training //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(batch_cost),
               "Elapsed time : %d sec" % (time.time() - start_time))

    def test_model(self,itr):
        start_time = time.time()
        Cost, Decoder = self.sess.run(
            [self.cost,self.Decoder],
            feed_dict={self.input_R: self.test_R,
                       self.input_mask_R: self.test_mask_R})

        self.test_cost_list.append(Cost)

        if (itr+1) % self.display_step == 0:
            Estimated_R = Decoder.clip(min=1, max=5)  # np.clip: values above 5 become 5, values below 1 become 1
            unseen_user_test_list = list(self.user_test_set - self.user_train_set)  # users that appear in the test set but not in the training set
            unseen_item_test_list = list(self.item_test_set - self.item_train_set)  # items that appear in the test set but not in the training set

            for user in unseen_user_test_list:
                for item in unseen_item_test_list:
                    if self.test_mask_R[user, item] == 1:  # this user rated this item in the test set
                        Estimated_R[user, item] = 3  # default prediction of 3 (midpoint of the 1-5 scale) for cold-start pairs

            pre_numerator = np.multiply((Estimated_R - self.test_R), self.test_mask_R)
            numerator = np.sum(np.square(pre_numerator))
            denominator = self.num_test_ratings
            RMSE = np.sqrt(numerator / float(denominator))
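            # RMSE over observed test entries only:
            #   RMSE = sqrt( sum over test pairs of (Estimated_R - test_R)^2 / num_test_ratings )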

            self.test_rmse_list.append(RMSE)

            print ("Testing //", "Epoch %d //" % (itr), " Total cost = {:.2f}".format(Cost), " RMSE = {:.5f}".format(RMSE),
                   "Elapsed time : %d sec" % (time.time() - start_time))
            print ("=" * 100)

    def make_records(self):  # write training/testing records to disk
        if not os.path.exists(self.result_path):
            os.makedirs(self.result_path)

        basic_info = self.result_path + "basic_info.txt"
        train_record = self.result_path + "train_record.txt"
        test_record = self.result_path + "test_record.txt"

        with open (train_record,'w') as f:
            f.write(str("Cost:"))
            f.write('\t')
            for itr in range(len(self.train_cost_list)):
                f.write(str(self.train_cost_list[itr]))
                f.write('\t')
            f.write('\n')

        with open (test_record,'w') as g:
            g.write(str("Cost:"))
            g.write('\t')
            for itr in range(len(self.test_cost_list)):
                g.write(str(self.test_cost_list[itr]))
                g.write('\t')
            g.write('\n')

            g.write(str("RMSE:"))
            for itr in range(len(self.test_rmse_list)):
                g.write(str(self.test_rmse_list[itr]))
                g.write('\t')
            g.write('\n')

        with open(basic_info,'w') as h:
            h.write(str(self.args))

    def l2_norm(self, tensor):
        return tf.sqrt(tf.reduce_sum(tf.square(tensor)))

  3. Main program
from data_preprocessor import read_rating
from AutoRec import AutoRec
import tensorflow as tf
import time
import argparse
import numpy as np
current_time = time.time()

parser = argparse.ArgumentParser(description='I-AutoRec ')
parser.add_argument('--hidden_neuron', type=int, default=500)
parser.add_argument('--lambda_value', type=float, default=1)

parser.add_argument('--train_epoch', type=int, default=100)
parser.add_argument('--batch_size', type=int,default=100)

parser.add_argument('--optimizer_method', choices=['Adam','RMSProp'],default='Adam')
parser.add_argument('--grad_clip', type=bool,default=False)
parser.add_argument('--base_lr', type=float, default=1e-3)
parser.add_argument('--decay_epoch_step', type=int, default=50,help="decay the learning rate for each n epochs")

parser.add_argument('--random_seed', type=int, default=1000)
parser.add_argument('--display_step', type=int, default=1)

args = parser.parse_args()
tf.set_random_seed(args.random_seed)
np.random.seed(args.random_seed)

data_name = 'ml-1m'; num_users = 6040; num_items = 3952; num_total_ratings = 1000209; train_ratio = 0.9
path = "./data/%s" % data_name + "/"

result_path = './results/' + data_name + '/' + str(args.random_seed) + '_' + str(args.optimizer_method) + '_' + str(args.base_lr) + "_" + str(current_time)+"/"
R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R,num_train_ratings,num_test_ratings,\
user_train_set,item_train_set,user_test_set,item_test_set \
    = read_rating(path, num_users, num_items,num_total_ratings, 1, 0, train_ratio)
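# note: with a = 1 and b = 0 the confidence matrix C coincides with mask_R;
# AutoRec stores C but never uses it in the loss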

config = tf.ConfigProto()
config.gpu_options.allow_growth=True
with tf.Session(config=config) as sess:
    model = AutoRec(sess, args,
                    num_users, num_items,
                    R, mask_R, C, train_R, train_mask_R, test_R, test_mask_R, num_train_ratings, num_test_ratings,
                    user_train_set, item_train_set, user_test_set, item_test_set,
                    result_path)  # renamed from "AutoRec" to avoid shadowing the imported class
    model.run()
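
Assuming the first two listings are saved as data_preprocessor.py and AutoRec.py (file names implied by the imports above) and the main program as, say, main.py (a hypothetical name), with ratings.dat placed under ./data/ml-1m/, training can be started with, for example:

python main.py --train_epoch 100 --base_lr 1e-3 --optimizer_method Adam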

With train_epoch set to 100, the training results:
[Figure: training/testing log output, omitted]
What I learned from this code:

  1. How to use argparse. argparse is Python's built-in command-line argument parser and makes reading command-line options straightforward, as the main program above shows.
  2. How to build an autoencoder with TensorFlow.
  3. A new, simpler and more readable way of preprocessing the MovieLens data.