Recommendation Algorithm Study 2: Hybrid Personalized Recommendation on MovieLens with MXNet (Continued) - Adding CNN Text Processing

Building on the previous recommendation-system article, this post uses MXNet's CNN to extract features from movie titles. The titles are preprocessed into fixed-length word-index vectors so that convolution can be applied to them. Over several epochs of training the model's RMSE decreases steadily, indicating that the CNN-based title processing improves the recommendation results.

Strictly speaking, the network definition in the previous article did not follow the paddle2 example exactly. This article builds on the previous one by adding a CNN to extract features from the movie titles. The CNN text processing follows the MXNet tutorial "Text Classification Using a Convolutional Neural Network on MXNet" and the paper "Convolutional Neural Networks for Sentence Classification".


The additions are as follows:

1. Title preprocessing: build a vocabulary from the words appearing in the titles, and pad every title to the maximum length so the data becomes fixed-width, much like images of identical size, which lets us convolve kernels of different sizes over it. (A short usage sketch follows the code below.)

from collections import Counter
import itertools

import pandas as pd

def pad_sentences(sentences, padding_word="</s>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences

def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]

def buildTitles(sentences):
    """
    Maps each title to a fixed-length vector of word indices based on the vocabulary.
    Returns the encoded titles (as a pandas Series) and the vocabulary size.
    """
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    vocab_size = len(vocabulary)
    x = pd.Series([[vocabulary[word] for word in sentence] for sentence in sentences_padded])
    return (x, vocab_size)
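
As a quick sanity check, here is a minimal usage sketch of buildTitles. The sample titles are made up for illustration, and the exact word indices depend on word frequencies in the real data:

sample_titles = [s.split(" ") for s in ["Toy Story (1995)", "Heat (1995)"]]
title_arr, vocab_size = buildTitles(sample_titles)
# Every encoded title has the same length (that of the longest title), so together
# they form a fixed-width matrix that convolution kernels can slide over.
print title_arr[0]       # e.g. [1, 2, 0]  -- word indices of the longest title
print title_arr[1]       # e.g. [3, 0, 4]  -- shorter title, padded with the index of "</s>"
print len(title_arr[0])  # sentence_size, identical for every title
print vocab_size         # number of distinct words, including the padding token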
2. Network definition

def get_one_layer_mlp( max_userid,  max_itemid,max_gender,max_age, max_job,max_title,sentence_size, k,batch_size):
    # user profile
    userid = mx.symbol.Variable('userid')
    gender = mx.symbol.Variable('gender')
    age = mx.symbol.Variable('age')
    job = mx.symbol.Variable('job')

    # item profile
    itemid = mx.symbol.Variable('itemid')
    title = mx.symbol.Variable('title')
    cat = mx.symbol.Variable('cat')


    score = mx.symbol.Variable('score')
    # user latent features
    userid = mx.symbol.Embedding(data = userid, input_dim = max_userid, output_dim = k/2,name='userid_Embedding')
    userid = mx.symbol.FullyConnected(data = userid, num_hidden = k/2)

    gender = mx.symbol.Embedding(data = gender, input_dim = max_gender, output_dim = k/4,name='gender_Embedding')
    gender = mx.symbol.FullyConnected(data = gender, num_hidden = k/4)

    age = mx.symbol.Embedding(data = age, input_dim = max_age, output_dim = k/2,name='age_Embedding')
    age = mx.symbol.FullyConnected(data =age, num_hidden =  k/2)

    job = mx.symbol.Embedding(data = job, input_dim = max_job, output_dim = k/2,name='job_Embedding')
    job = mx.symbol.FullyConnected(data =job, num_hidden = k/2)

    user =  mx.symbol.concat(userid,gender,age,job,dim=1)
    user = mx.symbol.FullyConnected(data =user, num_hidden = k)
    user = mx.symbol.Activation(data = user, act_type="relu")

    # item latent features
    itemid = mx.symbol.Embedding(data = itemid, input_dim = max_itemid, output_dim = k/2,name='itemid_Embedding')
    itemid = mx.symbol.FullyConnected(data = itemid, num_hidden = k/2)

    # item title features
    num_embed = k/2
    embed_layer = mx.symbol.Embedding(data = title, input_dim = max_title, output_dim = num_embed,name='title_Embedding')
    conv_input = mx.symbol.Reshape(data= embed_layer,target_shape = (batch_size,1,sentence_size,num_embed))
    # create convolution + (max) pooling layer for each filter operation
    filter_list=[3,4,5] # the size of filters to use [3, 4, 5]
    num_filter=50
    pooled_outputs = []
    for i, filter_size in enumerate(filter_list):
        convi = mx.symbol.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
        relui = mx.symbol.Activation(data=convi, act_type='relu')
        pooli = mx.symbol.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1,1))
        pooled_outputs.append(pooli)

    # combine all pooled outputs
    total_filters = num_filter * len(filter_list)
    concat = mx.symbol.Concat(*pooled_outputs, dim=1)

    # reshape for next layer
    h_pool = mx.symbol.Reshape(data=concat, target_shape=(batch_size, total_filters))
    h_drop = mx.sym.Dropout(data=h_pool, p=0.5)

    # project the pooled CNN features to a fixed-length title vector
    title = mx.sym.FullyConnected(data=h_drop, num_hidden = k/2,name='title_Fc')


    #categories latent features
    cat = mx.symbol.FullyConnected(data = cat, num_hidden = k/2,name='cat_Fc')

    # concatenate all item features
    item = mx.symbol.concat(itemid,title,cat,dim=1)
    item = mx.symbol.FullyConnected(data =item, num_hidden = k)
    item = mx.symbol.Activation(data = item, act_type="relu")

    pred = calc_cos_sim(user,item,1,5)
    pred = mx.symbol.Flatten(data = pred)
    # loss layer
    pred = mx.symbol.LinearRegressionOutput(data = pred, label = score)
    return pred
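
calc_cos_sim is the scoring helper carried over from the previous article and is not repeated here. For context, a minimal sketch of what it computes (cosine similarity of the two latent vectors, rescaled to the 1-5 rating range) might look like the following; the actual implementation in the previous article may differ:

def calc_cos_sim(user, item, low, high):
    # Sketch only -- the real helper is defined in the previous article.
    # Cosine similarity between the user and item latent vectors, shape (batch, 1).
    dot = mx.symbol.sum(user * item, axis=1, keepdims=True)
    user_norm = mx.symbol.sqrt(mx.symbol.sum(user * user, axis=1, keepdims=True))
    item_norm = mx.symbol.sqrt(mx.symbol.sum(item * item, axis=1, keepdims=True))
    cos = dot / (user_norm * item_norm + 1e-8)
    # Rescale from [-1, 1] to the rating range [low, high].
    return (cos + 1.0) / 2.0 * (high - low) + low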

3. Training

def trainingModel():
    TRAIN_DIR = 'C:/Users/chuanxie/PycharmProjects/mxnetlearn/data/movie/'
    ratingdf = LoadRatingData(TRAIN_DIR+'ml-1m/ratings.dat',delimiter='\t')
    userdf = LoadUserData(TRAIN_DIR+'ml-1m/users.dat',delimiter='\t')
    itemdf = LoadItemData(TRAIN_DIR+'ml-1m/movies.dat',delimiter='\t')
    np_encodedcat = encodeTag(itemdf)
    print 'ratingdf.shape:',ratingdf.shape
    print 'np_encodedcat.shape:'  ,np_encodedcat.shape
    fulldf = ratingdf.join(userdf,on='userid').join(itemdf,on='itemid').join(np_encodedcat,on='itemid')

    '''titles'''
    titlematrix = fulldf['title'].as_matrix()
    titles = [s.split(" ") for s in titlematrix]
    title_arr,vocab_size = buildTitles(titles)
    sentence_size = len(title_arr[0])
    print 'title_arr.shape:',title_arr.shape
    '''expand the encoded category Series into a 2-D array'''
    matrix_encoded_cat = fulldf['encoded_cat'].as_matrix()
    df_encoded_cat = np.array(matrix_encoded_cat.tolist())
    print 'df_encoded_cat.shape:',df_encoded_cat.shape

    data = np.array([fulldf['userid'],fulldf['gender'],fulldf['age'],fulldf['job'],fulldf['itemid'],title_arr,fulldf['encoded_cat']])
    print 'train data shape:',data.shape
    label = np.array([fulldf['score']])

    context = mx.gpu()
    BATCH_SIZE = 400
    num_epoch = 200
    trainIter = CustDataIter2(['userid', 'gender','age','job','itemid','title','cat'],data,
                             [(BATCH_SIZE,),(BATCH_SIZE,),(BATCH_SIZE,),(BATCH_SIZE,),(BATCH_SIZE,),(BATCH_SIZE,sentence_size),(BATCH_SIZE,df_encoded_cat.shape[1])],
                             ['score'],label,[(BATCH_SIZE,)],context,BATCH_SIZE,data.shape[1]/BATCH_SIZE)

    max_userid = pd.Series(fulldf['userid']).max()
    max_itemid = pd.Series(fulldf['itemid']).max()
    max_gender = pd.Series(fulldf['gender']).max()
    max_age = pd.Series(fulldf['age']).max()
    max_job = pd.Series(fulldf['job']).max()
    max_title = vocab_size

    net =get_one_layer_mlp( max_userid=max_userid,  max_itemid=max_itemid,max_gender=max_gender,
                            max_age = max_age , max_job = max_job,max_title=max_title,sentence_size=sentence_size, k=96,batch_size=BATCH_SIZE)
    mx.viz.plot_network(net,shape={'userid':(BATCH_SIZE,),'gender':(BATCH_SIZE,),'age':(BATCH_SIZE,),'job':(BATCH_SIZE,),'itemid':(BATCH_SIZE,),'title':(BATCH_SIZE,sentence_size),'cat':(BATCH_SIZE,73)}).view()


    ##Train module
    train(net,trainIter,None,context,num_epoch=num_epoch,learning = 'rmsprop',learning_rate=0.001)
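
The train helper and the CustDataIter2 iterator are both reused from the previous article. As a rough sketch, assuming train simply wraps MXNet's Module API (which is what produces the Train-rmse log lines below), it would look roughly like this:

def train(net, train_iter, val_iter, ctx, num_epoch, learning, learning_rate):
    # Sketch only -- the actual helper comes from the previous article.
    mod = mx.mod.Module(symbol=net,
                        data_names=['userid', 'gender', 'age', 'job', 'itemid', 'title', 'cat'],
                        label_names=['score'],
                        context=ctx)
    mod.fit(train_iter,
            eval_data=val_iter,
            eval_metric='rmse',          # matches the Train-rmse lines in the log
            optimizer=learning,          # 'rmsprop' in the call above
            optimizer_params={'learning_rate': learning_rate},
            num_epoch=num_epoch)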

After adding the CNN, both GPU memory consumption and computation increased dramatically. Before the CNN I used a batch size of 10K; with the CNN the batch size had to be dropped to 500. My graphics card has only 1 GB of memory, and with a larger batch size the screen simply went black... The time for one training epoch also grew from under 10 seconds to more than 2 minutes.

INFO:root:Epoch[0] Train-rmse=0.975520
INFO:root:Epoch[0] Time cost=136.348
INFO:root:Epoch[1] Train-rmse=0.933322
INFO:root:Epoch[1] Time cost=134.050
INFO:root:Epoch[2] Train-rmse=0.912823
INFO:root:Epoch[2] Time cost=133.927
INFO:root:Epoch[3] Train-rmse=0.899103
INFO:root:Epoch[3] Time cost=133.942
INFO:root:Epoch[4] Train-rmse=0.893966
INFO:root:Epoch[4] Time cost=134.107
