上一篇文章的网络定义严格来说并没有完全按照paddle2给的那个例子来。本篇文章在上一篇的基础上加入了CNN对电影title的特征提取。CNN对文本的处理参考的是MXNet的教程:Text Classification Using a Convolutional Neural Network on MXNet 和论文:Convolutional Neural Networks for Sentence Classification
增加的部分如下:
1. title的预处理:主要是用title中的单词建立字典,并把所有title补齐(pad)到最大长度,变成定宽的数据,类似于一张张大小相同的图片,这样才能用不同size的卷积核对其进行卷积。
def pad_sentences(sentences, padding_word="</s>"):
    """
    Pad all sentences to the same length.

    The target length is the length of the longest sentence.

    Args:
        sentences: sized sequence of token lists.
        padding_word: token appended to sentences shorter than the longest.

    Returns:
        A new list of new token lists, all of equal length. The input lists
        are not modified. An empty input yields an empty list.
    """
    # Explicit length test (instead of truthiness) so that array-like inputs
    # work too; max() over an empty sequence would raise ValueError.
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]
def build_vocab(sentences):
    """
    Build a vocabulary from tokenized sentences.

    Args:
        sentences: iterable of token iterables.

    Returns:
        A two-element list ``[vocabulary, vocabulary_inv]`` where
        ``vocabulary_inv`` is the list of distinct tokens in the order given
        by ``Counter.most_common()`` (most frequent first) and ``vocabulary``
        maps each token to its position in ``vocabulary_inv``.
    """
    # chain.from_iterable streams the tokens instead of unpacking every
    # sentence as a separate positional argument (one per rating row here).
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # Mapping from index to word
    vocabulary_inv = [word for word, _ in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {word: index for index, word in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
def buildTitles(sentences):
    """
    Map tokenized titles to equal-length lists of vocabulary indices.

    The sentences are padded with pad_sentences, a vocabulary is built from
    the padded sentences with build_vocab, and every token is replaced by its
    vocabulary index.

    Args:
        sentences: list of token lists, one per title.

    Returns:
        A tuple ``(x, vocab_size)``: ``x`` is a pandas Series holding one list
        of indices per sentence, ``vocab_size`` is the number of entries in
        the vocabulary built from the padded sentences.
    """
    padded = pad_sentences(sentences)
    word_to_index, _ = build_vocab(padded)
    encoded = []
    for sentence in padded:
        encoded.append([word_to_index[word] for word in sentence])
    return (pd.Series(encoded), len(word_to_index))
2. 网络定义
def get_one_layer_mlp(max_userid, max_itemid, max_gender, max_age, max_job, max_title, sentence_size, k, batch_size,
                      filter_list=(3, 4, 5), num_filter=50, dropout=0.5):
    """
    Build the rating-prediction symbol: a user tower, an item tower with a
    CNN over the title tokens, and a similarity-based regression output.

    Args:
        max_userid, max_itemid, max_gender, max_age, max_job, max_title:
            passed as ``input_dim`` to the corresponding Embedding layers.
            NOTE(review): MXNet requires every index to be in
            ``[0, input_dim)``, so these must be strictly greater than the
            largest index fed in -- confirm against the caller.
        sentence_size: number of tokens in every (padded) title.
        k: width of the final user/item feature vectors; sub-layers use
            ``k // 2`` and ``k // 4``.
        batch_size: batch size baked into the two Reshape operations.
        filter_list: convolution window heights (in tokens), one branch each.
        num_filter: number of filters per convolution branch.
        dropout: dropout probability applied to the pooled title features.

    Returns:
        The ``LinearRegressionOutput`` symbol with label variable ``score``.

    Raises:
        ValueError: if ``sentence_size`` is smaller than the largest filter,
            which would give the convolution/pooling a non-positive extent.
    """
    if sentence_size < max(filter_list):
        raise ValueError(
            'sentence_size (%d) must be >= the largest filter size (%d)'
            % (sentence_size, max(filter_list)))

    # Floor division keeps the layer sizes integral regardless of whether
    # true division is in effect (Python 3 / `from __future__ import division`).
    half_k = k // 2
    quarter_k = k // 4

    # user profile
    userid = mx.symbol.Variable('userid')
    gender = mx.symbol.Variable('gender')
    age = mx.symbol.Variable('age')
    job = mx.symbol.Variable('job')
    # item profile
    itemid = mx.symbol.Variable('itemid')
    title = mx.symbol.Variable('title')
    cat = mx.symbol.Variable('cat')
    score = mx.symbol.Variable('score')

    # NOTE: the creation order of the unnamed layers below is kept as in the
    # original so that auto-generated parameter names stay the same.

    # user latent features
    userid = mx.symbol.Embedding(data=userid, input_dim=max_userid, output_dim=half_k, name='userid_Embedding')
    userid = mx.symbol.FullyConnected(data=userid, num_hidden=half_k)
    gender = mx.symbol.Embedding(data=gender, input_dim=max_gender, output_dim=quarter_k, name='gender_Embedding')
    gender = mx.symbol.FullyConnected(data=gender, num_hidden=quarter_k)
    age = mx.symbol.Embedding(data=age, input_dim=max_age, output_dim=half_k, name='age_Embedding')
    age = mx.symbol.FullyConnected(data=age, num_hidden=half_k)
    job = mx.symbol.Embedding(data=job, input_dim=max_job, output_dim=half_k, name='job_Embedding')
    job = mx.symbol.FullyConnected(data=job, num_hidden=half_k)
    user = mx.symbol.concat(userid, gender, age, job, dim=1)
    user = mx.symbol.FullyConnected(data=user, num_hidden=k)
    user = mx.symbol.Activation(data=user, act_type="relu")

    # item latent features
    itemid = mx.symbol.Embedding(data=itemid, input_dim=max_itemid, output_dim=half_k, name='itemid_Embedding')
    itemid = mx.symbol.FullyConnected(data=itemid, num_hidden=half_k)

    # item title features: embed each token, then treat the title as a
    # one-channel (sentence_size x num_embed) "image" for 2-D convolution.
    num_embed = half_k
    embed_layer = mx.symbol.Embedding(data=title, input_dim=max_title, output_dim=num_embed, name='title_Embedding')
    # `shape` replaces the deprecated `target_shape` argument of Reshape.
    conv_input = mx.symbol.Reshape(data=embed_layer, shape=(batch_size, 1, sentence_size, num_embed))

    # create convolution + (max) pooling layer for each filter operation
    pooled_outputs = []
    for filter_size in filter_list:
        # The kernel spans the full embedding width, so it only slides along
        # the token axis.
        convi = mx.symbol.Convolution(data=conv_input, kernel=(filter_size, num_embed), num_filter=num_filter)
        relui = mx.symbol.Activation(data=convi, act_type='relu')
        # Pool over every position the filter produced (max-over-time).
        pooli = mx.symbol.Pooling(data=relui, pool_type='max', kernel=(sentence_size - filter_size + 1, 1), stride=(1, 1))
        pooled_outputs.append(pooli)

    # combine all pooled outputs
    total_filters = num_filter * len(filter_list)
    concat = mx.symbol.Concat(*pooled_outputs, dim=1)
    # reshape for next layer
    h_pool = mx.symbol.Reshape(data=concat, shape=(batch_size, total_filters))
    h_drop = mx.sym.Dropout(data=h_pool, p=dropout)
    # fix length of output
    title = mx.sym.FullyConnected(data=h_drop, num_hidden=half_k, name='title_Fc')

    # categories latent features
    cat = mx.symbol.FullyConnected(data=cat, num_hidden=half_k, name='cat_Fc')

    # concatenate all item features
    item = mx.symbol.concat(itemid, title, cat, dim=1)
    item = mx.symbol.FullyConnected(data=item, num_hidden=k)
    item = mx.symbol.Activation(data=item, act_type="relu")

    # calc_cos_sim is defined elsewhere; presumably it scales the user/item
    # cosine similarity into the rating range given by (1, 5) -- TODO confirm.
    pred = calc_cos_sim(user, item, 1, 5)
    pred = mx.symbol.Flatten(data=pred)
    # loss layer
    pred = mx.symbol.LinearRegressionOutput(data=pred, label=score)
    return pred
3. 训练
def trainingModel(train_dir='C:/Users/chuanxie/PycharmProjects/mxnetlearn/data/movie/'):
    """
    Load the MovieLens-1M files, build the training iterator and the network,
    plot the network and start training.

    Args:
        train_dir: directory (with trailing slash) that contains the
            ``ml-1m`` folder. Defaults to the path used originally.

    The helpers LoadRatingData, LoadUserData, LoadItemData, encodeTag,
    CustDataIter2 and train are defined elsewhere; comments about them below
    are assumptions to be confirmed against their definitions.
    """
    TRAIN_DIR = train_dir
    ratingdf = LoadRatingData(TRAIN_DIR + 'ml-1m/ratings.dat', delimiter='\t')
    userdf = LoadUserData(TRAIN_DIR + 'ml-1m/users.dat', delimiter='\t')
    itemdf = LoadItemData(TRAIN_DIR + 'ml-1m/movies.dat', delimiter='\t')
    np_encodedcat = encodeTag(itemdf)
    # Single-argument print with %-formatting is valid in both Python 2 and 3
    # and produces the same text as the original print statements.
    print('ratingdf.shape: %s' % (ratingdf.shape,))
    print('np_encodedcat.shape: %s' % (np_encodedcat.shape,))
    fulldf = ratingdf.join(userdf, on='userid').join(itemdf, on='itemid').join(np_encodedcat, on='itemid')

    # titles: split on single spaces, pad and index-encode
    # `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
    titlematrix = fulldf['title'].values
    titles = [s.split(" ") for s in titlematrix]
    title_arr, vocab_size = buildTitles(titles)
    sentence_size = len(title_arr[0])
    print('title_arr.shape: %s' % (title_arr.shape,))

    # rebuild the per-row category encodings as one 2-D array to get its width
    df_encoded_cat = np.array(fulldf['encoded_cat'].values.tolist())
    print('df_encoded_cat.shape: %s' % (df_encoded_cat.shape,))
    cat_width = df_encoded_cat.shape[1]

    data = np.array([fulldf['userid'], fulldf['gender'], fulldf['age'], fulldf['job'], fulldf['itemid'], title_arr, fulldf['encoded_cat']])
    print('train data shape: %s' % (data.shape,))
    label = np.array([fulldf['score']])

    context = mx.gpu()
    BATCH_SIZE = 400
    num_epoch = 200
    # Last argument: number of whole batches; floor division keeps it an
    # integer under Python 3 as well.
    trainIter = CustDataIter2(['userid', 'gender', 'age', 'job', 'itemid', 'title', 'cat'], data,
                              [(BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,), (BATCH_SIZE,),
                               (BATCH_SIZE, sentence_size), (BATCH_SIZE, cat_width)],
                              ['score'], label, [(BATCH_SIZE,)], context, BATCH_SIZE, data.shape[1] // BATCH_SIZE)

    # NOTE(review): these maxima are passed as Embedding input_dim. MXNet
    # expects indices in [0, input_dim), so unless the loaders re-index the
    # columns the largest id falls outside the table -- confirm, and use
    # max + 1 if so.
    max_userid = pd.Series(fulldf['userid']).max()
    max_itemid = pd.Series(fulldf['itemid']).max()
    max_gender = pd.Series(fulldf['gender']).max()
    max_age = pd.Series(fulldf['age']).max()
    max_job = pd.Series(fulldf['job']).max()
    max_title = vocab_size
    net = get_one_layer_mlp(max_userid=max_userid, max_itemid=max_itemid, max_gender=max_gender,
                            max_age=max_age, max_job=max_job, max_title=max_title,
                            sentence_size=sentence_size, k=96, batch_size=BATCH_SIZE)
    # Use the computed category width instead of a hard-coded 73 so the plot
    # shapes always agree with the shapes given to the iterator.
    mx.viz.plot_network(net, shape={'userid': (BATCH_SIZE,), 'gender': (BATCH_SIZE,), 'age': (BATCH_SIZE,),
                                    'job': (BATCH_SIZE,), 'itemid': (BATCH_SIZE,),
                                    'title': (BATCH_SIZE, sentence_size),
                                    'cat': (BATCH_SIZE, cat_width)}).view()
    # Train module
    train(net, trainIter, None, context, num_epoch=num_epoch, learning='rmsprop', learning_rate=0.001)
加入CNN之后,显存消耗和计算量都有了巨大增加。没有加CNN之前用的batch size是10K,加入CNN之后batch size只能调成500了:我的显卡只有1G显存,batch size再大就会黑屏。训练一个epoch的时间也由10s以内上涨到2分多钟:
INFO:root:Epoch[0] Train-rmse=0.975520
INFO:root:Epoch[0] Time cost=136.348
INFO:root:Epoch[1] Train-rmse=0.933322
INFO:root:Epoch[1] Time cost=134.050
INFO:root:Epoch[2] Train-rmse=0.912823
INFO:root:Epoch[2] Time cost=133.927
INFO:root:Epoch[3] Train-rmse=0.899103
INFO:root:Epoch[3] Time cost=133.942
INFO:root:Epoch[4] Train-rmse=0.893966
INFO:root:Epoch[4] Time cost=134.107