文献:
《Factorization Machines》,没法上传文献
原理解读:
FM主要是解决稀疏数据下的特征组合问题
普通的线性模型LR,我们都是将各个特征独立考虑的,并没有考虑到特征与特征之间的相互关系:
但实际上,特征之间可能具有一定的关联。以新闻推荐为例,一般男性用户看军事新闻多,而女性用户喜欢情感类新闻,那么可以看出性别与新闻的频道有一定的关联性,如果能找出这类的特征,是非常有意义的:
神经网络中的实现
出现了很多将fm与神经网络结合的模型,如deepfm、nfm等。下面讲解一下如何在代码中实现。
先讲一下难点:在于二阶交叉特征项的计算,可以利用恒等式化简为"和的平方"与"平方的和"之差,即 $\sum_{i<j}\langle v_i,v_j\rangle x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\Big[\big(\sum_i v_{i,f}x_i\big)^2-\sum_i v_{i,f}^2 x_i^2\Big]$。这是推导过程:
deepfm的代码在keras中实现:
分为一阶、二阶、deep部分。
其中 continuous_first_order_tensors / continuous_second_order_tensors 代表连续型变量对应的张量组合,embeddings_first_order_tensors / embeddings_second_order_tensors 代表离散型变量做 embedding 后的张量组合。
# --- DeepFM in Keras (functional API) ---
# Builds a binary classifier from three parallel parts over the same inputs:
#   1) FM first-order (linear) terms,
#   2) FM second-order pairwise-interaction terms, computed via the
#      "square-of-sum minus sum-of-squares" identity,
#   3) a deep MLP over the concatenated second-order factor vectors.
# NOTE(review): depends on names defined elsewhere in the full source:
# embedding_cols / cont_cols (feature-name lists), int_to_float (a cast
# helper used in a Lambda), and the Keras/TF imports (Input, Embedding,
# Dense, Lambda, Add, Reshape, concatenate, Model, l2, tf, json) — confirm.
global deep_cols
global embedding_cols
global cont_cols
# Vocabulary size per categorical feature, loaded from a JSON mapping
# {feature_name: number_of_distinct_values}.
with open("./data/embedding_number_v4.json",'r', encoding='UTF-8') as f:
    load_dict = json.load(f)
reg = 1e-4  # NOTE(review): assigned but never used below — dead constant?
v_factors=4  # k: dimensionality of the FM latent factor vectors
unique_vals=load_dict
embeddings_first_order_tensors=[]   # per-feature first-order terms (w_i * x_i)
embeddings_second_order_tensors=[]  # per-feature latent factor terms (v_i * x_i)
input_layers=[]
# Categorical features: one int64 input each, fed to two embeddings —
# width 1 for the first-order weight, width k for the second-order factors.
for ec in embedding_cols:
    layer_name = ec
    vocabulary_size=unique_vals[ec]
    inp = Input(shape=(1,), dtype='int64', name=layer_name)
    input_layers.append(inp)
    # 1st-order term: a width-1 Embedding acts as a learned per-category weight.
    embeddings_first_order_tensor=Embedding(vocabulary_size, 1, input_length=1)(inp)
    embeddings_first_order_tensors.append(embeddings_first_order_tensor)
    # 2nd-order term: a width-k Embedding gives the latent factor vector v_i.
    embeddings_second_order_tensor=Embedding(vocabulary_size,v_factors,input_length=1)(inp)
    embeddings_second_order_tensors.append(embeddings_second_order_tensor)
    del embeddings_second_order_tensor  # redundant cleanup; rebound next iteration
continuous_first_order_tensors = []
continuous_second_order_tensors=[]
# Continuous features: Dense layers play the role the Embeddings play above.
for cc in cont_cols:
    layer_name = cc
    # NOTE(review): continuous inputs are declared int64 and cast below —
    # presumably the feed supplies integer-coded values; confirm upstream.
    inp = Input(shape=(1,), dtype='int64', name=layer_name)
    input_layers.append(inp)
    inp_2=Reshape((1, 1))(inp)  # (batch, 1) -> (batch, 1, 1) to match Embedding output rank
    inp_3=Lambda(int_to_float)(inp_2)  # int_to_float: presumably a float cast — confirm its definition
    # 1st-order term: Dense(1) learns w_i * x_i for the continuous value.
    continuous_first_order_tensor=Dense(1)(inp_3)
    continuous_first_order_tensors.append(continuous_first_order_tensor)
    # 2nd-order term: Dense(k) learns the scaled factor vector v_i * x_i.
    #continuous_second_order_tensor=RepeatVector(1)(Dense(v_factors)(inp_3))
    continuous_second_order_tensor=Dense(v_factors)(inp_3)
    continuous_second_order_tensors.append(continuous_second_order_tensor)
    del continuous_second_order_tensor  # redundant cleanup; rebound next iteration
# ---- 1st-order (linear) part ----
#first_order=Flatten()(Add()(continuous_first_order_tensors+embeddings_first_order_tensors))# 1
# Concatenate all (batch, 1, 1) first-order terms along the last axis, then
# mix them with an L2-regularised Dense(1).
first_order=concatenate(inputs=continuous_first_order_tensors+embeddings_first_order_tensors,axis=2)
first_order=Dense(1,kernel_regularizer=l2(0.01))(first_order)
print("first_order:",first_order)
# ---- 2nd-order (pairwise interaction) part ----
# FM identity (per latent dimension f):
#   sum_{i<j} <v_i, v_j> x_i x_j = 0.5 * [ (sum_i v_i x_i)^2 - sum_i (v_i x_i)^2 ]
second_order_inp=embeddings_second_order_tensors+continuous_second_order_tensors
print("second_order",second_order_inp)
# _________ square-of-sum part _____________
summed_features_emb=Add()(second_order_inp)  # element-wise sum over the list of per-field tensors
print("summed_features_emb:",summed_features_emb)
summed_features_emb_square=Lambda(lambda x: tf.square(x))(summed_features_emb)
print("summed_features_emb_square:",summed_features_emb_square)
# _________ sum-of-squares part _____________
# NOTE(review): this Lambda receives a Python *list* of tensors; tf.square
# presumably converts it into one stacked (n_fields, batch, 1, k) tensor, and
# the reduce_sum over axis 0 then sums across fields — confirm this implicit
# stacking behaviour on the TF/Keras version in use.
squared_features_emb=Lambda(lambda x: tf.square(x))(second_order_inp)
print("squared_features_emb:",squared_features_emb)
squared_sum_features_emb=Lambda(lambda x: tf.reduce_sum(x,0))(squared_features_emb)
print("squared_sum_features_emb:",squared_sum_features_emb)
# 0.5 * (square-of-sum - sum-of-squares): the FM interaction term, kept
# per latent dimension (no final reduce over k; the Dense heads mix it).
second_order=Lambda(lambda x: 0.5 * tf.subtract(x[0],x[1]))([summed_features_emb_square, squared_sum_features_emb])
#second_order=Flatten()(second_order)
# ---- deep part: MLP over the concatenated second-order factor vectors ----
#d = Flatten()(concatenate(second_order_inp))
d = concatenate(second_order_inp)
d = Dense(128, activation='relu',kernel_regularizer=l2(0.01))(d)
#d=Dropout(0.5)(d)
d = Dense(64, activation='relu',kernel_regularizer=l2(0.01))(d)
#d=Dropout(0.5)(d)
# Merge the three parts and squash to a click/label probability.
d = concatenate([first_order,second_order,d])
out = Dense(1, activation='sigmoid')(d)
print("out:",out)
out=Reshape([1])(out)  # collapse trailing singleton axes -> (batch, 1)
deep = Model(input_layers, out)
deep.summary()
具体实现见:https://github.com/zhangyingerjelly/recommendation/tree/master/%E5%8D%95%E7%9B%AE%E6%A0%87%E6%8E%A8%E8%8D%90%E6%A8%A1%E5%9E%8B