1. 理论
FM模型:
和LR模型相比,FM模型引入了二阶特征组合项:
$$y(\mathbf{x})=w_{0}+\sum_{i=1}^{n} w_{i} x_{i}+\sum_{i=1}^{n} \sum_{j=i+1}^{n} w_{i j} x_{i} x_{j}$$
其中 $n$ 代表样本的特征数量, $x_i$ 是第 $i$ 个特征的值, $w_0, w_i, w_{ij}$ 是模型的参数。
对二阶权重矩阵 $W$ 做低秩分解 $W = V V^{T}$(即 $w_{ij} = \langle\mathbf{v}_i, \mathbf{v}_j\rangle$):
$$y(\mathbf{x})=w_{0}+\sum_{i=1}^{n} w_{i} x_{i}+\sum_{i=1}^{n} \sum_{j=i+1}^{n}\left\langle\mathbf{v}_{i}, \mathbf{v}_{j}\right\rangle x_{i} x_{j}$$
其中, $V$ 的维度是 $n \times k$, 且 $k \ll n$; $\mathbf{v}_i$ 是 $V$ 的第 $i$ 行, 即第 $i$ 个特征对应的 $k$ 维隐向量。
采用MSE损失函数求解回归问题,采用Hinge或者Cross-entropy损失求解分类问题。对于二元分类,FM的输出还需要经过sigmoid变换。
二次项的简化技巧:
$$\sum_{i=1}^{n} \sum_{j=i+1}^{n}\left\langle\mathbf{v}_{i}, \mathbf{v}_{j}\right\rangle x_{i} x_{j}=\frac{1}{2} \sum_{f=1}^{k}\left(\left(\sum_{i=1}^{n} v_{i, f} x_{i}\right)^{2}-\sum_{i=1}^{n} v_{i, f}^{2} x_{i}^{2}\right)$$
简化后, FM二次项的计算复杂度从 $O(k n^2)$ 降为 $O(k n)$。
2. python实现
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.datasets import load_breast_cancer
import keras.backend as K
from keras.layers import Input,Dense,Add,Activation
from keras import optimizers
from keras.models import Model
from keras.engine.topology import Layer
class CrossLayer(Layer):
    """Second-order (pairwise interaction) term of a factorization machine.

    For an input batch ``X`` and a trainable factor matrix ``V`` of shape
    ``(input_dim, factor_order)`` the layer evaluates

        0.5 * sum_f ((X @ V)[:, f] ** 2 - (X ** 2 @ V ** 2)[:, f])

    and returns it as a single column per sample.

    Args:
        input_dim: Number of rows of the factor matrix; used as the first
            dimension of the kernel created in ``build``.
        factor_order: Number of columns of the factor matrix (latent size).
        **kwargs: Forwarded unchanged to the Keras ``Layer`` constructor.
    """

    def __init__(self, input_dim, factor_order=10, **kwargs):
        self.input_dim = input_dim
        self.factor_order = factor_order
        super(CrossLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable factor matrix ``kernel``."""
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.input_dim, self.factor_order),
                                      initializer='glorot_uniform',
                                      trainable=True)
        super(CrossLayer, self).build(input_shape)

    def call(self, X):
        """Return the interaction term for ``X`` with one column per row."""
        # "Square of sum minus sum of squares" form of the pairwise term.
        square_of_sum = K.pow(K.dot(X, self.kernel), 2)
        sum_of_square = K.dot(K.pow(X, 2), K.pow(self.kernel, 2))
        # Summing over axis 1 collapses the factor dimension.
        return 0.5 * K.sum(square_of_sum - sum_of_square, axis=1, keepdims=True)

    def compute_output_shape(self, input_shape):
        """Output keeps the first input dimension and has one column."""
        return (input_shape[0], 1)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created.

        Without this, rebuilding the layer from a saved config fails because
        ``input_dim`` is a required constructor argument.
        """
        config = super(CrossLayer, self).get_config()
        config.update({
            'input_dim': self.input_dim,
            'factor_order': self.factor_order,
        })
        return config
class FM:
    """Binary-classification factorization machine built with Keras.

    The model adds a linear term (``Dense(1)``) and a ``CrossLayer``
    interaction term and passes the sum through a sigmoid.

    Args:
        input_dim: Number of input features. May be left as ``None``, in
            which case it is inferred from ``X_train`` on the first ``fit``.
        factor_order: Latent size passed to ``CrossLayer``.
        output_dim: Stored on the instance but not used by the model; kept
            for backward compatibility.
        epochs: Number of epochs passed to ``Model.fit``.
        batch_size: Batch size used for training and, by default, prediction.
    """

    def __init__(self, input_dim=None, factor_order=2, output_dim=1, epochs=10, batch_size=256):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.factor_order = factor_order
        self.epochs = epochs
        self.batch_size = batch_size
        # Created by build_model(); None means "not built yet".
        self.model = None

    def build_model(self):
        """Build and compile the Keras model, store it and return it.

        Raises:
            ValueError: If ``input_dim`` has not been set.
        """
        if self.input_dim is None:
            raise ValueError(
                "input_dim must be set before build_model() is called")
        X = Input(shape=(self.input_dim,))
        linear_term = Dense(units=1)(X)
        cross_term = CrossLayer(self.input_dim, self.factor_order)(X)
        logits = Add()([linear_term, cross_term])
        predictions = Activation('sigmoid')(logits)
        self.model = Model(inputs=X, outputs=predictions)
        self.model.compile(loss='binary_crossentropy', optimizer='adam')
        # summary() prints the table itself and returns None, so wrapping it
        # in print() would emit an extra "None" line.
        self.model.summary()
        return self.model

    def fit(self, X_train, y_train):
        """Train the model, building it first if necessary.

        Returns:
            The underlying Keras model.
        """
        if self.model is None:
            if self.input_dim is None:
                # Infer the feature count from the second axis of the data.
                self.input_dim = int(np.shape(X_train)[1])
            self.build_model()
        self.model.fit(X_train, y_train, epochs=self.epochs, batch_size=self.batch_size)
        return self.model

    def predict(self, X_test, batch_size=None):
        """Return the model's predictions for ``X_test``.

        Args:
            X_test: Input features.
            batch_size: Prediction batch size; defaults to ``self.batch_size``.

        Raises:
            RuntimeError: If the model has not been built yet.
        """
        if self.model is None:
            raise RuntimeError(
                "model has not been built; call build_model() or fit() first")
        if batch_size is None:
            batch_size = self.batch_size
        return self.model.predict(X_test, batch_size=batch_size)
def main():
    """Train the FM on the breast-cancer dataset and print the test AUC."""
    data = load_breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=0.2,
        random_state=27, stratify=data.target)

    # The raw features span very different ranges (some reach the
    # thousands). The FM interaction term is quadratic in the inputs, so
    # unscaled data very likely saturates the sigmoid and leaves the model
    # with constant predictions. Fit the scaler on the training split only
    # to avoid leaking test statistics.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Take the feature count from the data instead of hard-coding it.
    fm = FM(X_train.shape[1])
    fm.build_model()
    fm.fit(X_train, y_train)
    pred_ans = fm.predict(X_test)
    print('pred_ans', pred_ans)
    print("test AUC", round(roc_auc_score(y_test, pred_ans), 4))


if __name__ == "__main__":
    main()
运行结果:
test AUC 0.5
跑出来的结果有点怪: AUC 为 0.5 相当于随机猜测。可能数据集不太适合, 但更可能的原因是输入特征没有做归一化(例如用 MinMaxScaler 缩放到 [0, 1]), 导致二阶交叉项数值过大、sigmoid 输出饱和。
参考: