Implementing Wide & Deep on the Criteo Dataset with TensorFlow 2
The Criteo dataset
Imports
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple, OrderedDict
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Input, Embedding, Layer, Dense, Activation, Concatenate, Flatten, Add
from tensorflow.keras.initializers import RandomNormal, Zeros, glorot_normal
from tensorflow.keras.regularizers import l2
%matplotlib inline
Load the dataset
data = pd.read_csv('./data/criteo_sample_data.csv', sep=',', nrows=1000, header=0)
As we can see, the first column of the dataset is the label, columns starting with I (I1–I13) are numeric features, and columns starting with C (C1–C26) are categorical features.
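To verify the layout programmatically, here is a minimal sketch (it assumes the standard Criteo sample schema, which matches the column names used later in this post):

# Sanity check of the column layout: 1 label, 13 numeric (I*), 26 categorical (C*) columns
print(data.columns[0])                    # 'label'
print(data.filter(regex='^I').shape[1])   # 13
print(data.filter(regex='^C').shape[1])   # 26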
Coarse-grained preprocessing so the data can be used with TF2
1. Check the proportion of nulls per column
col_null = []
for col in data.columns:
    percent = data[col].isnull().sum() / data.shape[0]
    col_null.append([col, percent])
sorted(col_null, key=lambda x: x[1], reverse=True)
[['C22', 0.815],
['I12', 0.775],
['C19', 0.498],
['C20', 0.498],
['C25', 0.498],
['C26', 0.498],
['I1', 0.458],
['I10', 0.458],
['I6', 0.248],
['I3', 0.23],
['I4', 0.213],
['I13', 0.213],
['C6', 0.122],
['I5', 0.041],
['I7', 0.039],
['I9', 0.039],
['I11', 0.039],
['C3', 0.033],
['C4', 0.033],
['C12', 0.033],
['C16', 0.033],
['C21', 0.033],
['C24', 0.033],
 ...]
We therefore start by filling in the missing values.
2. Data preprocessing
sparse_feat = [col for col in data.columns if col.startswith('C')]
dense_feat = [col for col in data.columns if col.startswith('I')]
# Dense features: fill missing values with 0, then apply a log1p transform
for col in dense_feat:
    data[col] = np.log1p(data[col].fillna(0.0))
# Sparse features: fill missing values with a placeholder category ('其他' = 'other'), then label-encode
for col in sparse_feat:
    data[col] = data[col].fillna('其他')
    label_e = LabelEncoder()
    data[col] = label_e.fit_transform(data[col])
After this coarse-grained preprocessing we get:
The feature engineering here is deliberately minimal; the focus is on implementing the Wide & Deep model in TF2. In real production work, however, feature engineering and data cleaning matter a great deal.
Wide & Deep
The Wide & Deep model consists of two parts, a wide part and a DNN part, which capture low-order feature information and high-order feature interactions respectively; their outputs are combined to produce the final prediction. Our implementation is therefore also split into a wide part and a DNN part.
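For reference, the joint prediction in the original Wide & Deep paper (Cheng et al., 2016) is

$$P(Y=1 \mid \mathbf{x}) = \sigma\left(\mathbf{w}_{wide}^{\top}\,[\mathbf{x}, \phi(\mathbf{x})] + \mathbf{w}_{deep}^{\top}\,a^{(l_f)} + b\right)$$

where $\phi(\mathbf{x})$ denotes the cross-product transformations of the raw features $\mathbf{x}$, $a^{(l_f)}$ is the final hidden activation of the DNN, $b$ is the bias term, and $\sigma$ is the sigmoid.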
1. Preliminaries
We use namedtuple here to record each column's metadata (name, vocabulary size, desired embedding dimension, and so on) so that these values can be looked up directly later.
(1) SparseFeat
class SparseFeat(namedtuple('SparseFeat',
                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype',
                             'embeddings_initializer', 'embedding_name', 'trainable'])):
    __slots__ = ()

    def __new__(cls, name, vocabulary_size, embedding_dim=8, use_hash=False, dtype="int32",
                embeddings_initializer=None, embedding_name=None, trainable=True):
        if embedding_dim == "auto":
            embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
        if embeddings_initializer is None:
            embeddings_initializer = RandomNormal(mean=0.0, stddev=0.0001, seed=2020)
        if embedding_name is None:
            embedding_name = name
        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
                                              embeddings_initializer, embedding_name, trainable)

    def __hash__(self):
        return self.name.__hash__()
(2) DenseFeat
class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
    __slots__ = ()

    def __new__(cls, name, dimension=1, dtype="float32"):
        return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)

    def __hash__(self):
        return self.name.__hash__()
(3) Build a dict holding the Input layers
def build_input_features(feature_columns, prefix=''):
    input_features = OrderedDict()
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            # Sparse inputs are kept as float32 so the raw label-encoded IDs
            # can also be fed directly into the linear (wide) part below.
            input_features[fc.name] = Input(
                shape=(1,), name=prefix + fc.name, dtype=tf.float32)
        elif isinstance(fc, DenseFeat):
            input_features[fc.name] = Input(
                shape=(fc.dimension,), name=prefix + fc.name, dtype=tf.float32)
    return input_features
(4) Collect the sparse-column inputs
def get_sparse_input(features, feature_columns):
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
    sparse_input_list = []
    for fc in sparse_feature_columns:
        sparse_input_list.append(features[fc.name])
    return sparse_input_list
(5) Collect the dense-column inputs
def get_dense_input(features, feature_columns):
    dense_feature_columns = list(
        filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if feature_columns else []
    dense_input_list = []
    for fc in dense_feature_columns:
        dense_input_list.append(features[fc.name])
    return dense_input_list
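As a quick sanity check, here is a minimal sketch of how these helpers fit together (the toy feature definitions below are illustrative, not taken from the Criteo data):

# Toy example: two sparse fields and two dense fields (illustrative names/sizes)
toy_columns = [SparseFeat('C1', vocabulary_size=100, embedding_dim=8),
               SparseFeat('C2', vocabulary_size=50, embedding_dim=8),
               DenseFeat('I1', dimension=1),
               DenseFeat('I2', dimension=1)]
toy_inputs = build_input_features(toy_columns)
print(list(toy_inputs.keys()))                         # ['C1', 'C2', 'I1', 'I2']
print(len(get_sparse_input(toy_inputs, toy_columns)))  # 2
print(len(get_dense_input(toy_inputs, toy_columns)))   # 2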
2. The wide part
In Wide & Deep, the wide part is essentially a linear (logistic) regression; let's now build the wide-side network.
def get_linear_logit(features_inputs, feature_columns, use_bias=False, seed=1024, prefix='linear',
                     l2_reg=0):
    sparse_input = get_sparse_input(features_inputs, feature_columns)
    dense_input = get_dense_input(features_inputs, feature_columns)
    dense_inputs = tf.keras.layers.Concatenate(axis=-1)(dense_input)
    sparse_inputs = tf.keras.layers.Concatenate(axis=-1)(sparse_input)
    linear_features = tf.concat([sparse_inputs, dense_inputs], axis=1)
    # Apply the l2_reg argument to the kernel (it was previously unused)
    linear_logit = tf.keras.layers.Dense(1, activation=None, use_bias=use_bias,
                                         kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
                                         name=prefix)(linear_features)
    return linear_logit
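Continuing the toy example above, the wide logit is one scalar per example. Note that this wide side feeds the raw label-encoded IDs straight into a Dense layer, rather than the one-hot and cross-product features of the original paper; a minimal sketch:

# Sketch: wide-side logit over the toy inputs defined earlier
toy_linear_logit = get_linear_logit(toy_inputs, toy_columns)
print(toy_linear_logit.shape)  # (None, 1)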
3. The deep part
The deep part is just a DNN, but before implementing it we first need to turn the sparse features into embeddings.
def get_sparse_emb_list(features, feature_columns, l2_emb_reg, prefix='sparse_'):
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
    sparse_emb_list = []
    for feat in sparse_feature_columns:
        emb = Embedding(feat.vocabulary_size, feat.embedding_dim,
                        embeddings_initializer=feat.embeddings_initializer,
                        embeddings_regularizer=tf.keras.regularizers.l2(l2_emb_reg),
                        name=prefix + '_emb_' + feat.embedding_name,
                        trainable=feat.trainable)(features[feat.name])
        sparse_emb_list.append(emb)
    return sparse_emb_list
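Again with the toy inputs, each sparse field yields one (1, embedding_dim) embedding slice (a minimal sketch, shapes only):

toy_embs = get_sparse_emb_list(toy_inputs, toy_columns, l2_emb_reg=1e-5)
print(len(toy_embs))      # 2, one embedding tensor per sparse field
print(toy_embs[0].shape)  # (None, 1, 8)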
The DNN code is as follows:
class DNN(Layer):
    """The Multi Layer Perceptron.
    Input shape
        - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``.
    Output shape
        - nD tensor with shape: ``(batch_size, ..., hidden_units[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_units[-1])``.
    Arguments
        - **hidden_units**: list of positive integers, the number of layers and the units in each layer.
        - **activation**: Activation function to use.
        - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix.
        - **dropout_rate**: float in [0,1). Fraction of the units to drop.
        - **use_bn**: bool. Whether to use BatchNormalization before activation or not.
        - **seed**: A Python integer to use as random seed.
    """

    def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs):
        self.hidden_units = hidden_units
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.seed = seed
        self.l2_reg = l2_reg
        self.use_bn = use_bn
        super(DNN, self).__init__(**kwargs)

    def build(self, input_shape):
        input_size = input_shape[-1]
        hidden_units = [int(input_size)] + list(self.hidden_units)
        self.kernels = [self.add_weight(name='kernel' + str(i),
                                        shape=(hidden_units[i], hidden_units[i + 1]),
                                        initializer=glorot_normal(seed=self.seed),
                                        regularizer=l2(self.l2_reg),
                                        trainable=True) for i in range(len(self.hidden_units))]
        self.bias = [self.add_weight(name='bias' + str(i),
                                     shape=(self.hidden_units[i],),
                                     initializer=Zeros(),
                                     trainable=True) for i in range(len(self.hidden_units))]
        if self.use_bn:
            self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))]
        self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in
                               range(len(self.hidden_units))]
        self.activation_layers = [Activation(self.activation) for _ in range(len(self.hidden_units))]
        super(DNN, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs, training=None, **kwargs):
        deep_input = inputs
        for i in range(len(self.hidden_units)):
            # Affine transform: deep_input @ kernel + bias
            fc = tf.nn.bias_add(tf.tensordot(
                deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
            if self.use_bn:
                fc = self.bn_layers[i](fc, training=training)
            fc = self.activation_layers[i](fc)
            fc = self.dropout_layers[i](fc, training=training)
            deep_input = fc
        return deep_input

    def compute_output_shape(self, input_shape):
        if len(self.hidden_units) > 0:
            shape = input_shape[:-1] + (self.hidden_units[-1],)
        else:
            shape = input_shape
        return tuple(shape)

    def get_config(self, ):
        config = {'activation': self.activation, 'hidden_units': self.hidden_units,
                  'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed}
        base_config = super(DNN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
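Before wiring the DNN into the full model, a quick smoke test on a random batch confirms the output shape (a minimal sketch; sizes are illustrative):

x = tf.random.normal([4, 16])                          # batch of 4 examples, 16 features
out = DNN(hidden_units=(64, 32), dropout_rate=0.5)(x)
print(out.shape)                                       # (4, 32): the last hidden size

With the building blocks in place, the deep side of the model is wired up as follows: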
sparse_emb = get_sparse_emb_list(features_inputs, linear_feature_columns, l2_reg_embedding, prefix='sparse_')
# DNN
dense_input = get_dense_input(features_inputs, dnn_feature_columns)
dense_input = Concatenate(axis=1)(dense_input)
dnn_input = tf.concat([Flatten()(Concatenate(axis=1)(sparse_emb)), dense_input], axis=1)
dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                 dnn_use_bn, seed)(dnn_input)
# Keep the DNN output as a raw logit (no activation) so the wide and deep
# logits can be summed before a single sigmoid is applied
dnn_logit = Dense(1, activation=None, name='dnn_logit',
                  kernel_regularizer=tf.keras.regularizers.l2(l2_reg_dnn),
                  use_bias=False)(dnn_output)
4. wide + deep
out_put = Add()([linear_logit, dnn_logit])
out_put = Activation('sigmoid')(out_put)
model = tf.keras.models.Model(inputs=inputs_list, outputs=out_put)
With that, the model is fully built.
5. Complete model-building code
def Wide_Deep(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(128, 128, 64, 32),
              l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
              dnn_activation='relu', dnn_use_bn=True, task='binary'):
    """Instantiates the Wide & Deep network architecture.
    :param linear_feature_columns: An iterable containing all the features used by the linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
    :param dnn_hidden_units: list of positive integers or empty list, the number of layers and the units in each layer of the DNN
    :param l2_reg_linear: float. L2 regularizer strength applied to the linear part
    :param l2_reg_embedding: float. L2 regularizer strength applied to the embedding vectors
    :param l2_reg_dnn: float. L2 regularizer strength applied to the DNN
    :param seed: integer to use as random seed.
    :param dnn_dropout: float in [0,1), the probability of dropping a given DNN coordinate.
    :param dnn_activation: Activation function to use in the DNN
    :param dnn_use_bn: bool. Whether to use BatchNormalization before activation in the DNN
    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss
    :return: A Keras model instance.
    """
    features_inputs = build_input_features(linear_feature_columns + dnn_feature_columns)
    inputs_list = list(features_inputs.values())
    # Wide part (LR)
    linear_logit = get_linear_logit(features_inputs, linear_feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)
    sparse_emb = get_sparse_emb_list(features_inputs, linear_feature_columns, l2_reg_embedding, prefix='sparse_')
    # Deep part (DNN)
    dense_input = get_dense_input(features_inputs, dnn_feature_columns)
    dense_input = Concatenate(axis=1)(dense_input)
    dnn_input = tf.concat([Flatten()(Concatenate(axis=1)(sparse_emb)), dense_input], axis=1)
    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(dnn_input)
    dnn_logit = Dense(1, activation=None, name='dnn_logit',
                      kernel_regularizer=tf.keras.regularizers.l2(l2_reg_dnn),
                      use_bias=False)(dnn_output)
    # Sum the two logits and apply a single sigmoid
    out_put = Add()([linear_logit, dnn_logit])
    out_put = Activation('sigmoid')(out_put)
    model = tf.keras.models.Model(inputs=inputs_list, outputs=out_put)
    return model
Running the model on the data
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from models import Wide_Deep
from feature_columns import SparseFeat, DenseFeat
from utils import get_feature_names

if __name__ == "__main__":
    data = pd.read_csv('../data/criteo_sample_data.csv', nrows=1000)
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]
    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    target = ['label']

    # 1. Label-encode the sparse features and min-max scale the dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense feature field names
    # (embeddings use SparseFeat's default RandomNormal initializer)
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=8)
                              for feat in sparse_features] + [DenseFeat(feat, 1)
                                                              for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    print("feature_names", feature_names)

    # 3. Generate the input data for the model
    train, test = train_test_split(data, test_size=0.2, random_state=2018)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict and evaluate
    model = Wide_Deep(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])
    history = model.fit(train_model_input, train[target].values,
                        batch_size=128, epochs=10, verbose=2,
                        validation_data=(test_model_input, test[target].values))
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))