Implementing Wide & Deep in TensorFlow 2

The Criteo dataset

Imports

import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple, OrderedDict
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.initializers import RandomNormal, Zeros, glorot_normal
from tensorflow.keras.layers import (Input, Embedding, Dense, Concatenate,
                                     Flatten, Add, Activation, Layer)
from tensorflow.keras.regularizers import l2
%matplotlib inline

Load the dataset

data = pd.read_csv('./data/criteo_sample_data.csv', sep=',', nrows=1000, header=0)

(screenshot of data.head() omitted)
As the screenshot shows, the first column is the label, columns starting with I hold numerical features, and columns starting with C hold categorical features.

Coarse-grained preprocessing, so the data can be fed to TF2

1. First, check the proportion of nulls in each column:
col_null = []
for col in data.columns:
    percent = data[col].isnull().sum() / data.shape[0]
    col_null.append([col, percent])
# sort columns by missing ratio, highest first
sorted(col_null, key=lambda x: x[1], reverse=True)
[['C22', 0.815],
 ['I12', 0.775],
 ['C19', 0.498],
 ['C20', 0.498],
 ['C25', 0.498],
 ['C26', 0.498],
 ['I1', 0.458],
 ['I10', 0.458],
 ['I6', 0.248],
 ['I3', 0.23],
 ['I4', 0.213],
 ['I13', 0.213],
 ['C6', 0.122],
 ['I5', 0.041],
 ['I7', 0.039],
 ['I9', 0.039],
 ['I11', 0.039],
 ['C3', 0.033],
 ['C4', 0.033],
 ['C12', 0.033],
 ['C16', 0.033],
 ['C21', 0.033],
 ['C24', 0.033],
 ...]

So we first need to fill in these missing values.

2. Data processing
sparse_feat = [col for col in data.columns if col.startswith('C')]
dense_feat = [col for col in data.columns if col.startswith('I')]
# dense features: fill missing values with 0, then smooth with log1p
for col in dense_feat:
    data[col] = np.log1p(data[col].fillna(0.0))
# sparse features: fill missing values with a placeholder category, then label-encode
for col in sparse_feat:
    data[col] = data[col].fillna('other')
    label_e = LabelEncoder()
    data[col] = label_e.fit_transform(data[col])
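
As a quick sanity check (my own addition, not in the original post), confirm that no missing values remain; note that np.log1p yields NaN for dense values below -1, so this check also catches those:

# confirm no NaNs are left after the fills above; expect 0
print(data[dense_feat + sparse_feat].isnull().sum().sum())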

After this coarse preprocessing the dataframe is fully numeric (screenshot omitted).

We deliberately keep the feature engineering minimal here; the focus is on implementing Wide & Deep with TF2. In real production systems, however, feature engineering and data cleaning matter a great deal.

Wide & Deep

The Wide & Deep model has two components: a wide part and a DNN part. The wide part captures low-order feature signals while the DNN part learns high-order feature crosses; their outputs are combined to produce the final prediction. Our implementation is therefore also split into a wide part and a DNN part.
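
For reference, the final prediction in the original Wide & Deep paper (Cheng et al., 2016) combines the two sides as

$$P(Y=1 \mid \mathbf{x}) = \sigma\big(\mathbf{w}_{wide}^{\top}[\mathbf{x}, \phi(\mathbf{x})] + \mathbf{w}_{deep}^{\top} a^{(l_f)} + b\big)$$

where $\phi(\mathbf{x})$ are the cross-product transformations of the raw features and $a^{(l_f)}$ is the last hidden activation of the DNN: the two logits are summed, and a single sigmoid is applied at the end.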

1. Preliminaries

We use namedtuple here to record each column's metadata (name, vocabulary size, desired embedding dimension, etc.), so that it can be looked up directly later.

(1) SparseFeat
class SparseFeat(namedtuple('SparseFeat',
                            ['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embeddings_initializer',
                             'embedding_name','trainable'])):
    __slots__ = ()

    def __new__(cls, name, vocabulary_size, embedding_dim=8, use_hash=False, dtype="int32", embeddings_initializer=None,
                embedding_name=None, trainable=True):

        if embedding_dim == "auto":
            embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
        if embeddings_initializer is None:
            embeddings_initializer = RandomNormal(mean=0.0, stddev=0.0001, seed=2020)

        if embedding_name is None:
            embedding_name = name

        return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
                                              embeddings_initializer,
                                              embedding_name, trainable)

    def __hash__(self):
        return self.name.__hash__()
(2) DenseFeat
class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
    __slots__ = ()

    def __new__(cls, name, dimension=1, dtype="float32"):
        return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)

    def __hash__(self):
        return self.name.__hash__()
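
A quick, hypothetical example (my own, not from the original post) of how these descriptors are created; 'C1' and 'I1' are Criteo column names:

c1 = SparseFeat('C1', vocabulary_size=27, embedding_dim='auto')
print(c1.embedding_dim)   # 'auto' -> 6 * int(27 ** 0.25) = 12
i1 = DenseFeat('I1')      # dimension defaults to 1, dtype to float32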
(3) Build a dict to hold the Input layers
def build_input_features(feature_columns, prefix=''):
    input_features = OrderedDict()
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            # float32 is fine here: the Keras Embedding layer casts indices to int internally
            input_features[fc.name] = Input(
                shape=(1,), name=prefix + fc.name, dtype=tf.float32)
        elif isinstance(fc, DenseFeat):
            input_features[fc.name] = Input(
                shape=(fc.dimension,), name=prefix + fc.name, dtype=tf.float32)
    return input_features
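
A toy illustration (my own example) of what this returns:

feats = [SparseFeat('C1', vocabulary_size=27), DenseFeat('I1', 1)]
inputs = build_input_features(feats)
print(inputs)
# OrderedDict([('C1', <KerasTensor: shape=(None, 1) ...>),
#              ('I1', <KerasTensor: shape=(None, 1) ...>)])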
(4) Collect the sparse-feature inputs
def get_sparse_input(features, feature_columns):
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
    sparse_input_list = []
    for fc in sparse_feature_columns:
        sparse_input_list.append(features[fc.name])
    return sparse_input_list
(5) Collect the dense-feature inputs
def get_dense_input(features, feature_columns):
    dense_feature_columns = list(
        filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if feature_columns else []
    dense_input_list = []
    for fc in dense_feature_columns:
        dense_input_list.append(features[fc.name])
    return dense_input_list

2. The wide side


In Wide & Deep, the wide side is essentially a linear model (logistic regression). Let's build it:

def get_linear_logit(features_inputs, feature_columns, use_bias=False, seed=1024, prefix='linear',
                     l2_reg=0):
    # collect the raw (label-encoded) sparse inputs and the dense inputs
    sparse_input = get_sparse_input(features_inputs, feature_columns)
    dense_input = get_dense_input(features_inputs, feature_columns)

    dense_inputs = tf.keras.layers.Concatenate(axis=-1)(dense_input)
    sparse_inputs = tf.keras.layers.Concatenate(axis=-1)(sparse_input)

    linear_features = tf.concat([sparse_inputs, dense_inputs], axis=1)
    # a single linear unit over all wide features (l2_reg was unused in the original; wired in here)
    linear_logit = tf.keras.layers.Dense(1, activation=None, use_bias=use_bias,
                                         kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
                                         name=prefix)(linear_features)
    return linear_logit
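
One caveat: as written, this wide side feeds the label-encoded category IDs straight into the Dense layer, treating them as ordinal values. The wide part in the original paper instead operates on one-hot (and crossed) sparse features. A hedged sketch of that variant, reusing the helpers above (the name get_linear_logit_onehot is my own):

def get_linear_logit_onehot(features_inputs, feature_columns, l2_reg=0, prefix='linear'):
    # one-hot encode each sparse id before the linear layer (variant, not the post's code)
    sparse_fcs = [fc for fc in feature_columns if isinstance(fc, SparseFeat)]
    onehots = [tf.one_hot(tf.cast(features_inputs[fc.name][:, 0], tf.int32), fc.vocabulary_size)
               for fc in sparse_fcs]
    dense_input = get_dense_input(features_inputs, feature_columns)
    linear_features = tf.keras.layers.Concatenate(axis=-1)(onehots + dense_input)
    return tf.keras.layers.Dense(1, use_bias=False,
                                 kernel_regularizer=tf.keras.regularizers.l2(l2_reg),
                                 name=prefix + '_onehot')(linear_features)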

3. The deep side

The deep side is just a DNN. Before building it, though, we first need to turn the sparse features into embeddings:

def get_sparse_emb_list(features,feature_columns,l2_emb_reg,prefix='sparse_'):
    sparse_feature_columns = list(
        filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
    sparse_emb_list = []
    for feat in sparse_feature_columns:
        emb = Embedding(feat.vocabulary_size, feat.embedding_dim,
                        embeddings_initializer=feat.embeddings_initializer,
                        embeddings_regularizer=tf.keras.regularizers.l2(l2_emb_reg),
                        name=prefix + '_emb_' + feat.embedding_name,
                        trainable=feat.trainable)(features[feat.name])
        sparse_emb_list.append(emb)
    return sparse_emb_list
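
Each embedding in the returned list has shape (batch_size, 1, embedding_dim), since every sparse Input has shape (1,); this is why the DNN input is flattened later. A toy shape check (my own example):

fc = SparseFeat('C1', vocabulary_size=27, embedding_dim=8)
inputs = build_input_features([fc])
emb = get_sparse_emb_list(inputs, [fc], l2_emb_reg=1e-5)[0]
print(emb.shape)   # (None, 1, 8)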

The DNN code is as follows:

class DNN(Layer):
    """The Multi Layer Percetron

      Input shape
        - nD tensor with shape: ``(batch_size, ..., input_dim)``. The most common situation would be a 2D input with shape ``(batch_size, input_dim)``.

      Output shape
        - nD tensor with shape: ``(batch_size, ..., hidden_size[-1])``. For instance, for a 2D input with shape ``(batch_size, input_dim)``, the output would have shape ``(batch_size, hidden_size[-1])``.

      Arguments
        - **hidden_units**:list of positive integer, the layer number and units in each layer.

        - **activation**: Activation function to use.

        - **l2_reg**: float between 0 and 1. L2 regularizer strength applied to the kernel weights matrix.

        - **dropout_rate**: float in [0,1). Fraction of the units to dropout.

        - **use_bn**: bool. Whether use BatchNormalization before activation or not.

        - **seed**: A Python integer to use as random seed.
    """

    def __init__(self, hidden_units, activation='relu', l2_reg=0, dropout_rate=0, use_bn=False, seed=1024, **kwargs):
        self.hidden_units = hidden_units
        self.activation = activation
        self.dropout_rate = dropout_rate
        self.seed = seed
        self.l2_reg = l2_reg
        self.use_bn = use_bn
        super(DNN, self).__init__(**kwargs)

    def build(self, input_shape):
        # if len(self.hidden_units) == 0:
        #     raise ValueError("hidden_units is empty")
        input_size = input_shape[-1]
        hidden_units = [int(input_size)] + list(self.hidden_units)
        self.kernels = [self.add_weight(name='kernel' + str(i),
                                        shape=(
                                            hidden_units[i], hidden_units[i + 1]),
                                        initializer=glorot_normal(
                                            seed=self.seed),
                                        regularizer=l2(self.l2_reg),
                                        trainable=True) for i in range(len(self.hidden_units))]
        self.bias = [self.add_weight(name='bias' + str(i),
                                     shape=(self.hidden_units[i],),
                                     initializer=Zeros(),
                                     trainable=True) for i in range(len(self.hidden_units))]
        if self.use_bn:
            self.bn_layers = [tf.keras.layers.BatchNormalization() for _ in range(len(self.hidden_units))]

        self.dropout_layers = [tf.keras.layers.Dropout(self.dropout_rate, seed=self.seed + i) for i in
                               range(len(self.hidden_units))]

        self.activation_layers = [Activation(self.activation) for _ in range(len(self.hidden_units))]

        super(DNN, self).build(input_shape)  # Be sure to call this somewhere!

    def call(self, inputs, training=None, **kwargs):

        deep_input = inputs

        for i in range(len(self.hidden_units)):
            fc = tf.nn.bias_add(tf.tensordot(
                deep_input, self.kernels[i], axes=(-1, 0)), self.bias[i])
            # fc = Dense(self.hidden_size[i], activation=None, \
            #           kernel_initializer=glorot_normal(seed=self.seed), \
            #           kernel_regularizer=l2(self.l2_reg))(deep_input)
            if self.use_bn:
                fc = self.bn_layers[i](fc, training=training)

            fc = self.activation_layers[i](fc)

            fc = self.dropout_layers[i](fc, training=training)
            deep_input = fc

        return deep_input

    def compute_output_shape(self, input_shape):
        if len(self.hidden_units) > 0:
            shape = input_shape[:-1] + (self.hidden_units[-1],)
        else:
            shape = input_shape

        return tuple(shape)

    def get_config(self, ):
        config = {'activation': self.activation, 'hidden_units': self.hidden_units,
                  'l2_reg': self.l2_reg, 'use_bn': self.use_bn, 'dropout_rate': self.dropout_rate, 'seed': self.seed}
        base_config = super(DNN, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
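
A quick standalone sanity check of the layer (a toy example of mine, not from the post):

# toy check: a 2-layer DNN maps (batch, 16) -> (batch, 32)
x = tf.random.normal([4, 16])
y = DNN(hidden_units=(64, 32))(x)
print(y.shape)   # (4, 32)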
With the DNN layer defined, the deep side is assembled as follows (these lines reappear in the complete code in section 5):

    # embed the sparse features (linear_ and dnn_feature_columns are the same list in this example)
    sparse_emb = get_sparse_emb_list(features_inputs, linear_feature_columns, l2_reg_embedding, prefix='sparse_')

    # DNN
    dense_input = get_dense_input(features_inputs, dnn_feature_columns)
    dense_input = Concatenate(axis=1)(dense_input)
    dnn_input = tf.concat([Flatten()(Concatenate(axis=1)(sparse_emb)), dense_input], axis=1)
    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(dnn_input)
    # no activation here: the wide and deep logits are summed first, then a single sigmoid is applied
    # (the original used activation='sigmoid', which would squash the deep logit twice)
    dnn_logit = Dense(1, activation=None, name='dnn_logit',
                      kernel_regularizer=tf.keras.regularizers.l2(l2_reg_dnn),
                      use_bias=False)(dnn_output)

4. wide + deep

    # sum the two logits, then apply the sigmoid once
    out_put = Add()([linear_logit, dnn_logit])
    out_put = Activation('sigmoid')(out_put)
    model = tf.keras.models.Model(inputs=inputs_list, outputs=out_put)

At this point the model is fully built.

5. Complete model-building code

def Wide_Deep(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(128,128,64,32),
           l2_reg_linear=0.00001, l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=1024, dnn_dropout=0,
           dnn_activation='relu', dnn_use_bn=True, task='binary'):
    """Instantiates the wide&deep Network architecture.

    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
    :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
    :param l2_reg_linear: float. L2 regularizer strength applied to linear part
    :param l2_reg_embedding: float. L2 regularizer strength applied to embedding vector
    :param l2_reg_dnn: float. L2 regularizer strength applied to DNN
    :param seed: integer ,to use as random seed.
    :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.
    :param dnn_activation: Activation function to use in DNN
    :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN
    :param task: str, ``"binary"`` for binary logloss or ``"regression"`` for regression loss (kept for API compatibility; this implementation always applies a sigmoid)
    :return: A Keras model instance.
    """

    features_inputs = build_input_features(linear_feature_columns + dnn_feature_columns)

    inputs_list = list(features_inputs.values())

    # LR
    linear_logit = get_linear_logit(features_inputs, linear_feature_columns, seed=seed, prefix='linear',
                                    l2_reg=l2_reg_linear)

    sparse_emb = get_sparse_emb_list(features_inputs,linear_feature_columns,l2_reg_embedding,prefix='sparse_')


    # DNN
    dense_input = get_dense_input(features_inputs, dnn_feature_columns)
    dense_input = Concatenate(axis=1)(dense_input)
    dnn_input = tf.concat([Flatten()(Concatenate(axis=1)(sparse_emb)),dense_input],axis=1)
    dnn_output = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                     dnn_use_bn, seed)(dnn_input)
    # no activation here: the two logits are summed before a single sigmoid
    dnn_logit = Dense(1, activation=None, name='dnn_logit',
                      kernel_regularizer=tf.keras.regularizers.l2(l2_reg_dnn),
                      use_bias=False)(dnn_output)


    # sum the wide and deep logits, then apply the sigmoid once
    out_put = Add()([linear_logit, dnn_logit])
    out_put = Activation('sigmoid')(out_put)
    model = tf.keras.models.Model(inputs=inputs_list, outputs=out_put)
    return model
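
A minimal smoke test of the builder, with made-up toy feature columns (my own example):

toy_cols = [SparseFeat('C1', vocabulary_size=10), SparseFeat('C2', vocabulary_size=20),
            DenseFeat('I1', 1), DenseFeat('I2', 1)]
model = Wide_Deep(toy_cols, toy_cols, dnn_hidden_units=(16, 8))
model.summary()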

Running the model on the data

import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from models import Wide_Deep
from feature_columns import SparseFeat, DenseFeat
from utils import get_feature_names

if __name__ == "__main__":
    data = pd.read_csv('../data/criteo_sample_data.csv',nrows=1000)

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    target = ['label']

    # 1. Label-encode sparse features and apply a simple transformation to dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. count the number of unique values of each sparse field, and record dense feature field names
    # (embeddings use SparseFeat's default RandomNormal initializer; the original passed a
    # zero-valued identity initializer here, which misuses the API and zeroes the embeddings)
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=8)
                              for feat in sparse_features] + [DenseFeat(feat, 1)
                              for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    print("feature_names",feature_names)
    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2,random_state=2018)
    train_model_input = {name:train[name] for name in feature_names}
    test_model_input = {name:test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    model = Wide_Deep(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=128, epochs=10, verbose=2,
                        validation_data=(test_model_input, test[target].values))
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))