A TensorFlow implementation of ESMM
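
ESMM (Entire Space Multi-Task Model, SIGIR '18) estimates the post-click conversion rate over the entire impression space by factoring the click-and-convert probability into two towers that share the same embedded input:

    pCTCVR(x) = pCTR(x) * pCVR(x)
    loss = L_ctr(click labels, pCTR) + L_ctcvr(conversion labels, pCTCVR)

Only pCTR and pCTCVR receive direct supervision, so the CVR tower is learned implicitly and avoids the sample-selection bias of training a CVR model on clicked impressions only. The three files below (input pipeline, model function, training script) implement this structure with tf.estimator.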

input_fn.py

#-*- coding: UTF-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf


# "is_conversion" is required by esmm_model_fn as the conversion (CVR) label
FixedLenFeatureColumns = ["label", "is_conversion", "user_id", "creative_id", "has_target", "terminal",
        "hour", "weekday", "template_category",
        "day_user_show", "day_user_click", "city_code", "network_type"]
StringVarLenFeatureColumns = ["keyword"]  # variable-length features
FloatFixedLenFeatureColumns = ['creative_history_ctr']
StringFixedLenFeatureColumns = ["keyword_attention"]
StringFeatureColumns = ["device_type", "device_model", "manufacturer"]

DayShowSegs = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 41, 42, 44, 46, 47, 49, 51, 54, 56, 59, 61, 65, 68, 72, 76, 81, 86, 92, 100, 109, 120, 134, 153, 184, 243, 1195]
DayClickSegs = [1, 2, 3, 6, 23]


def build_model_columns():
    """Builds a set of wide and deep feature columns."""
    # Continuous variable columns
    # hours_per_week = tf.feature_column.numeric_column('hours_per_week')

    creative_id = tf.feature_column.categorical_column_with_hash_bucket(
        'creative_id', hash_bucket_size=200000, dtype=tf.int64)
    # To show an example of hashing:
    has_target = tf.feature_column.categorical_column_with_identity(
        'has_target', num_buckets=3)
    terminal = tf.feature_column.categorical_column_with_identity(
        'terminal', num_buckets=10)
    hour = tf.feature_column.categorical_column_with_identity(
        'hour', num_buckets=25)
    weekday = tf.feature_column.categorical_column_with_identity(
        'weekday', num_buckets=10)
    day_user_show = tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column('day_user_show', dtype=tf.int32), boundaries=DayShowSegs)
    day_user_click = tf.feature_column.bucketized_column(
        tf.feature_column.numeric_column('day_user_click', dtype=tf.int32), boundaries=DayClickSegs)

    city_code = tf.feature_column.categorical_column_with_hash_bucket(
        'city_code', hash_bucket_size=2000, dtype=tf.int64)

    network_type = tf.feature_column.categorical_column_with_identity(
        'network_type', num_buckets=20, default_value=19)

    device_type = tf.feature_column.categorical_column_with_hash_bucket(   # e.g. "androidphone"
        'device_type', hash_bucket_size=500000, dtype=tf.string
    )
    device_model = tf.feature_column.categorical_column_with_hash_bucket(  # device model, e.g. "iPhone10", "vivo X9"
        'device_model', hash_bucket_size=200000, dtype=tf.string
    )
    manufacturer = tf.feature_column.categorical_column_with_hash_bucket(  # handset brand, e.g. "vivo", "iphone"
        'manufacturer', hash_bucket_size=50000, dtype=tf.string
    )


    deep_columns = [
        tf.feature_column.embedding_column(creative_id, dimension=15,combiner='sum'),
        tf.feature_column.embedding_column(has_target, dimension=15,combiner='sum'),
        tf.feature_column.embedding_column(terminal, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(hour, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(weekday, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(day_user_show, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(day_user_click, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(city_code, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(network_type, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(device_type, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(device_model, dimension=15, combiner='sum'),
        tf.feature_column.embedding_column(manufacturer, dimension=15, combiner='sum'),

    ]
    # base_columns = [user_id, ad_id, creative_id,  product_id, brush_num, terminal,terminal_brand]
    '''
    crossed_columns = [tf.feature_column.crossed_column(
                            ['userId', 'adId'], hash_bucket_size = 50000000),
                      ...
                      ]
    '''
    return deep_columns
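
# A sketch (not in the original post) of how the crossed_columns placeholder above
# could feed the deep model: wrap the cross in an embedding column and append it to
# deep_columns inside build_model_columns(). The crossed pair chosen here
# ('manufacturer' x 'device_model') is only an illustrative assumption.
#
#     manufacturer_x_model = tf.feature_column.crossed_column(
#         ['manufacturer', 'device_model'], hash_bucket_size=100000)
#     deep_columns.append(
#         tf.feature_column.embedding_column(manufacturer_x_model, dimension=15, combiner='sum'))
#
# Note: crossed_column accepts raw string feature keys or categorical columns,
# but not hash-bucket categorical columns such as creative_id above.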

def feature_input_fn(data_file, num_epochs, shuffle, batch_size, labels=True):
  """Generate an input function for the Estimator."""

  def parse_tfrecord(value):
    tf.logging.info('Parsing {}'.format(data_file[:10]))
    FixedLenFeatures = {
        key: tf.FixedLenFeature(shape=[1], dtype=tf.int64) for key in FixedLenFeatureColumns
    }

    features={}
    features.update(FixedLenFeatures)
    fea = tf.parse_example(value, features)
    feature = {
        key: fea[key] for key in features
    }
    classes = tf.to_float(feature['label'])
    return feature, classes

  # Extract lines from input files using the Dataset API.
  filenames = tf.data.Dataset.list_files(data_file)
  dataset = filenames.apply(tf.contrib.data.parallel_interleave(
      lambda filename: tf.data.TFRecordDataset(filename),
      cycle_length=32))

  if shuffle:
    dataset = dataset.shuffle(buffer_size=batch_size*64)

  dataset = dataset.repeat(num_epochs).batch(batch_size).prefetch(buffer_size=batch_size*8)
  dataset = dataset.map(parse_tfrecord, num_parallel_calls=32)

  return dataset
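
Note that only FixedLenFeatureColumns is put into the parse spec above; the string and variable-length column lists declared at the top are never parsed, even though device_type, device_model and manufacturer are used by the deep columns. A minimal sketch of how the remaining groups could be added inside parse_tfrecord, assuming the string columns are stored as single bytes features and keyword as a variable-length bytes list (storage shapes are an assumption):

    StringVarLenFeatures = {
        key: tf.VarLenFeature(dtype=tf.string) for key in StringVarLenFeatureColumns
    }
    FloatFixedLenFeatures = {
        key: tf.FixedLenFeature(shape=[1], dtype=tf.float32) for key in FloatFixedLenFeatureColumns
    }
    StringFixedLenFeatures = {
        key: tf.FixedLenFeature(shape=[1], dtype=tf.string)
        for key in StringFixedLenFeatureColumns + StringFeatureColumns
    }
    features.update(StringVarLenFeatures)
    features.update(FloatFixedLenFeatures)
    features.update(StringFixedLenFeatures)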

esmm.py

# -*- coding: UTF-8 -*-
import tensorflow as tf
from tensorflow.python.estimator.canned import head as head_lib
from tensorflow.python.ops.losses import losses

def build_deep_layers(net, params):
    # Build the hidden layers, sized according to the 'hidden_units' param.

    for num_hidden_units in params['hidden_units']:
        net = tf.layers.dense(net, units=num_hidden_units, activation=tf.nn.relu,
                              kernel_initializer=tf.glorot_uniform_initializer())
    return net

def esmm_model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])
    last_ctr_layer = build_deep_layers(net, params)
    last_cvr_layer = build_deep_layers(net, params)

    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes=2, weight_column=None, label_vocabulary=None, loss_reduction=losses.Reduction.SUM)
    ctr_logits = tf.layers.dense(last_ctr_layer, units=head.logits_dimension,
                                 kernel_initializer=tf.glorot_uniform_initializer())
    cvr_logits = tf.layers.dense(last_cvr_layer, units=head.logits_dimension,
                                 kernel_initializer=tf.glorot_uniform_initializer())
    ctr_preds = tf.sigmoid(ctr_logits)
    cvr_preds = tf.sigmoid(cvr_logits)
    ctcvr_preds = tf.multiply(ctr_preds, cvr_preds)

    optimizer = tf.train.AdagradOptimizer(learning_rate=params['learning_rate'])
    # feature_input_fn returns only the click label as `labels`, so take both task
    # labels from `features` and cast them to float for the losses below
    ctr_label = tf.to_float(features['label'])
    cvr_label = tf.to_float(features['is_conversion'])

    user_id = features['user_id']
    click_label = features['label']
    conversion_label = features['is_conversion']

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'ctr_preds': ctr_preds,
            'cvr_preds': cvr_preds,
            'ctcvr_preds': ctcvr_preds,
            'user_id': user_id,
            'click_label': click_label,
            'conversion_label': conversion_label
        }
        export_outputs = {
            'regression': tf.estimator.export.RegressionOutput(predictions['cvr_preds'])  # needed for online serving
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    else:
        ctr_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=ctr_label, logits=ctr_logits))
        ctcvr_loss = tf.reduce_sum(tf.losses.log_loss(labels=cvr_label, predictions=ctcvr_preds))
        # A weighting coefficient could be added here, as in common multi-task loss weighting schemes;
        # note that ctr_loss is a sum over the batch while tf.losses.log_loss defaults to a mean.
        loss = ctr_loss + ctcvr_loss

        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
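
The comment in the training branch mentions weighting the two task losses. A minimal sketch of that idea, plus an EVAL branch with AUC metrics for both supervised tasks (the `ctcvr_loss_weight` param name and the metric keys are assumptions, not part of the original):

    # inside the else-branch of esmm_model_fn, replacing `loss = ctr_loss + ctcvr_loss`
    ctcvr_weight = params.get('ctcvr_loss_weight', 1.0)  # hypothetical weighting parameter
    loss = ctr_loss + ctcvr_weight * ctcvr_loss

    if mode == tf.estimator.ModeKeys.EVAL:
        eval_metric_ops = {
            'ctr_auc': tf.metrics.auc(labels=ctr_label, predictions=ctr_preds),
            'ctcvr_auc': tf.metrics.auc(labels=cvr_label, predictions=ctcvr_preds),
        }
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=eval_metric_ops)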

train.py

#-*- coding: UTF-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
from esmm import *
from input_fn import *
from sklearn.metrics import roc_auc_score
from metric import cal_group_auc, cross_entropy_loss
import numpy as np
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # '-1' hides all GPUs (run on CPU); set to a GPU index such as '0' to pick a device

'''
A tensorflow implementation of ESMM
Entire Space Multi-Task Model: An Effective Approach for Estimating Post-Click Conversion Rate(SIGIR 18)
@author: Qiao
'''

flags = tf.app.flags
flags.DEFINE_string("model_dir", "./models", "Base directory for the model")
# replace the sample paths below with your own
flags.DEFINE_string("train_data_dir", "/trainSamples/{20181226,20181227,20181228,20181229,20181230,20181231,20190101}/v1_tfrecord/", "dir for training data")
flags.DEFINE_string("eval_data_dir", "/testSamples/20190102/v1_tfrecord/", "dir for evaluation data")
flags.DEFINE_integer("batch_size", 512, "Training batch size")
flags.DEFINE_integer(name="num_epochs", short_name="ne", default=2, help="Training num epochs")
flags.DEFINE_float("learning_rate", 0.03, "Learning rate")
flags.DEFINE_string("hidden_units", "512,256,128", "number of units in each hidden layer for NN")
flags.DEFINE_integer("num_cross_layers", 4, "Number of cross layers")
flags.DEFINE_integer("save_checkpoints_steps", 20000, "Save checkpoints every steps")
flags.DEFINE_string("export_dir", "./exportmodels", "Path for exportmodels")
flags.DEFINE_boolean(name="evaluate_only", short_name="eo", default=False, help="evaluate only flag")
flags.DEFINE_boolean(name="use_cross", default=True, help="whether use cross layer")
flags.DEFINE_integer("predict_steps", 6000, "predict_steps*batch_size samples to evaluate")
FLAGS = flags.FLAGS

def export_model(model, export_dir, model_column_fn):
  """Export to SavedModel format.
  Args:
    model: Estimator object
    export_dir: directory to export the model.
    model_column_fn: List of model feature columns.
  """
  columns = list(model_column_fn)  # copy so the Estimator's own feature_columns list is not mutated
  # These names must match the keys that esmm_model_fn reads from `features`.
  columns.append(tf.feature_column.numeric_column("user_id", default_value=123456, dtype=tf.int64))
  columns.append(tf.feature_column.numeric_column("label", default_value=0, dtype=tf.int64))
  columns.append(tf.feature_column.numeric_column("is_conversion", default_value=0, dtype=tf.int64))
  feature_spec = tf.feature_column.make_parse_example_spec(columns)
  example_input_fn = (
      tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec))
  model.export_savedmodel(export_dir, example_input_fn)

def list_hdfs_dir(path):
    files = []
    sample_dir = "hdfs://yourhdfs" + path + "part*"
    sample_dir_script = "hadoop fs -ls " + sample_dir + " | awk  -F ' '  '{print $8}'"
    for dir_path in os.popen(sample_dir_script).readlines():
        dir_path = dir_path.strip()
        files.append(dir_path)
    return files

def model_predict(model, eval_input_fn, epoch):
    """Display evaluate result."""
    prediction_result = model.predict(eval_input_fn)

    ctr_predictions = []
    cvr_predictions = []
    ctcvr_predictions = []
    user_id_list = []
    click_labels = []
    conversion_labels = []
    num_samples = FLAGS.batch_size * FLAGS.predict_steps
    print(num_samples)
    for pred_dict in prediction_result:
        # print(pred_dict)
        user_id = pred_dict['user_id'][0]
        ctr_preds = pred_dict['ctr_preds'][0]
        cvr_preds = pred_dict['cvr_preds'][0]
        ctcvr_preds = pred_dict['ctcvr_preds'][0]
        click_label = float(pred_dict['click_label'][0])
        conversion_label = float(pred_dict['conversion_label'][0])
        ctr_predictions.append(ctr_preds)
        cvr_predictions.append(cvr_preds)
        ctcvr_predictions.append(ctcvr_preds)
        user_id_list.append(user_id)
        click_labels.append(click_label)
        conversion_labels.append(conversion_label)

        if len(ctr_predictions) % (num_samples / 10) == 0:
            tf.logging.info('predict at step %d/%d', int(float(len(ctr_predictions)) / num_samples * FLAGS.predict_steps),
                            FLAGS.predict_steps)
        if len(ctr_predictions) >= num_samples:
            break

    #num_samples = len(predictions)
    # Display evaluation metrics
    # Filter down to the clicked samples (click_label & !conversion_label are negatives,
    # click_label & conversion_label are positives) and compute the CVR AUC, group AUC,
    # coefficient of variation and any other metrics you care about; the commented block
    # below and the sketch after this function show one way to do it.

    """
    label_mean = sum(labels) / num_samples
    prediction_mean = sum(predictions) / num_samples
    loss = sum(cross_entropy_loss(labels, predictions)) / num_samples * FLAGS.batch_size
    auc = roc_auc_score(labels, predictions)
    group_auc = cal_group_auc(labels, predictions, user_id_list)
    predict_diff = np.array(predictions) - prediction_mean
    predict_diff_square_sum = sum(np.square(predict_diff))
    s_deviation = np.sqrt(predict_diff_square_sum / num_samples)
    c_deviation = s_deviation / prediction_mean
    
    true_positive_samples = (np.array(predictions) * np.array(labels) >= 0.5).tolist().count(True)
    false_positive_samples = (np.array(predictions) * (1 - np.array(labels)) >= 0.5).tolist().count(True)
    print(true_positive_samples)
    print(false_positive_samples)
    # precision = float(true_positive_samples)/(true_positive_samples+false_positive_samples)
    precision = 0
    false_negative_samples = (np.array(predictions) * np.array(labels) < 0.5).tolist().count(True)
    recall = float(true_positive_samples) / (true_positive_samples + false_negative_samples)
    print(false_negative_samples)
    
    tf.logging.info('Results at epoch %d/%d', (epoch + 1), FLAGS.num_epochs)
    tf.logging.info('-' * 60)
    tf.logging.info('label/mean: %s' % label_mean)
    tf.logging.info('predictions/mean: %s' % prediction_mean)
    tf.logging.info('total loss average batchsize: %s' % loss)
    tf.logging.info('standard deviation: %s' % s_deviation)
    tf.logging.info('coefficient of variation: %s' % c_deviation)
    #tf.logging.info('precision: %s' % precision)
    #tf.logging.info('recall: %s' % recall)
    tf.logging.info('auc: %s' % auc)
    tf.logging.info('group auc: %s' % group_auc)
    """

def main(unused_argv):
  train_files = []
  eval_files = []
  if isinstance(FLAGS.train_data_dir, str):
      train_files = list_hdfs_dir(FLAGS.train_data_dir)

  if isinstance(FLAGS.eval_data_dir, str):
      eval_files = list_hdfs_dir(FLAGS.eval_data_dir)

  random.shuffle(train_files)
  feature_columns = build_model_columns()

  session_config = tf.ConfigProto(device_count={'GPU': 1, 'CPU': 10},
                                  inter_op_parallelism_threads=10,
                                  intra_op_parallelism_threads=10
                                  # log_device_placement=True
                                  )
  session_config.gpu_options.per_process_gpu_memory_fraction = 0.32
  run_config = tf.estimator.RunConfig().replace(
      model_dir=FLAGS.model_dir,session_config=session_config, log_step_count_steps=1000, save_summary_steps=20000, save_checkpoints_secs=1000)

  model = tf.estimator.Estimator(
    model_fn=esmm_model_fn,
    params={
      'feature_columns': feature_columns,
      'hidden_units': [int(n) for n in FLAGS.hidden_units.split(',')],  # dense layer units must be ints
      'learning_rate': FLAGS.learning_rate,
      'num_cross_layers': FLAGS.num_cross_layers,
      'use_cross': FLAGS.use_cross
    },
    config = run_config
  )
  train_input_fn = lambda: feature_input_fn(train_files, 1, True, FLAGS.batch_size)
  eval_input_fn = lambda: feature_input_fn(eval_files, 1, False, FLAGS.batch_size)  # no shuffling for evaluation
  
  #model_predict(model,eval_input_fn,0)
  for epoch in range(FLAGS.num_epochs):
      if not FLAGS.evaluate_only:
          model.train(train_input_fn, steps=6000)
      print("*" * 100)
      model_predict(model,eval_input_fn,epoch)



  # Export the model
  if FLAGS.export_dir is not None:
      export_model(model, FLAGS.export_dir, feature_columns)


if __name__ == "__main__":
  tf.logging.set_verbosity(tf.logging.INFO)
  tf.app.run(main=main)
