# 理论 (Theory)
# 应用 (Application)
#encoding=utf8
import tensorflow as tf
import pandas as pd
import argparse
import numpy as np
import gzip
import os
import sys
def parse_arg(argv=None):
    """Parse the command-line options of the FTRL CTR training script.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``train_data_dir`` (str, required),
        ``batch_size`` (int, default 10) and ``feature_num`` (int, default 14).
    """
    # The first positional parameter of ArgumentParser is ``prog``; passing
    # the sentence there replaces the program name in the usage line. It is a
    # description, so pass it by keyword.
    parser = argparse.ArgumentParser(
        description="Training for FTRL Ctr model.")
    parser.add_argument(
        "--train_data_dir",
        type=str,
        required=True,
        help="The path of training data.")
    parser.add_argument(
        "--batch_size",
        type=int,
        default=10,
        help="The number of batch size.")
    parser.add_argument(
        "--feature_num",
        type=int,
        default=14,
        help="The number of features.")
    return parser.parse_args(argv)
def smart_open(file_name):
    """Open a data file for reading, going through gzip for ``.gz`` files.

    Args:
        file_name: Path of the file to open (str, bytes or os.PathLike).

    Returns:
        A readable file object, or None when the file cannot be opened (the
        error is reported on stderr). Files whose name ends in ``.gz`` are
        opened with ``gzip.open`` in mode 'r', which is binary and yields
        ``bytes`` lines; every other file is opened with the built-in
        ``open`` in text mode and yields ``str`` lines.
    """
    try:
        # fsdecode normalises str / bytes / PathLike to str so the suffix
        # test does not depend on the argument supporting slicing.
        if os.fsdecode(file_name).endswith(".gz"):
            return gzip.open(file_name, 'r')
        return open(file_name, 'r')
    except (OSError, TypeError, ValueError) as e:
        # Best effort: report the failure and let the caller decide.
        print("smart open file_name: {0} error: {1}".format(file_name,e), file=sys.stderr)
        return None
def one_hot_feature(feature_list):
    """One-hot encode the columns of a table of categorical values.

    Every column gets its own vocabulary; a value's index inside its column
    is the order in which it was first seen while scanning the rows top to
    bottom. The encoded row is the concatenation of one one-hot segment per
    column, so its width is the sum of the per-column vocabulary sizes.

    Args:
        feature_list: Sequence of rows. The number of columns is taken from
            the first row; each row must have at least that many hashable
            entries.

    Returns:
        numpy.ndarray of floats with shape (len(feature_list), total width).
        An empty ``feature_list`` yields an array of shape (0, 0).
    """
    m = len(feature_list)
    if m == 0:
        return np.zeros((0, 0))
    n = len(feature_list[0])

    # One independent dict per column. ``[{}] * n`` would repeat a single
    # dict object n times, making every column share one vocabulary.
    sign_dict_list = [{} for _ in range(n)]
    for row in feature_list:
        for j in range(n):
            vocab = sign_dict_list[j]
            # First sighting of a value takes the next free index.
            vocab.setdefault(row[j], len(vocab))

    # Start position of each column's segment inside the encoded row.
    offsets = []
    total = 0
    for vocab in sign_dict_list:
        offsets.append(total)
        total += len(vocab)

    ins = np.zeros((m, total))
    for i, row in enumerate(feature_list):
        for j in range(n):
            ins[i, offsets[j] + sign_dict_list[j][row[j]]] = 1.0
    return ins
def read_data(file_dir, feature_num):
    """Load labels and feature signs from every regular file in a directory.

    Only the first tab-separated field of each line is used. That field is
    split on single spaces: token 0 is ignored, token 1 is parsed as the
    float label, and from each remaining token only the part before the
    first ':' is kept as a feature sign.

    Lines that are blank or carry fewer than ``feature_num`` signs are
    skipped entirely, so ``labels`` and ``features`` always have the same
    length. Files that ``smart_open`` fails to open are skipped.

    Args:
        file_dir: Directory whose direct children are the data files
            (sub-directories are ignored).
        feature_num: Number of leading feature signs to keep per sample.

    Returns:
        (labels, features): ``labels`` is a list of one-element lists
        ``[label]``; ``features`` is a list of lists with exactly
        ``feature_num`` sign strings each.
    """
    labels = []
    features = []
    # Sorted so the sample order does not depend on the directory listing.
    for name in sorted(os.listdir(file_dir)):
        path = os.path.join(file_dir, name)
        if os.path.isdir(path):
            continue
        f = smart_open(path)
        if f is None:
            # smart_open has already reported the failure.
            continue
        with f:
            for line in f:
                # gzip handles yield bytes, plain handles yield str. Decide
                # from the line itself rather than from the path text.
                if isinstance(line, bytes):
                    line = line.decode()
                line = line.strip()
                if not line:
                    continue
                data = line.split('\t')
                label_feature = data[0].split(' ')
                label = float(label_feature[1])
                feature = [item.split(':')[0] for item in label_feature[2:]]
                # Reject short rows before touching either list, otherwise
                # a label would be stored without its feature row.
                if len(feature) < feature_num:
                    continue
                labels.append([label])
                features.append(feature[:feature_num])
    return labels, features
def model(feature, label):
    """Train a logistic-regression model with the FTRL optimizer and log AUC.

    Builds a TensorFlow 1.x graph (placeholders, one weight matrix and a
    bias, sigmoid output), then runs 10 passes over the full data set in one
    batch each, printing the loss and the streaming AUC after every pass.

    Args:
        feature: Array whose ``shape[1]`` gives the input width n; it is fed
            to a float32 placeholder of shape [None, n].
        label: Values fed to a float32 placeholder of shape [None, 1].

    Returns:
        None. Results are only printed.
    """
    n = feature.shape[1]
    x = tf.placeholder(dtype=tf.float32, name='x', shape=[None, n])
    y = tf.placeholder(dtype=tf.float32, name='y', shape=[None, 1])
    W = tf.Variable(tf.truncated_normal([n, 1], stddev=0.1))
    bias = tf.Variable(tf.zeros([1]))

    logits = tf.matmul(x, W) + bias
    predict_y = tf.sigmoid(logits)

    # Cross-entropy computed from the logits: mathematically the same as
    # -y*log(p) - (1-y)*log(1-p), but without taking log() of a saturated
    # sigmoid, which produces inf/NaN.
    # NOTE(review): the previous expression reduced to the *sum* over the
    # batch (reduce_mean over a 1x1 matmul result); reduce_sum keeps that
    # scale so the optimizer's step size is unchanged.
    loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
    opt = tf.train.FtrlOptimizer(
        0.03,
        l1_regularization_strength=0.01,
        l2_regularization_strength=0.01).minimize(loss)

    auc_value, auc_op = tf.metrics.auc(labels=y, predictions=predict_y)

    # tf.metrics.auc keeps its accumulators in local variables, hence both
    # initializers.
    init = tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer())
    with tf.Session() as sess:
        sess.run(init)
        for i in range(10):
            # auc_op must run to update the metric; its value is not needed.
            _, _loss, _ = sess.run([opt, loss, auc_op],
                                   feed_dict={x: feature, y: label})
            print("pass: {0} loss = {1} auc = {2}".format(i, _loss, sess.run(auc_value)))
if __name__ == "__main__":
    # Script entry point: parse options, load the samples, one-hot encode
    # the feature signs and train the model.
    args = parse_arg()
    labels, features = read_data(
        file_dir=args.train_data_dir,
        feature_num=args.feature_num,
    )
    features = one_hot_feature(features)
    model(features, labels)
# TODO: 实现大规模离散LR的参数稀疏存储及更新
# (Implement sparse parameter storage and updates for large-scale discrete-feature LR.)