"""
Created on
train MM model
@author:
"""
import json
import os
import pandas as pd
import tensorflow as tf
from time import time
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam
import multiprocessing
from model import DNN
from evaluate import *
from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
from utils import *
from fromTFrecords import *
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
if __name__ == '__main__':
    # =============================== GPU ==============================
    # gpu = tf.config.experimental.list_physical_devices(device_type='GPU')
    # print(gpu)
    # TF_CONFIG is set here for reference, but the SimpleClusterResolver below
    # is what this script actually consumes; in a real deployment each machine
    # would export its own TF_CONFIG with its own "task" entry.
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": {
            "worker": ["10.10.63.181:2345", "10.10.63.182:2345"],
            "ps": ["10.10.63.183:2345"]
        },
        "task": {"type": "ps", "index": 0}
    })
    cluster_spec = tf.train.ClusterSpec({
        "ps": ["10.10.63.183:2345"],
        "worker": ["10.10.63.181:2345", "10.10.63.182:2345"],
        "chief": ["10.10.63.181:2346"]
    })
    # This process runs the coordinator below, so it resolves as the chief
    # rather than as a ps task.
    cluster_resolver = SimpleClusterResolver(cluster_spec, task_type="chief", task_id=0)
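    # In a real multi-machine launch each process would usually derive its role
    # from TF_CONFIG instead of hard-coding it; a minimal sketch using the
    # standard resolver:
    # cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver()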
    # Workers need some inter-op threads to work properly: with two workers in
    # the cluster, make sure at least (2 + 1) inter-op threads are available.
    worker_config = tf.compat.v1.ConfigProto()
    if multiprocessing.cpu_count() < 2 + 1:
        worker_config.inter_op_parallelism_threads = 2 + 1
    if cluster_resolver.task_type == "worker":
        # Start a TensorFlow server and wait.
        server = tf.distribute.Server(
            cluster_spec,
            job_name="worker",
            task_index=cluster_resolver.task_id,
            config=worker_config,
            protocol="grpc")
        server.join()  # workers block here and never reach the coordinator code
    elif cluster_resolver.task_type == "ps":
        # Start a parameter server and wait.
        server = tf.distribute.Server(
            cluster_spec,
            job_name="ps",
            task_index=cluster_resolver.task_id,
            protocol="grpc")
        server.join()
    else:
        # The chief starts an in-process server and then falls through to run
        # the coordinator below. (A dedicated "evaluator" task would run
        # side-car evaluation instead.)
        tf.distribute.Server(
            cluster_spec,
            job_name="chief",
            task_index=cluster_resolver.task_id,
            protocol="grpc")
    # Run the coordinator.
    # ========================= Hyper Parameters =======================
    file = '../dataset/ml-1m/ratings.dat'
    # Windows local files
    # train_dir = "C:\\tmp\\tfrecord-dnn\\train"
    # val_dir = "C:\\tmp\\tfrecord-dnn\\val"
    # test_dir = "C:\\tmp\\tfrecord-dnn\\test"
    # Linux local files
    train_dir = "/bigdata/data/tfrecord-dnn/train"
    val_dir = "/bigdata/data/tfrecord-dnn/val"
    test_dir = "/bigdata/data/tfrecord-dnn/test"
    # test_dir2 = "C:\\tmp\\tfrecord-dnn\\ntest"
    trans_score = 1
    K = 10  # top-K cutoff for HR/NDCG evaluation
    learning_rate = 0.001
    epochs = 3
    batch_size = 512
    input_context = tf.distribute.InputContext(
        num_input_pipelines=1, input_pipeline_id=0, num_replicas_in_sync=1)
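    # InputContext tells a dataset_fn which slice of the input it should
    # produce; e.g. with two pipelines sharing a global batch of 512
    # (illustrative values, not this script's configuration):
    # ctx = tf.distribute.InputContext(num_input_pipelines=2,
    #                                  input_pipeline_id=0,
    #                                  num_replicas_in_sync=2)
    # ctx.get_per_replica_batch_size(512)  # -> 256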
    # ========================== Create dataset =======================
    train = get_trainORval_data(train_dir, batch_size)
    # val = get_trainORval_data(val_dir, batch_size)
    # test = get_test_data(test_dir, batch_size)
    # test2 = get_test_data(test_dir2, batch_size)
    def dataset_fn_train(input_context):
        global_batch_size = 512
        batch_size = input_context.get_per_replica_batch_size(global_batch_size)
        dataset = get_trainORval_data(train_dir, batch_size)
        dataset = dataset.shard(
            input_context.num_input_pipelines, input_context.input_pipeline_id)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)
        return dataset

    def dataset_fn_val(input_context):
        global_batch_size = 512
        batch_size = input_context.get_per_replica_batch_size(global_batch_size)
        dataset = get_trainORval_data(val_dir, batch_size)
        dataset = dataset.shard(
            input_context.num_input_pipelines, input_context.input_pipeline_id)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)
        return dataset

    def dataset_fn_test(input_context):
        global_batch_size = 512
        batch_size = input_context.get_per_replica_batch_size(global_batch_size)
        dataset = get_test_data(test_dir, batch_size)
        dataset = dataset.shard(
            input_context.num_input_pipelines, input_context.input_pipeline_id)
        dataset = dataset.batch(batch_size)
        dataset = dataset.prefetch(2)
        return dataset
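    # Note on ordering: tf.data guidance is to shard before batching so that
    # each input pipeline sees a disjoint set of examples. If get_trainORval_data
    # already returns a batched dataset, the extra .batch() above would nest
    # batches; this depends on fromTFrecords and is worth verifying.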
    input_train = tf.keras.utils.experimental.DatasetCreator(dataset_fn_train)
    input_val = tf.keras.utils.experimental.DatasetCreator(dataset_fn_val)
    input_test = tf.keras.utils.experimental.DatasetCreator(dataset_fn_test)
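    # Under ParameterServerStrategy, Model.fit takes a DatasetCreator: the
    # coordinator sends the dataset_fn to every worker, and each worker builds
    # its own pipeline from the InputContext it is handed. Since the dataset
    # size is unknown to the coordinator, fit() must be given steps_per_epoch.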
    # a_col, train, val, test = create_implicit_ml_1m_dataset(file, trans_score, embed_dim, maxlen)
    # train_X, train_y = train
    # val_X, val_y = val
    strategy = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver)
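    # Variables created under strategy.scope() are placed on the ps task(s).
    # For a large embedding table one could shard it across multiple ps tasks
    # with a variable_partitioner; a sketch (not enabled in this script):
    # strategy = tf.distribute.experimental.ParameterServerStrategy(
    #     cluster_resolver,
    #     variable_partitioner=tf.distribute.experimental.partitioners.MinSizePartitioner(
    #         min_shard_bytes=256 << 10, max_shards=1))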
    with strategy.scope():
        maxlen = 100
        embed_dim = 64
        hidden_unit = 128
        embed_reg = 1e-6
        activation = 'relu'
        item_num = 3953
        item_feat_col = sparseFeature('item_id', item_num, embed_dim)
        # ============================Build Model==========================
        model = DNN(item_feat_col, maxlen, hidden_unit, activation, embed_reg)
        model.summary()
        # =========================Compile============================
        model.compile(loss=binary_crossentropy, optimizer=Adam(learning_rate=learning_rate))
    # strategy.run(model)
    results = []
    working_dir = '/tmp/my_working_dir'
    log_dir = os.path.join(working_dir, 'log')
    ckpt_filepath = os.path.join(working_dir, 'ckpt')
    backup_dir = os.path.join(working_dir, 'backup')
    callbacks = [
        tf.keras.callbacks.TensorBoard(log_dir=log_dir),
        tf.keras.callbacks.ModelCheckpoint(filepath=ckpt_filepath),
        tf.keras.callbacks.experimental.BackupAndRestore(backup_dir=backup_dir),
    ]
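    # ModelCheckpoint plus BackupAndRestore provide basic fault tolerance:
    # training can resume from backup_dir after a task failure. Note that
    # working_dir points at /tmp here; in a genuine multi-machine run it would
    # need to be a path every task can reach (e.g. shared storage).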
    t1 = time()
    model.fit(input_train, epochs=epochs, steps_per_epoch=10, callbacks=callbacks)
    t2 = time()
    hit_rate, ndcg = evaluate_model(model, input_test, K)
    print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, NDCG = %.4f'
          % (epochs, t2 - t1, time() - t2, hit_rate, ndcg))
    results.append([epochs, t2 - t1, time() - t2, hit_rate, ndcg])
    # ============================Write============================
    pd.DataFrame(results, columns=['Iteration', 'fit_time', 'evaluate_time', 'hit_rate', 'ndcg']). \
        to_csv(
            '/bigdata/data/log/DNN_log_maxlen_{}_dim_{}_hidden_unit_{}.csv'.format(maxlen, embed_dim, hidden_unit),
            index=False)
    # for epoch in range(1, epochs + 1):
    #     # ===========================Fit==============================
    #     t1 = time()
    #     model.fit(
    #         input_train,
    #         # validation_data=input_val,
    #         epochs=3,
    #         steps_per_epoch=100,
    #         callbacks=callbacks,
    #     )
    #     # tf.saved_model.save(model, "saved/1")
    #     # ===========================Test==============================
    #     t2 = time()
    #     if epoch % 5 == 0:
    #         hit_rate, ndcg = evaluate_model(model, input_test, K)
    #         print('Iteration %d Fit [%.1f s], Evaluate [%.1f s]: HR = %.4f, NDCG = %.4f'
    #               % (epoch, t2 - t1, time() - t2, hit_rate, ndcg))
    #         results.append([epoch + 1, t2 - t1, time() - t2, hit_rate, ndcg])
    #         # ============================Write============================
    #         pd.DataFrame(results, columns=['Iteration', 'fit_time', 'evaluate_time', 'hit_rate', 'ndcg']). \
    #             to_csv(
    #                 '/bigdata/data/log/DNN_log_maxlen_{}_dim_{}_hidden_unit_{}.csv'.format(maxlen, embed_dim, hidden_unit),
    #                 index=False)