pretrain:
NSP task: optimized against the pooled [CLS] output.
Masked LM task: optimized against the per-token sequence embeddings.
fine-tune variants:
bert + dynamic_rnn (final states) + fc
fc -> [residual_layers, fc]
bert transformer + fc -> retrained so the output is projected to the desired embedding dimension.
bert + submodel architecture. Inference: bert provides the embedding, which is fed as input to the other sub-models.
train option A: stop_gradient on the bert output, then train the sub-models jointly on top of bert (see the sketch after this list).
train option B: precompute the embeddings for all labels with the base model, convert them into a dataset, and feed them directly as features at training time.
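A minimal sketch of the bert + submodel option with stop_gradient; `bert_model` is assumed to be a modeling.BertModel built elsewhere, and the layer sizes are illustrative, not values from these notes:

import tensorflow as tf

# Pooled [CLS] embedding from an already-built BERT model: [batch, hidden].
bert_embedding = bert_model.get_pooled_output()

# Freeze bert: gradients from the sub-model do not flow back into the transformer.
frozen_embedding = tf.stop_gradient(bert_embedding)

# Sub-model trained on top of the frozen embedding.
hidden = tf.layers.dense(frozen_embedding, 256, activation=tf.nn.relu)
logits = tf.layers.dense(hidden, num_labels)  # num_labels defined by the task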
arcface_loss can be used as the loss when the goal is to train embeddings.
cross-entropy core: -reduce_sum(one_hot_labels * log_softmax(logits)), as in the loss snippet below.
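A minimal ArcFace-style (additive angular margin) sketch for the embedding-training loss mentioned above; the margin m, scale s, and the tensor names are assumptions, not values from these notes:

import math
import tensorflow as tf

def arcface_loss(embeddings, labels, weights, num_labels, s=64.0, m=0.5):
    # embeddings: [batch, dim], weights: [dim, num_labels]; both L2-normalized below.
    embeddings = tf.nn.l2_normalize(embeddings, axis=1)
    weights = tf.nn.l2_normalize(weights, axis=0)
    cos_theta = tf.matmul(embeddings, weights)                       # [batch, num_labels]
    theta = tf.acos(tf.clip_by_value(cos_theta, -1.0 + 1e-7, 1.0 - 1e-7))
    target_logits = tf.cos(theta + m)                                # add margin m on the target class
    one_hot = tf.one_hot(labels, depth=num_labels)
    logits = s * (one_hot * target_logits + (1.0 - one_hot) * cos_theta)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    per_example_loss = -tf.reduce_sum(one_hot * log_probs, axis=-1)
    return tf.reduce_mean(per_example_loss)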
cosine_similarity can be computed inside the network graph itself (L2-normalize, then matmul) and output directly.
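A minimal sketch of in-graph cosine similarity; `query_embeddings` and `candidate_embeddings` are hypothetical tensor names:

import tensorflow as tf

# query_embeddings:     [batch, dim]
# candidate_embeddings: [num_candidates, dim]
q = tf.nn.l2_normalize(query_embeddings, axis=-1)
c = tf.nn.l2_normalize(candidate_embeddings, axis=-1)

# Cosine similarity of normalized vectors reduces to a matmul.
sim_scores = tf.matmul(q, c, transpose_b=True)   # [batch, num_candidates]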
multitask-classification: a shared fc, then one head per task; sum the per-task losses with tf.add_n(losses) and tag each prob/loss with its task key.
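A minimal multitask-head sketch along those lines; `shared_features`, `task_configs` and the per-task label tensors are illustrative names, not part of these notes:

import tensorflow as tf

losses, probs = {}, {}
shared = tf.layers.dense(shared_features, 256, activation=tf.nn.relu)

# task_configs: {task_key: (num_classes, labels_tensor)}
for key, (num_classes, labels) in task_configs.items():
    logits = tf.layers.dense(shared, num_classes, name="head_%s" % key)
    probs[key] = tf.nn.softmax(logits, axis=-1)        # per-task probabilities, keyed by task
    losses[key] = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

# Joint loss is the sum of the per-task losses.
total_loss = tf.add_n(list(losses.values()))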
Custom inputs can be defined, e.g. qa_type_ids: a DIY embedding that is looked up and added to the input_ids embeddings during training.
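A minimal sketch of such a DIY embedding, following the same pattern as BERT's token_type/position embeddings; `qa_type_vocab_size`, `hidden_size` and the tensor names are assumptions:

import tensorflow as tf

# word_embeddings: [batch, seq_len, hidden] from the input_ids lookup.
# qa_type_ids:     [batch, seq_len] extra ids defined by the task.
qa_type_table = tf.get_variable(
    "qa_type_embeddings", [qa_type_vocab_size, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))

qa_type_embeddings = tf.nn.embedding_lookup(qa_type_table, qa_type_ids)

# Add the DIY embedding to the token embeddings, like the token_type/position embeddings.
embedding_output = word_embeddings + qa_type_embeddings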
sim_scores can likewise be computed inside the neural network, as in the cosine-similarity sketch above.
loss: standard softmax cross-entropy
probabilities = tf.nn.softmax(logits, axis=-1)
log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
loss = tf.reduce_mean(per_example_loss)
How multi-head attention shows up in the bert code: Q/K/V are first computed for all heads at once by a single dense layer, as if there were one big head, and only afterwards split into separate heads by a reshape/transpose helper (transpose_for_scores).
import glob
import heapq  # used by the disabled top-k variant below
import os
from shutil import copyfile

import horovod.tensorflow as hvd
import tensorflow as tf

# model_dir / best_model_dir are module-level paths defined elsewhere in the training script.


class EvalCheckpointSaverListener(tf.train.CheckpointSaverListener):
    """Runs evaluation after every checkpoint save and keeps a copy of the best model."""

    def __init__(self, estimator, input_fn):
        self.estimator = estimator
        self.input_fn = input_fn
        self.eval_loss = 999
        self.checkpoint = []
        self.top_k_model = 2
        self.best_step = 0
        self.min_test_steps = 200

    def _copy_best_model(self):
        # remove previously stored model files
        model_files = glob.glob(os.path.join(best_model_dir, "model.ckpt-*.*"))
        update_best_model = False
        for fname in glob.glob(os.path.join(model_dir, "model.ckpt-%d.*" % self.best_step)):
            src_file = fname
            des_file = os.path.join(best_model_dir, fname.split('/')[-1])
            if os.path.exists(fname) and not os.path.exists(des_file):
                tf.logging.info("copying model file %s", des_file)
                # copy model files and eval_results.txt to the best_model directory
                copyfile(src_file, des_file)
                best_loss_text_file = os.path.join(best_model_dir, 'eval_results.txt')
                with open(best_loss_text_file, 'w') as f:
                    f.write('eval loss : {} \n'.format(self.eval_loss))
                    f.write('best step: {} \n'.format(self.best_step))
                tf.logging.info('best eval loss : {}'.format(self.eval_loss))
                tf.logging.info('best step : {}'.format(self.best_step))
                update_best_model = True
        if update_best_model:
            # a new best model was copied in, so drop the older best-model files
            for file in model_files:
                os.remove(file)

    def _update_checkpoint_file(self):
        # point the `checkpoint` index file at the best step so it is restored by default
        check_point_file = os.path.join(model_dir, "checkpoint")
        with open(check_point_file, 'r') as f:
            docs = [line for line in f]
        newest_model = 'model.ckpt-{}'.format(self.best_step)
        docs[0] = 'model_checkpoint_path: "{}"'.format(newest_model)
        with open(check_point_file, 'w') as f:
            for line in docs:
                if '\n' not in line:
                    line += '\n'
                f.write(line)

    def after_save(self, session, global_step):
        # only the Horovod rank-0 worker evaluates, and only after a warm-up period
        if hvd.rank() == 0 and global_step > self.min_test_steps:
            evaluation = self.estimator.evaluate(self.input_fn)
            num_model = len(glob.glob(os.path.join(model_dir, "model.ckpt-*.index")))
            tf.logging.info("model dir is in %s", os.path.join(model_dir, "model.ckpt-*.index"))
            tf.logging.info("there are %d model files", num_model)
            # Alternative (disabled): keep only the top_k_model checkpoints ranked by eval loss.
            # if len(self.checkpoint) < self.top_k_model:
            #     tf.logging.info("push checkpoint %d into heap with loss of %f",
            #                     global_step, -evaluation['loss'])
            #     heapq.heappush(self.checkpoint, (-evaluation['loss'], global_step))
            # elif self.checkpoint[0][0] <= -evaluation['loss']:
            #     min_step = heapq.heappop(self.checkpoint)[1]
            #     if len(glob.glob(os.path.join(model_dir, "model.ckpt-*.index"))) > self.top_k_model:
            #         for fname in glob.glob(os.path.join(model_dir, "model.ckpt-%d.*" % min_step)):
            #             tf.logging.info("deleting trained model file %s", fname)
            #             os.remove(fname)
            #     heapq.heappush(self.checkpoint, (-evaluation['loss'], global_step))
            # else:
            #     if len(glob.glob(os.path.join(model_dir, "model.ckpt-*.index"))) > self.top_k_model:
            #         for fname in glob.glob(os.path.join(model_dir, "model.ckpt-%d.*" % global_step)):
            #             tf.logging.info("deleting most recently saved trained model file %s", fname)
            #             os.remove(fname)
            if evaluation['loss'] < self.eval_loss:
                self.eval_loss, self.best_step = evaluation['loss'], global_step
                # self._update_checkpoint_file()
                self._copy_best_model()
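A minimal usage sketch, assuming `estimator`, `train_input_fn`, `eval_input_fn` and `num_train_steps` are already defined; the listener is passed through the standard saving_listeners argument of Estimator.train, so after_save runs each time a checkpoint is written:

listener = EvalCheckpointSaverListener(estimator, eval_input_fn)

estimator.train(
    input_fn=train_input_fn,
    max_steps=num_train_steps,
    saving_listeners=[listener])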
# Scalar dimensions referenced here:
# B = batch size (number of sequences)
# F = `from_tensor` sequence length
# T = `to_tensor` sequence length
# N = `num_attention_heads`
# H = `size_per_head`
from_tensor_2d = reshape_to_matrix(from_tensor)
to_tensor_2d = reshape_to_matrix(to_tensor)

# `query_layer` = [B*F, N*H]
query_layer = tf.layers.dense(
    from_tensor_2d,                        # [batch*seq_len, width]
    num_attention_heads * size_per_head,   # e.g. 12 * 64
    activation=query_act,
    name="query",
    kernel_initializer=create_initializer(initializer_range))

# `key_layer` = [B*T, N*H]
key_layer = tf.layers.dense(
    to_tensor_2d,
    num_attention_heads * size_per_head,
    activation=key_act,
    name="key",
    kernel_initializer=create_initializer(initializer_range))

# `value_layer` = [B*T, N*H]
value_layer = tf.layers.dense(
    to_tensor_2d,
    num_attention_heads * size_per_head,
    activation=value_act,
    name="value",
    kernel_initializer=create_initializer(initializer_range))

# `query_layer` = [B, N, F, H]  i.e. [batch, 12, seq_len, 64]
query_layer = transpose_for_scores(query_layer, batch_size,
                                   num_attention_heads, from_seq_length,
                                   size_per_head)

# `key_layer` = [B, N, T, H]
key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                 to_seq_length, size_per_head)

# Take the dot product between "query" and "key" to get the raw
# attention scores.
# `attention_scores` = [B, N, F, T]
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
attention_scores = tf.multiply(attention_scores,
                               1.0 / math.sqrt(float(size_per_head)))
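The "one big head first, split later" point from the note above is exactly what transpose_for_scores does; a sketch of that helper, following the shape conventions of the excerpt (treat as a reconstruction, not the verbatim reference code):

def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                         seq_length, width):
    # [B*S, N*H] -> [B, S, N, H]: recover batch/sequence dims and split out the heads.
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])
    # [B, S, N, H] -> [B, N, S, H]: move heads in front so the matmul runs per head.
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor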