如何实现多GPU并发: goole提供的BERT源码使用TPUEstimator实现的,TPUEstimator支持多TPU并发但不支持多GPU并发。 为了能支持多GPU并发充分利用GPU资源提高效率,对BERT做一下修改: 1、使用optimization_multigpu.py替换optimization.py,执行一下命令 cp optimization.py optimization.bak cp optimization_multigpu.py optimization.py 2、修改训练代码,预训练代码在run_pretrain.py,句子分类run_classifier.py,Squad任务run_squad.py。 以Squad为例,其他文件可以用相同的方式修改。修改后的文件为run_squad_multigpu.py。 1) 添加num_gpus入参 flags.DEFINE_integer("num_gpus", 1, "Total number of GPUs to use.") 2)在model_fn_builder函数中修改EstimatorSpec,修改点有三处,修改成: a)添加scafflod: if init_checkpoint: (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) scaffold_fn = tf.train.Scaffold() b)修改Train的Spec: output_spec = tf.estimator.EstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, scaffold=scaffold_fn) c)修改predict的Spec: output_spec = tf.estimator.EstimatorSpec(mode=mode, predictions=predictions, scaffold=scaffold_fn) 3)在main函数创建RunConfig地方修改RunConfig,RunConfig添加分布式策略 distribute = tf.contrib.distribute.MirroredStrategy(num_gpus=FLAGS.num_gpus) run_config = tf.estimator.RunConfig( model_dir=FLAGS.output_dir, save_checkpoints_steps=FLAGS.save_checkpoints_steps, train_distribute=distribute) 注意:这里可以删除TPUEstimator的RunConfig,也可以保留,保留的话添加分支判断。 4)修改main函数中调用model_fn_builder的地方,修改成: model_fn = model_fn_builder( bert_config=bert_config, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=False, use_one_hot_embeddings=False) 5)main函数中将TPUEstimator修改成tf.estimator.Estimator params = {"batch_size": FLAGS.train_batch_size} estimator = tf.estimator.Estimator( model_fn=model_fn, config=run_config, params = params) 6)修改后可以不覆盖run_squad.py,直接使用run_squad_multigpu.py运行 python run_squad_multigpu.py --vocab_file=./uncased_L-12_H-768_A-12/vocab.txt --bert_config_file=./uncased_L-12_H-768_A-12/bert_config.json --init_checkpoint=./uncased_L-12_H-768_A-12/bert_model.ckpt --do_train=True --train_file=./data/train-v2.0.json --do_predict=True --predict_file=./data/dev-v2.0.json --learning_rate=3e-5 --num_train_epochs=2.0 --max_seq_length=384 --doc_stride=128 --train_batch_size=12 --num_gpus=2 --output_dir=./output/result_dir --version_2_with_negative=True 通过num_gpus指定GPU个数,train_batch_size修改batch_size,V100的显存大可以改成24,修改学习率learning_rate和num_train_epochs optimization_multigpu.py代码如下:
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions and classes related to optimization (weight updates)."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import re
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import resource_variable_ops
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
"""Creates an optimizer training op."""
global_step = tf.train.get_or_create_global_step()
learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
# Implements linear decay of the learning rate.
learning_rate = tf.train.polynomial_decay(
learning_rate,
global_step,
num_train_steps,
end_learning_rate=0.0,
power=1.0,
cycle=False)
# Implements linear warmup. I.e., if global_step < num_warmup_steps, the
# learning rate will be `global_step/num_warmup_steps * init_lr`.
if num_warmup_steps:
global_steps_int = tf.cast(global_step, tf.int32)
warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
global_steps_float = tf.cast(global_steps_int, tf.float32)
warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
warmup_percent_done = global_steps_float / warmup_steps_float
warmup_learning_rate = init_lr * warmup_percent_done
is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
learning_rate = (
(1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
# It is recommended that you use this optimizer for fine tuning, since this
# is how the model was trained (note that the Adam m/v variables are NOT
# loaded from init_checkpoint.)
optimizer = AdamWeightDecayOptimizer(
learning_rate=learning_rate,
weight_decay_rate=0.01,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
if use_tpu:
optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
# This is how the model was pre-trained.
(grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
train_op = optimizer.apply_gradients(
zip(grads, tvars), global_step=global_step)
# Normally the global step update is done inside of `apply_gradients`.
# However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
# a different optimizer, you should probably take this line out.
new_global_step = global_step + 1
train_op = tf.group(train_op, [global_step.assign(new_global_step)])
return train_op
class AdamWeightDecayOptimizer(tf.train.Optimizer):
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
"""A basic Adam optimizer that includes "correct" L2 weight decay."""
def __init__(self,
learning_rate,
weight_decay_rate=0.0,
beta_1=0.9,
beta_2=0.999,
epsilon=1e-6,
exclude_from_weight_decay=None,
name="AdamWeightDecayOptimizer"):
"""Constructs a AdamWeightDecayOptimizer."""
super(AdamWeightDecayOptimizer, self).__init__(False, name)
self.learning_rate = learning_rate
self.weight_decay_rate = weight_decay_rate
self.beta_1 = beta_1
self.beta_2 = beta_2
self.epsilon = epsilon
self.exclude_from_weight_decay = exclude_from_weight_decay
def _prepare(self):
self.learning_rate_t = ops.convert_to_tensor(
self.learning_rate, name='learning_rate')
self.weight_decay_rate_t = ops.convert_to_tensor(
self.weight_decay_rate, name='weight_decay_rate')
self.beta_1_t = ops.convert_to_tensor(self.beta_1, name='beta_1')
self.beta_2_t = ops.convert_to_tensor(self.beta_2, name='beta_2')
self.epsilon_t = ops.convert_to_tensor(self.epsilon, name='epsilon')
def _create_slots(self, var_list):
for v in var_list:
self._zeros_slot(v, 'm', self._name)
self._zeros_slot(v, 'v', self._name)
def _apply_dense(self, grad, var):
learning_rate_t = math_ops.cast(
self.learning_rate_t, var.dtype.base_dtype)
beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
weight_decay_rate_t = math_ops.cast(
self.weight_decay_rate_t, var.dtype.base_dtype)
m = self.get_slot(var, 'm')
v = self.get_slot(var, 'v')
# Standard Adam update.
next_m = (
tf.multiply(beta_1_t, m) +
tf.multiply(1.0 - beta_1_t, grad))
next_v = (
tf.multiply(beta_2_t, v) + tf.multiply(1.0 - beta_2_t,
tf.square(grad)))
update = next_m / (tf.sqrt(next_v) + epsilon_t)
if self._do_use_weight_decay(self._get_variable_name(var)):
update += weight_decay_rate_t * var
update_with_lr = learning_rate_t * update
next_param = var - update_with_lr
return control_flow_ops.group(*[var.assign(next_param),
m.assign(next_m),
v.assign(next_v)])
def _resource_apply_dense(self, grad, var):
learning_rate_t = math_ops.cast(
self.learning_rate_t, var.dtype.base_dtype)
beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
weight_decay_rate_t = math_ops.cast(
self.weight_decay_rate_t, var.dtype.base_dtype)
m = self.get_slot(var, 'm')
v = self.get_slot(var, 'v')
# Standard Adam update.
next_m = (
tf.multiply(beta_1_t, m) +
tf.multiply(1.0 - beta_1_t, grad))
next_v = (
tf.multiply(beta_2_t, v) + tf.multiply(1.0 - beta_2_t,
tf.square(grad)))
update = next_m / (tf.sqrt(next_v) + epsilon_t)
if self._do_use_weight_decay(var.name):
update += weight_decay_rate_t * var
update_with_lr = learning_rate_t * update
next_param = var - update_with_lr
return control_flow_ops.group(*[var.assign(next_param),
m.assign(next_m),
v.assign(next_v)])
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
learning_rate_t = math_ops.cast(
self.learning_rate_t, var.dtype.base_dtype)
beta_1_t = math_ops.cast(self.beta_1_t, var.dtype.base_dtype)
beta_2_t = math_ops.cast(self.beta_2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self.epsilon_t, var.dtype.base_dtype)
weight_decay_rate_t = math_ops.cast(
self.weight_decay_rate_t, var.dtype.base_dtype)
m = self.get_slot(var, 'm')
v = self.get_slot(var, 'v')
m_t = state_ops.assign(m, m * beta_1_t,
use_locking=self._use_locking)
m_scaled_g_values = grad * (1 - beta_1_t)
with ops.control_dependencies([m_t]):
m_t = scatter_add(m, indices, m_scaled_g_values)
v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
with ops.control_dependencies([v_t]):
v_t = scatter_add(v, indices, v_scaled_g_values)
update = m_t / (math_ops.sqrt(v_t) + epsilon_t)
if self._do_use_weight_decay(var.name):
update += weight_decay_rate_t * var
update_with_lr = learning_rate_t * update
var_update = state_ops.assign_sub(var,
update_with_lr,
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t])
def _apply_sparse(self, grad, var):
return self._apply_sparse_shared(
grad.values, var, grad.indices,
lambda x, i, v: state_ops.scatter_add( # pylint: disable=g-long-lambda
x, i, v, use_locking=self._use_locking))
def _resource_scatter_add(self, x, i, v):
with ops.control_dependencies(
[resource_variable_ops.resource_scatter_add(
x.handle, i, v)]):
return x.value()
def _resource_apply_sparse(self, grad, var, indices):
return self._apply_sparse_shared(
grad, var, indices, self._resource_scatter_add)
def _do_use_weight_decay(self, param_name):
"""Whether to use L2 weight decay for `param_name`."""
if not self.weight_decay_rate:
return False
if self.exclude_from_weight_decay:
for r in self.exclude_from_weight_decay:
if re.search(r, param_name) is not None:
return False
return True