Horovod: Distributed Training Example

Horovod Example: TensorFlow

  1. Import the example code

    # 1. Mount the NFS shared folder (run on node2 and node3)
    sudo mount -t nfs node1:/export/data/share /export/data/share
    
    # 2. Activate the horovod conda environment (on node1)
    conda activate horovod
    
    # 3. Create the tensorflow data and code directories in the shared space,
    #    then change into the code directory
    mkdir -p /export/data/share/tensorflow/data
    mkdir -p /export/data/share/tensorflow/code
    cd /export/data/share/tensorflow/code
    
    # 4. Edit the training script
    vim tensorflow2_keras_mnist.py
    
    # 5. Launch distributed training: 6 processes in total, 2 on each node
    horovodrun -np 6 -H node1:2,node2:2,node3:2 python tensorflow2_keras_mnist.py
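
  The horovodrun launch above starts 6 worker processes, 2 per node. Before walking through the training script, the sketch below (a hypothetical probe script, not part of the original example) shows how Horovod numbers those processes when launched the same way:

    # rank_probe.py -- minimal sketch; run with:
    #   horovodrun -np 6 -H node1:2,node2:2,node3:2 python rank_probe.py
    import horovod.tensorflow.keras as hvd

    hvd.init()
    # rank():       global index, 0..5 across all three nodes
    # local_rank(): index within one node, 0..1 (used to pin one GPU per process)
    # size():       total number of processes, 6 here
    print('rank=%d local_rank=%d size=%d' % (hvd.rank(), hvd.local_rank(), hvd.size()))

  The contents of tensorflow2_keras_mnist.py: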
    
    import sys
    import tensorflow as tf
    import horovod
    import horovod.tensorflow.keras as hvd
    
    
    def main():
        # Horovod: initialize Horovod.
        hvd.init()
    
        # Horovod: pin GPU to be used to process local rank (one GPU per process)
        gpus = tf.config.experimental.list_physical_devices('GPU')
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        if gpus:
            tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    
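        # Each rank caches its own copy of MNIST (mnist-<rank>.npz) in the shared
        # NFS data folder, so concurrent workers never race on one cache file.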
        (mnist_images, mnist_labels), _ = \
            tf.keras.datasets.mnist.load_data(path='/export/data/share/tensorflow/data/mnist-%d.npz' % hvd.rank())
    
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
             tf.cast(mnist_labels, tf.int64))
        )
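        # repeat() makes the stream infinite, so epoch length is governed solely
        # by steps_per_epoch in fit(); shuffle(10000) reshuffles independently on
        # each rank, since every worker holds its own full copy of the data.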
        dataset = dataset.repeat().shuffle(10000).batch(128)
    
        mnist_model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, [3, 3], activation='relu'),
            tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
            tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
            tf.keras.layers.Dropout(0.25),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(10, activation='softmax')
        ])
    
        # Horovod: adjust learning rate based on number of GPUs.
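        # Each worker consumes a 128-sample batch, so the effective global batch
        # is 128 * hvd.size(); scaling the learning rate linearly with worker
        # count compensates (see https://arxiv.org/abs/1706.02677).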
        scaled_lr = 0.001 * hvd.size()
        opt = tf.optimizers.Adam(scaled_lr)
    
        # Horovod: add Horovod DistributedOptimizer.
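        # The wrapper averages each worker's gradients with an allreduce before
        # the optimizer step, keeping all model replicas in sync.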
        opt = hvd.DistributedOptimizer(
            opt, backward_passes_per_step=1, average_aggregated_gradients=True)
    
        # Horovod: Specify `experimental_run_tf_function=False` to ensure TensorFlow
        # uses hvd.DistributedOptimizer() to compute gradients.
        mnist_model.compile(loss=tf.losses.SparseCategoricalCrossentropy(),
                            optimizer=opt,
                            metrics=['accuracy'],
                            experimental_run_tf_function=False)
    
        callbacks = [
            # Horovod: broadcast initial variable states from rank 0 to all other processes.
            # This is necessary to ensure consistent initialization of all workers when
            # training is started with random weights or restored from a checkpoint.
            hvd.callbacks.BroadcastGlobalVariablesCallback(0),
    
            # Horovod: average metrics among workers at the end of every epoch.
            #
            # Note: This callback must be in the list before the ReduceLROnPlateau,
            # TensorBoard or other metrics-based callbacks.
            hvd.callbacks.MetricAverageCallback(),
    
            # Horovod: using `lr = 1.0 * hvd.size()` from the very beginning leads to worse final
            # accuracy. Scale the learning rate `lr = 1.0` ---> `lr = 1.0 * hvd.size()` during
            # the first three epochs. See https://arxiv.org/abs/1706.02677 for details.
            hvd.callbacks.LearningRateWarmupCallback(initial_lr=scaled_lr, warmup_epochs=3, verbose=1),
        ]
    
        # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them.
        if hvd.rank() == 0:
            callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
    
        # Horovod: write logs on worker 0.
        verbose = 1 if hvd.rank() == 0 else 0
    
        # Train the model.
        # Horovod: adjust number of steps based on number of GPUs.
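        # With gradients averaged across workers, every step is a global step;
        # dividing by hvd.size() keeps the data consumed per epoch roughly constant.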
        mnist_model.fit(dataset, steps_per_epoch=90 // hvd.size(), callbacks=callbacks, epochs=2, verbose=verbose)
    
    
    if __name__ == '__main__':
        if len(sys.argv) == 4:
            # run training through horovod.run
            np = int(sys.argv[1])
            hosts = sys.argv[2]
            comm = sys.argv[3]
            print('Running training through horovod.run')
            horovod.run(main, np=np, hosts=hosts, use_gloo=comm == 'gloo', use_mpi=comm == 'mpi')
        else:
            # this is running via horovodrun
            main()
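
  The argv branch at the bottom of the script also allows launching the same training in-process through horovod.run, instead of the horovodrun CLI. A minimal sketch (assuming the script above is saved as tensorflow2_keras_mnist.py on the shared path and the Gloo controller is available):

    # launch_inprocess.py -- minimal sketch of the horovod.run launch path
    import horovod
    from tensorflow2_keras_mnist import main  # safe to import: __main__ is guarded

    # Equivalent to: python tensorflow2_keras_mnist.py 6 node1:2,node2:2,node3:2 gloo
    horovod.run(main, np=6, hosts='node1:2,node2:2,node3:2',
                use_gloo=True, use_mpi=False)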
    
    