Tensorflow ParameterServerStrategy Training实战

基于前面的基础知识积累,废话不多说,直接实战~

1 项目实践

参考github项目
前文使用Docker准备了5个相关开发环境,刚好来试试。

1.1 代码准备

项目源代码如下:
worker and ps.ipynb

import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # suppress log output below ERROR level

# Cluster setup: every process in the cluster must publish the same
# 'cluster' dict; only the 'task' entry differs per machine.
tf_config = {
    'cluster': {
        'worker': ['192.168.1.1:12345', '192.168.1.2:12345'],
        'ps': ['192.168.1.3:12345', '192.168.1.4:12345'],
        'chief': ['192.168.1.5:12345']
    },
    'task': {'type': 'worker', 'index': 0},
    # 'task': {'type': 'ps', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)

cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() # resolves this process's role/index from TF_CONFIG
if cluster_resolver.task_type == 'ps':
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'  # '-1' hides ALL GPUs from this process: parameter servers run on CPU only
    print('Parameter server detected')
elif cluster_resolver.task_type == 'worker':
    gpu_devices = tf.config.list_physical_devices('GPU') 
    if len(gpu_devices) == 0: raise SystemError('GPU device not found')
    for gpu in gpu_devices: 
        tf.config.experimental.set_memory_growth(gpu, True) # workers grow GPU memory on demand instead of reserving it all up front
    print('Worker detected with GPU(s):', gpu_devices)
else: raise SystemError('Machine in wrong role')

# Allow workers and parameter servers to report failures to the coordinator.
# This environment variable may become unnecessary in future TF releases.
os.environ['GRPC_FAIL_FAST'] = 'use_caller'

# Start a TensorFlow server and block forever, serving coordinator requests.
server = tf.distribute.Server(
    cluster_resolver.cluster_spec(),
    job_name = cluster_resolver.task_type,
    task_index = cluster_resolver.task_id,
    protocol = cluster_resolver.rpc_layer or 'grpc', # RPC transport layer
    start = True
)
server.join()

coordinator.ipynb

import os
import json
import h5py
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
%load_ext tensorboard

# Cluster setup: same 'cluster' dict as the workers/ps notebooks; this
# process declares itself the chief (the coordinator of PS training).
tf_config = {
    'cluster': {
        'worker': ['192.168.1.1:12345', '192.168.1.2:12345'],
        'ps': ['192.168.1.3:12345', '192.168.1.4:12345'],
        'chief': ['192.168.1.5:12345']
    },
    'task': {'type': 'chief', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)

# Allow reporting worker and ps failure to the coordinator
os.environ['GRPC_FAIL_FAST'] = 'use_caller'

# Instantiate a ParameterServerStrategy.
# Variable partitioner: controls how large variables are sharded across ps.
variable_partitioner = (
    tf.distribute.experimental.partitioners.MinSizePartitioner(
        min_shard_bytes = (256 << 10), # each shard holds at least 256 KiB
        max_shards = len(tf_config['cluster']['ps']) # at most one shard per parameter server
    )
)
# Build the PS strategy instance
strategy = tf.distribute.experimental.ParameterServerStrategy(
    tf.distribute.cluster_resolver.TFConfigClusterResolver(), # cluster/role from TF_CONFIG
    variable_partitioner = variable_partitioner # how variables are split across ps
)
strategy

# Path setup
TRAIN_PATH = 'Dataset/Train'
VALIDATE_PATH = 'Dataset/Validate'
TEST_PATH = 'Dataset/Test'

MODEL_PATH = 'Model'
MODEL_CKPT = os.path.join(MODEL_PATH, 'ckpt-{epoch}')  # ModelCheckpoint substitutes {epoch}
MODEL_TRAINED = os.path.join(MODEL_PATH, 'model.hdf5')
MODEL_BACKUP = os.path.join(MODEL_PATH, 'backup')

# Preparing data
CLASSES = 30
IMAGE_SIZE = (224, 224)
PER_WORKER_BATCH_SIZE = 32
NUM_WORKERS = len(tf_config['cluster']['worker'])
GLOBAL_BATCH_SIZE = PER_WORKER_BATCH_SIZE * NUM_WORKERS  # total batch across all workers
EPOCHS = 3

from tensorflow.keras.preprocessing.image import ImageDataGenerator # data augmentation: enlarges the effective dataset to improve generalization
train_generator = ImageDataGenerator(
    rescale = 1./255, # multiply every pixel value, mapping it into [0, 1]
    rotation_range = 40,  # random rotation range, in degrees
    width_shift_range = 0.2,  # random horizontal shift, as a fraction of width
    height_shift_range = 0.2,  # random vertical shift, as a fraction of height
    shear_range = 0.2, # shear: shift x or y coordinates proportionally while the other axis stays fixed
    zoom_range = 0.2, # random zoom range [1-n, 1+n]
    horizontal_flip = True # randomly flip images horizontally
)

# Input data
def train_dataset_fn(input_context):
    """Per-worker training dataset factory for ParameterServerStrategy.

    Receives a tf.distribute.InputContext, derives this replica's batch
    size from GLOBAL_BATCH_SIZE, wraps the augmenting Keras generator in a
    tf.data pipeline, shards it across the input pipelines, and returns a
    cached, prefetching dataset.
    """
    per_replica_batch = input_context.get_per_replica_batch_size(GLOBAL_BATCH_SIZE)

    def _make_flow():
        # Fresh directory iterator each time the dataset is (re)created.
        return train_generator.flow_from_directory(
            TRAIN_PATH,
            target_size=IMAGE_SIZE,
            batch_size=per_replica_batch,
        )

    dataset = tf.data.Dataset.from_generator(
        _make_flow,
        output_types=(tf.float32, tf.float32),
        output_shapes=(
            [per_replica_batch, *IMAGE_SIZE, 3],
            [per_replica_batch, CLASSES],
        ),
    )
    # Each input pipeline sees a disjoint shard of the stream.
    dataset = dataset.shard(
        input_context.num_input_pipelines,
        input_context.input_pipeline_id,
    )
    return dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

from tensorflow.keras.utils.experimental import DatasetCreator
train_dataset = DatasetCreator(train_dataset_fn)  # PS training needs a dataset *factory*, not a concrete dataset
# IPython shell-out: count training files to derive steps_per_epoch later.
num_train = !find {TRAIN_PATH} -type f | wc -l
num_train = int(num_train[0])
print(f'Found {num_train} files')

# Model implement
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model

# Define the model
def build_and_compile_model():
    """Build and compile a MobileNetV2-based classifier over CLASSES classes.

    Fix: the original applied mobilenet_v2.preprocess_input to
    base_model.output, i.e. to convolutional feature maps.  That helper
    rescales raw pixel values and is only meaningful on the model *input*;
    here the ImageDataGenerator already rescales pixels by 1/255 and the
    backbone trains from scratch (weights=None), so the misplaced call is
    simply removed.

    Returns:
        A compiled tf.keras.Model (adam, categorical cross-entropy, accuracy).
    """
    base_model = MobileNetV2(
        input_shape = IMAGE_SIZE + (3,),
        include_top = False,   # drop the ImageNet classification head
        weights = None         # train from scratch, no pretrained weights
    )

    x = GlobalAveragePooling2D()(base_model.output)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.2)(x)  # regularize the 512-unit head
    outputs = Dense(CLASSES, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=outputs)
    model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = ['accuracy']
    )
    return model

# Callbacks
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint 
from tensorflow.keras.callbacks import Callback, LearningRateScheduler
from tensorflow.keras.callbacks.experimental import BackupAndRestore

def decay(epoch):
    """Piecewise-constant LR schedule: 1e-3 before epoch 3, 1e-4 up to epoch 6, 1e-5 after."""
    if epoch < 3:
        return 1e-3
    if epoch < 7:
        return 1e-4
    return 1e-5

# Define a callback for printing the learning rate at the end of each epoch.
# Callback that logs the optimizer's learning rate at the end of each epoch.
class PrintLR(Callback):
    """Print the current learning rate when an epoch finishes."""

    def on_epoch_end(self, epoch, logs=None):
        # Fix: use self.model (set by Keras when the callback is attached)
        # instead of relying on a module-level `model` global.
        print(f'\nLearning rate for epoch {epoch + 1} is {self.model.optimizer.lr.numpy()}')

callbacks = [
    TensorBoard(log_dir='./logs'),                 # training curves for TensorBoard
    BackupAndRestore(backup_dir=MODEL_BACKUP),     # fault tolerance: resume training after a failure
    ModelCheckpoint(filepath=MODEL_CKPT, save_weights_only=True, verbose=1),  # one weights checkpoint per epoch
    LearningRateScheduler(decay),                  # apply the piecewise-constant schedule above
    PrintLR()                                      # log the LR each epoch
]
# IPython shell-out: start from a clean TensorBoard log directory.
!rm -rf logs

# Training
with strategy.scope(): 
    model = build_and_compile_model()

history = model.fit(
    train_dataset,
    epochs = EPOCHS,
    steps_per_epoch = num_train // (GLOBAL_BATCH_SIZE * NUM_WORKERS),
    # steps_per_epoch = num_train // GLOBAL_BATCH_SIZE,
    # callbacks = callbacks,
    # verbose = 1, # not allowed with ParameterServerStrategy
)
model.save(MODEL_TRAINED)

model.save(MODEL_TRAINED)
%tensorboard --logdir=logs

evaluator.ipynb

import os
import json
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
%load_ext tensorboard

# Cluster setup: the evaluator registers as a standalone single-node task;
# it reads training checkpoints from disk rather than joining the cluster.
tf_config = {
    'cluster': {'evaluator': ['192.168.1.6:12345']},
    'task': {'type': 'evaluator', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)
tf.distribute.cluster_resolver.TFConfigClusterResolver()

# Path setup
TRAIN_PATH = 'Dataset/Train'
VALIDATE_PATH = 'Dataset/Validate'
TEST_PATH = 'Dataset/Test'

MODEL_PATH = 'Model'
MODEL_CKPT = os.path.join(MODEL_PATH, 'ckpt-{epoch}')  # checkpoint pattern written by training's ModelCheckpoint

# Preparing data
CLASSES = 30
IMAGE_SIZE = (224, 224)
GLOBAL_BATCH_SIZE = 64

# Validation data: rescaling only, no augmentation.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
validate_generator = ImageDataGenerator(rescale=1./255)
generated_validate_data = validate_generator.flow_from_directory(
    VALIDATE_PATH, 
    target_size = IMAGE_SIZE, 
    batch_size = GLOBAL_BATCH_SIZE
)

# Wrap the Keras generator in a tf.data pipeline.
# NOTE(review): output_shapes pins the batch dimension to GLOBAL_BATCH_SIZE,
# so a final partial batch would not match the declared shape — confirm the
# validation set size is a multiple of the batch size.
validate_dataset = tf.data.Dataset.from_generator(
    lambda: generated_validate_data, 
    output_types = (tf.float32, tf.float32), 
    output_shapes = (
        [GLOBAL_BATCH_SIZE, *IMAGE_SIZE, 3], 
        [GLOBAL_BATCH_SIZE, CLASSES]
    )
).cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# Define the model
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model

def build_and_compile_model():
    """Build and compile the MobileNetV2 classifier evaluated side-car style.

    Fix: the original applied mobilenet_v2.preprocess_input to
    base_model.output, i.e. to convolutional feature maps.  That helper
    rescales raw pixel values and belongs on the model *input*; the
    validation generator already rescales pixels by 1/255 and the backbone
    uses weights=None, so the misplaced call is removed.  The architecture
    must match the training notebook's model so its weight checkpoints load.

    Returns:
        A compiled tf.keras.Model (adam, categorical cross-entropy, accuracy).
    """
    base_model = MobileNetV2(
        input_shape = IMAGE_SIZE + (3,),
        include_top = False,   # drop the ImageNet classification head
        weights = None         # architecture only; weights come from checkpoints
    )

    x = GlobalAveragePooling2D()(base_model.output)
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.2)(x)  # regularize the 512-unit head
    outputs = Dense(CLASSES, activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=outputs)
    model.compile(
        optimizer = 'adam',
        loss = 'categorical_crossentropy',
        metrics = ['accuracy']
    )
    return model

model = build_and_compile_model()
model.summary()

# Side-car evaluation
tf.keras.experimental.SidecarEvaluator(
    model = model,
    data = validate_dataset,
    checkpoint_dir = MODEL_CKPT, # Dir for training-saved checkpoint
    steps = None, # Evaluate until dataset is exhausted
    max_evaluations = None, # The evaluation needs to be stopped manually
    callbacks = [TensorBoard(log_dir='./logs')]
).start()
%tensorboard --logdir=logs

1.2 转换代码

在本地运行代码需要转换为.py文件

主要是以下转换:

  1. 导入tensorboard:%load_ext tensorboard–>get_ipython().run_line_magic('load_ext', 'tensorboard')
  2. 删除日志:!rm -rf logs–>get_ipython().system('rm -rf logs')
  3. 运行tensorboard:%tensorboard --logdir=logs–>get_ipython().run_line_magic('tensorboard', '--logdir=logs')

1.3 修改单机代码

1.3.1 注释掉jupyter文件中转换后的魔法函数

需要注释掉jupyter文件中转换后的魔法函数,否则直接运行会出现以下错误

Traceback (most recent call last):
  File "/home/hqc/container_share/ml-distributed-training-main/single-training/flowers_single.py", line 11, in <module>
    get_ipython().run_line_magic('load_ext', 'tensorboard')
NameError: name 'get_ipython' is not defined

1.3.2 但后续代码中仍然会用到get_ipython

解决:
代码中加入from IPython import get_ipython,一般会报错找不到IPython,直接安装即可:pip install ipython

这个问题暂时解决

1.3.3 Rescaling无法导入

报错:
ImportError: cannot import name 'Rescaling' from 'tensorflow.keras.layers' (/home/hqc/anaconda3/envs/tf/lib/python3.9/site-packages/tensorflow/keras/layers/__init__.py)

1.3.4 getoutput报错

Traceback (most recent call last):
  File "/home/hqc/container_share/ml-distributed-training-main/single-training/flowers_single.py", line 68, in <module>
    data_length = get_ipython().getoutput('find {data_root} -name *.jpg | wc -l')
AttributeError: 'NoneType' object has no attribute 'getoutput'

1.3.5 解决方法

太多类似的import问题和找不到包的问题,很可能是版本的问题。
但不可能重新配置开发环境了,太过于麻烦了,因此放弃运行单机的源码验证,学习parameterstrategy的思路即可。

1.4 修改ParameterStrategy代码

1.4.1 集群IP修改

针对提供的开发环境,设置集群IP

# Cluster IPs for the Docker environment; adjust the 'task' entry per node.
tf_config = {
    'cluster': {
        'worker': ['172.72.0.4:12345', '172.72.0.5:12345'],
        'ps': ['172.72.0.2:12345', '172.72.0.3:12345'],
        'chief': ['172.72.0.6:12345']
    },
    'task': {'type': 'worker', 'index': 0},
    # 'task': {'type': 'ps', 'index': 0}
}
os.environ.pop('TF_CONFIG', None)
os.environ['TF_CONFIG'] = json.dumps(tf_config)

注意:不同节点的代码中,task的type和index的值也应相应地修改。

1.4.2 GPU设置修改

每个节点上的代码都应设置只有一个GPU可见,因为ps策略要求所有节点上的GPU数目相同。
否则coordinator将会报以下错误:NotImplementedError: Multi-gpu is not supported yet.

设置单个GPU可见的指令:os.environ['CUDA_VISIBLE_DEVICES']='0',加到代码中即可
但由于本机上只有2块GPU,不够5台节点分,因此实际上这些集群代码无法运行起来。

解决方法是:转向云服务器租用GPU进行实验。

2 学习如何修改为ParameterStrategy

2.1 worker and ps

worker和ps节点运行的代码和整体需要实现的逻辑没什么关联,感觉就是提供了一个有强大算力的机器以供整个模型训练使用,代码里也主要就是一些设置,包括:

  1. 第一部分:设置集群IP,让集群可以发现并使用
  2. 第二部分:集群IP解析器,让集群区分是worker还是ps
  3. 第三部分:区分之后对应节点功能不同设置GPU的调用
  4. 第四部分:创建一个server,并等待接收coordinator的指令

2.2 coordinator

主要就是依靠coordinator来进行协调联系,发送指令,代码主体需要实现模型逻辑。
与MultiWorkerMirroredStrategy的区别主要包括:

  1. 好像没设置GPU,为什么呢?暂时还不明白。
  2. 集群内的节点增多,对应IP都得加上;coordinator的task中的类型得指定为chief
  3. 多机多卡初始化MultiWorkerMirroredStrategy时,参数包括一个通信选项(RING),并且采用的是自动分片策略;而ps策略初始化ParameterServerStrategy时,需要先定义一个变量分片器手动分片,并且参数包括集群解析器和前面定义的变量分片器。
  4. 数据准备方面只做了训练集的数据增强
  5. 创建数据集得使用DatasetCreator,训练集的操作须被封装在一个函数里边。
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值