驱动云项目任务一打卡——猫狗的分类识别

Step 1 学习准备

下载好代码文件:DogsVsCats.py

import argparse
import tensorflow as tf
import os

# Command-line configuration for the single-machine training script.
parser = argparse.ArgumentParser(description='Dogs vs. Cats training configuration')
parser.add_argument('--mode', default='train', help='train or test')
parser.add_argument("--num_epochs", default=5, type=int)
parser.add_argument("--batch_size", default=32, type=int)
# type=float is required: without it a value supplied on the command line
# stays a str and breaks tf.keras.optimizers.Adam(learning_rate=...).
parser.add_argument("--learning_rate", default=0.001, type=float)
parser.add_argument("--data_dir", default="/gemini/data-1")
parser.add_argument("--train_dir", default="/gemini/output")
args = parser.parse_args()


def _decode_and_resize(filename, label):
    """Read the JPEG at *filename*, resize it to 150x150 and scale to [0, 1].

    Returns an (image, label) pair, the element shape expected by the
    tf.data map pipeline below.
    """
    raw_bytes = tf.io.read_file(filename)
    rgb_image = tf.image.decode_jpeg(raw_bytes, channels=3)
    normalized = tf.image.resize(rgb_image, [150, 150]) / 255.0
    return normalized, label


if __name__ == "__main__":
    # ---- Build the training file lists ------------------------------------
    # Files whose name starts with "dog" are dogs; everything else is
    # treated as a cat (the DogsVsCats dataset contains only the two).
    train_dir = args.data_dir + "/train"
    cats = []
    dogs = []
    for file in os.listdir(train_dir):
        if file.startswith("dog"):
            dogs.append(train_dir + "/" + file)
        else:
            cats.append(train_dir + "/" + file)
    # Bug fix: the original passed (len(cats), len(dogs)), printing each
    # count under the wrong label.
    print("dogSize:%d catSize:%d" % (len(dogs), len(cats)))

    # First 10000 files of each class form the training split; the
    # remainder becomes the test split further below.
    train_cat_filenames = tf.constant(cats[:10000])
    train_dog_filenames = tf.constant(dogs[:10000])
    train_filenames = tf.concat([train_cat_filenames, train_dog_filenames], axis=-1)
    train_labels = tf.concat([
        tf.zeros(train_cat_filenames.shape, dtype=tf.int32),  # cat -> 0
        tf.ones(train_dog_filenames.shape, dtype=tf.int32)    # dog -> 1
    ], axis=-1)

    # Input pipeline: decode/resize in parallel, then batch and prefetch.
    train_dataset = tf.data.Dataset.from_tensor_slices((train_filenames, train_labels))
    train_dataset = train_dataset.map(map_func=_decode_and_resize,
                                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # NOTE(review): shuffling is disabled; for real training re-enable:
    # train_dataset = train_dataset.shuffle(buffer_size=20000)
    train_dataset = train_dataset.batch(args.batch_size)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Small CNN: four conv/pool stages, then a dense classifier ending in a
    # 2-way softmax (cat / dog).
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(150, 150, 3)),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(64, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(128, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Conv2D(128, 3, activation="relu"),
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(512, activation="relu"),
        tf.keras.layers.Dense(2, activation="softmax")
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy]
    )
    model.fit(train_dataset, epochs=args.num_epochs)
    model.save(args.train_dir)

    # ---- Build the test dataset and evaluate ------------------------------
    test_cat_filenames = tf.constant(cats[10000:])
    test_dog_filenames = tf.constant(dogs[10000:])
    test_filenames = tf.concat([test_cat_filenames, test_dog_filenames], axis=-1)
    test_labels = tf.concat([
        tf.zeros(test_cat_filenames.shape, dtype=tf.int32),
        tf.ones(test_dog_filenames.shape, dtype=tf.int32)
    ], axis=-1)
    test_dataset = tf.data.Dataset.from_tensor_slices((test_filenames, test_labels))
    test_dataset = test_dataset.map(_decode_and_resize)
    test_dataset = test_dataset.batch(args.batch_size)
    # Accumulate accuracy batch by batch over the held-out split.
    sparse_categorical_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    for images, label in test_dataset:
        y_pred = model.predict(images)
        sparse_categorical_accuracy.update_state(y_true=label, y_pred=y_pred)
    print("test accuracy:%f" % sparse_categorical_accuracy.result())

以及分布式训练代码文件:dogsVsCatsDistributed.py

import argparse
import tensorflow as tf
import os
import json

# Command-line configuration for the multi-worker distributed script.
parser = argparse.ArgumentParser(description='Dogs vs. Cats distributed training configuration')
parser.add_argument('--mode', default='train', help='train or test')
parser.add_argument("--num_epochs", default=1, type=int)
parser.add_argument("--batch_size", default=32, type=int)
# type=float so a CLI-supplied value is parsed as a number, not a str.
parser.add_argument("--learning_rate", default=0.001, type=float)
parser.add_argument("--data_dir", default="/gemini/data-1")
parser.add_argument("--train_dir", default="/gemini/output")
parser.add_argument("--num_workers", default=2, type=int)
parser.add_argument("--rank", default=0, type=int)
args = parser.parse_args()

AUTOTUNE = tf.data.experimental.AUTOTUNE
gpus = tf.config.list_physical_devices(device_type='GPU')
num_workers = args.num_workers
# Build the worker address list ("ip:port") from the platform-injected
# environment variables GEMINI_IP_taskrole1_<i> / GEMINI_taskrole1_<i>_http_PORT.
workList = []
for i in range(num_workers):
    ip = os.environ.get("GEMINI_IP_taskrole1_%d" % i)
    port = os.environ.get("GEMINI_taskrole1_%d_http_PORT" % i)
    if ip is None or port is None:
        # Fail with a clear message instead of the original's opaque
        # "TypeError: unsupported operand" when an env var is missing.
        raise RuntimeError("missing cluster environment variables for worker %d" % i)
    workList.append(ip + ":" + port)

# TF_CONFIG tells MultiWorkerMirroredStrategy the full cluster layout and
# this process's own role/index within it.
os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'worker': workList
    },
    'task': {'type': 'worker', 'index': args.rank}
})
print("TF_CONFIG:", os.environ.get("TF_CONFIG"))
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()


def _decode_and_resize(filename, label):
    """Read the JPEG at *filename*, resize it to 224x224 and scale to [0, 1].

    Returns an (image, label) pair for use with tf.data.Dataset.map.
    """
    raw_bytes = tf.io.read_file(filename)
    rgb_image = tf.image.decode_jpeg(raw_bytes, channels=3)
    normalized = tf.image.resize(rgb_image, [224, 224]) / 255.0
    return normalized, label


if __name__ == "__main__":
    # Effective global batch size: per-worker batch (scaled by local GPU
    # count when GPUs are present) multiplied by the number of workers.
    if len(gpus) != 0:
        BATCH_SIZE_PER_REPLICA = args.batch_size * len(gpus)
    else:
        BATCH_SIZE_PER_REPLICA = args.batch_size
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * args.num_workers

    # ---- Build the training file lists ------------------------------------
    # Files whose name starts with "dog" are dogs; everything else is a cat.
    train_dir = args.data_dir + "/train"
    cats = []
    dogs = []
    for file in os.listdir(train_dir):
        if file.startswith("dog"):
            dogs.append(train_dir + "/" + file)
        else:
            cats.append(train_dir + "/" + file)
    # Bug fix: the original passed (len(cats), len(dogs)), printing each
    # count under the wrong label.
    print("dogSize:%d catSize:%d" % (len(dogs), len(cats)))
    train_cat_filenames = tf.constant(cats[:10000])
    train_dog_filenames = tf.constant(dogs[:10000])
    train_filenames = tf.concat([train_cat_filenames, train_dog_filenames], axis=-1)
    train_labels = tf.concat([
        tf.zeros(train_cat_filenames.shape, dtype=tf.int32),  # cat -> 0
        tf.ones(train_dog_filenames.shape, dtype=tf.int32)    # dog -> 1
    ], axis=-1)

    train_dataset = tf.data.Dataset.from_tensor_slices((train_filenames, train_labels))

    train_dataset = train_dataset.map(map_func=_decode_and_resize,
                                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Infinite shuffled stream; model.fit below bounds each epoch with
    # steps_per_epoch.
    train_dataset = train_dataset.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=10000))
    train_dataset = train_dataset.batch(BATCH_SIZE)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Compute the real number of steps per epoch from the actual training
    # set size (the original hard-coded 20000, which is wrong whenever a
    # class has fewer than 10000 files).
    num_train_examples = int(train_labels.shape[0])
    steps_per_epoch = tf.math.ceil(num_train_examples / BATCH_SIZE).numpy()
    print("=====================================")
    print("steps_per_epoch:", steps_per_epoch)
    print("batch_size:", BATCH_SIZE)
    print("epoch:", args.num_epochs)
    print("gpus:", gpus)
    print("=====================================")

    # Model creation/compilation must happen inside the strategy scope so
    # variables are mirrored across the workers.
    with strategy.scope():
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, 3, activation="relu", input_shape=(224, 224, 3)),
            tf.keras.layers.MaxPool2D(),
            tf.keras.layers.Conv2D(64, 3, activation="relu"),
            tf.keras.layers.MaxPool2D(),
            tf.keras.layers.Conv2D(128, 3, activation="relu"),
            tf.keras.layers.MaxPool2D(),
            tf.keras.layers.Conv2D(128, 3, activation="relu"),
            tf.keras.layers.MaxPool2D(),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(512, activation="relu"),
            tf.keras.layers.Dense(2, activation="softmax")
        ])
        checkpoint = tf.train.Checkpoint(model=model)
        manager = tf.train.CheckpointManager(checkpoint, directory=args.train_dir, checkpoint_name="model.ckpt",
                                             max_to_keep=1)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=args.learning_rate),
            loss=tf.keras.losses.sparse_categorical_crossentropy,
            metrics=[tf.keras.metrics.sparse_categorical_accuracy]
        )
        model.fit(train_dataset, epochs=args.num_epochs, steps_per_epoch=steps_per_epoch)
    # Only the chief worker (rank 0) writes the checkpoint.
    if args.rank == 0:
        print("save model")
        manager.save()

Step 2 创建项目

  1. 进入趋动云用户工作台;
  2. 填写项目名称及项目描述;
  3. 添加镜像:选择含 TensorFlow 2.x 框架的官方镜像即可,如图三所示;
  4. 添加绑定数据集:选择公开数据集,DogsVsCats。

Step 3 准备代码

Step 4 初始化开发环境

  1. 单击右上角的 运行代码,进入 初始化开发环境 页;
  2. 填写开发环境的初始化配置;
  3. 当右侧的 网页终端 和 JupyterLab 不再是灰色时,表明工具注入成功。此时您便可在此开发环境上通过工具进行模型调优。

Step 5 调试代码

开发环境创建好后,您可在开发环境中调试代码。

  1. 单击 开发环境实例 页右侧的 JupyterLab 工具。
  2. 默认进入 /gemini/ 目录下,在右侧目录树中单击 code 文件夹,进入到 /gemini/code/ 目录下。
    注意:只能在/gemini/code/ 目录下编辑和新增代码,其他文件夹均属于临时存储,临时存储是开发环境本身运行需占用的存储,代码保存在临时存储中将加速临时存储耗尽,易导致开发环境重启,重启后保存在临时存储中的代码将丢失。

Step 6 提交离线训练

当完成本次调优,可参考如下步骤保存代码并使用当前版本代码提交训练任务。

FAQ

  1. 训练失败,提示 代码执行错误:可能是提交训练时选择的镜像不对。
  2. 训练失败,提示 内存不足:可能是提交训练时配置的资源不足,可尝试提高资源配置后重新提交训练。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

homer_of

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值