TensorFlow 2.0 - tf.data.Dataset Data Preprocessing & Cats vs. Dogs Classification

Study notes from: 简单粗暴 TensorFlow 2 (A Concise Handbook of TensorFlow 2)

1. Building a Dataset with tf.data.Dataset.from_tensor_slices()

  • tf.data.Dataset.from_tensor_slices() builds a dataset from in-memory tensors, slicing along the first axis
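Each element of the resulting dataset is one slice of the input along axis 0; a minimal sketch:

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(tf.constant([[1, 2], [3, 4], [5, 6]]))
for x in ds:
    print(x.numpy())  # prints [1 2], then [3 4], then [5 6]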

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

(train_data, train_label), (_, _) = tf.keras.datasets.mnist.load_data()
train_data = np.expand_dims(train_data.astype(np.float32) / 255., axis=-1)  # [60000, 28, 28, 1]

mnistdata = tf.data.Dataset.from_tensor_slices((train_data, train_label))

for img, label in mnistdata:
    plt.title(label.numpy())
    plt.imshow(img.numpy()[:, :, 0], cmap='gray')  # imshow needs [28, 28], not [28, 28, 1]
    plt.show()

2. Preprocessing with Dataset.map(f)

  • Dataset.map(f) applies the transformation f to every element
def rotate90(img, label):
    img = tf.image.rot90(img)
    return img, label

mnistdata = mnistdata.map(rotate90)
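map also accepts an inline lambda; this one is equivalent to the rotate90 function above:

mnistdata = mnistdata.map(lambda img, label: (tf.image.rot90(img), label))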


  • Dataset.batch(batch_size) groups consecutive elements into batches
mnistdata = mnistdata.batch(5)

for img, label in mnistdata:  # img: [5, 28, 28, 1], label: [5], i.e. 5 samples per batch
    fig, axs = plt.subplots(1, 5)  # 1 row, 5 columns
    for i in range(5):
        axs[i].set_title(label.numpy()[i])
        axs[i].imshow(img.numpy()[i, :, :, 0], cmap='gray')  # drop the channel axis for imshow
    plt.show()
    plt.show()

  • Dataset.shuffle(buffer_size) shuffles the elements randomly
    With buffer_size = 1 there is no shuffling at all.
    If the data is already fairly random, a small buffer_size is enough; otherwise set it larger.
    While working on the cats-vs-dogs example below I hit out-of-memory errors with a large buffer, so I recommend shuffling the data in advance instead (see the sketch after the next code block).
# The digits come out in a different order on each run
mnistdata = mnistdata.shuffle(buffer_size=100).batch(5)
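A minimal sketch of shuffling in advance, assuming filenames and labels are plain Python lists (the cats-vs-dogs example below does exactly this):

import random

pairs = list(zip(filenames, labels))  # filenames, labels: plain Python lists (assumed)
random.shuffle(pairs)                 # one in-memory shuffle, no large Dataset buffer needed
filenames, labels = zip(*pairs)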

3. Parallelizing the Pipeline with Dataset.prefetch()

  • Dataset.prefetch() prefetches data, so the CPU can prepare the next batch while the GPU is training
mnistdata = mnistdata.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
# AUTOTUNE picks a suitable buffer_size automatically
  • num_parallel_calls runs the map function on multiple cores in parallel
mnistdata = mnistdata.map(map_func=rotate90, num_parallel_calls=2)
# num_parallel_calls can also be set to tf.data.experimental.AUTOTUNE to choose automatically
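These options compose with the rest of the pipeline; a sketch chaining the calls from this section, assuming mnistdata is the raw unbatched dataset from section 1:

AUTOTUNE = tf.data.experimental.AUTOTUNE
mnistdata = (mnistdata
             .map(rotate90, num_parallel_calls=AUTOTUNE)  # preprocess on multiple cores
             .shuffle(buffer_size=100)
             .batch(5)
             .prefetch(AUTOTUNE))                         # overlap CPU prep with GPU training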

4. Retrieving Data with a for Loop

# Iterate directly with a for loop
dataset = tf.data.Dataset.from_tensor_slices((A, B, C, ...))
for a, b, c, ... in dataset:
    # operate on the tensors a, b, c, ..., e.g. feed them into a model for training
    
# Or create an iterator and fetch elements with next()
dataset = tf.data.Dataset.from_tensor_slices((A, B, C, ...))
it = iter(dataset)
a_0, b_0, c_0, ... = next(it)
a_1, b_1, c_1, ... = next(it)
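A concrete version using the batched MNIST dataset from section 2:

it = iter(mnistdata)
img_batch, label_batch = next(it)  # first batch: img [5, 28, 28, 1], label [5]
img_batch, label_batch = next(it)  # second batch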

5. Example: Cats vs. Dogs Classification

Competition and data: https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/overview

The train folder contains 25,000 images of dogs and cats. Each image in this folder has the label as part of the filename. The test folder contains 12,500 images, named according to a numeric id.

For each image in the test set, you should predict a probability that the image is a dog (1 = dog, 0 = cat).

# ---------cat vs dog-------------
# https://michael.blog.csdn.net/
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import os

num_epochs = 10
batch_size = 32
learning_rate = 1e-4
train_data_dir = "./dogs-vs-cats/train/"
test_data_dir = "./dogs-vs-cats/test/"

# Data preprocessing: read, decode, resize, and normalize one image
def _decode_and_resize(filename, label=None):
    img_string = tf.io.read_file(filename)
    img_decoded = tf.image.decode_jpeg(img_string, channels=3)  # force 3 channels
    img_resized = tf.image.resize(img_decoded, [256, 256]) / 255.
    if label is None:  # test images carry no label
        return img_resized
    return img_resized, label
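A quick eager-mode sanity check of the helper; the path is just an example of Kaggle's cat.<id>.jpg naming, not a file the reader is guaranteed to have:

img, label = _decode_and_resize(tf.constant("./dogs-vs-cats/train/cat.0.jpg"), 0)
print(img.shape)  # (256, 256, 3)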

# Build the input pipeline with tf.data.Dataset
def processData(train_filenames, train_labels):
    train_dataset = tf.data.Dataset.from_tensor_slices((train_filenames, train_labels))
    train_dataset = train_dataset.map(map_func=_decode_and_resize)
    # train_dataset = train_dataset.shuffle(buffer_size=25000)  # far too memory-hungry; the file list is shuffled up front instead
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return train_dataset


if __name__ == "__main__":
    # Training file paths; filenames start with 'cat' or 'dog'
    # List the directory once so paths and labels stay aligned
    filenames = os.listdir(train_data_dir)
    file_dir = [train_data_dir + filename for filename in filenames]
    labels = [0 if filename[0] == 'c' else 1 for filename in filenames]

    # Zip paths and labels together and shuffle them once, up front
    f_l = list(zip(file_dir, labels))
    random.shuffle(f_l)
    file_dir, labels = zip(*f_l)
	
    # Split into training and validation sets
    valid_ratio = 0.1
    idx = int((1 - valid_ratio) * len(file_dir))
    train_files, valid_files = file_dir[:idx], file_dir[idx:]
    train_labels, valid_labels = labels[:idx], labels[idx:]
	
    # Build tf.data datasets from the file lists
    train_filenames, valid_filenames = tf.constant(train_files), tf.constant(valid_files)
    train_labels, valid_labels = tf.constant(train_labels), tf.constant(valid_labels)

    train_dataset = processData(train_filenames, train_labels)
    valid_dataset = processData(valid_filenames, valid_labels)

    # Build the model
    model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(32, 3, activation='relu', input_shape=(256, 256, 3)),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv2D(64, 5, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Conv2D(128, 5, activation='relu'),
        tf.keras.layers.MaxPooling2D(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(2, activation='softmax')
    ])

    # Configure the model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.sparse_categorical_crossentropy,
        metrics=[tf.keras.metrics.sparse_categorical_accuracy]
    )
    # Train
    model.fit(train_dataset, epochs=num_epochs, validation_data=valid_dataset)
	
    # Test set: sort filenames numerically so predictions line up with the ids written below
    test_filenames = tf.constant([test_data_dir + filename
                                  for filename in sorted(os.listdir(test_data_dir),
                                                         key=lambda x: int(x.split('.')[0]))])
    test_data = tf.data.Dataset.from_tensor_slices(test_filenames)
    test_data = test_data.map(map_func=_decode_and_resize)
    test_data = test_data.batch(batch_size)

    ans = model.predict(test_data)  # ans shape: [12500, 2]
    prob = ans[:, 1]  # probability of being a dog

    # Write the submission file (ids renamed to avoid shadowing the builtin id)
    ids = list(range(1, 12501))
    output = pd.DataFrame({'id': ids, 'label': prob})
    output.to_csv("submission.csv", index=False)

Submission score: (screenshot)

Top leaderboard scores: (screenshot)

  • Swap the model for MobileNetV2 + a fully connected head, and train for 2 epochs
basemodel = tf.keras.applications.MobileNetV2(input_shape=(256, 256, 3), include_top=False)
# the classes argument only applies when include_top=True, so it is omitted here
model = tf.keras.Sequential([
    basemodel,
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])
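The post does not show the rest of the MobileNetV2 training code; a sketch assuming the same compile/fit calls as above are reused (freezing the pretrained base is an optional variant, not necessarily what was done here):

# basemodel.trainable = False  # optional: freeze the ImageNet weights, train only the head
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=[tf.keras.metrics.sparse_categorical_accuracy]
)
model.fit(train_dataset, epochs=2, validation_data=valid_dataset)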

Results:

704/704 [==============================] - 179s 254ms/step - loss: 0.0741 - sparse_categorical_accuracy: 0.9737 - val_loss: 0.1609 - val_sparse_categorical_accuracy: 0.9744
704/704 [==============================] - 167s 237ms/step - loss: 0.0128 - sparse_categorical_accuracy: 0.9955 - val_loss: 0.0724 - val_sparse_categorical_accuracy: 0.9848

The accuracy (99% train, 98% validation) is much higher than the first model's (roughly 92% on the training set and 80% on the validation set).

Oddly, the validation loss is larger than the training loss; how should that be explained? The second model doesn't seem to be overfitting either, since training and validation accuracy are close. One possible factor: cross-entropy grows sharply on confidently wrong predictions, so a few confident mistakes can inflate the loss while barely moving accuracy.
