TensorFlow's data input module: tf.data

The tf.data API lets you build complex input pipelines from simple pieces: it handles large amounts of data, different data formats, and complicated transformations with ease. Each element of a tf.data.Dataset contains one or more Tensor objects. For example, in an image pipeline an element might be a single training sample, with a pair of tensors representing the image data and its label.
A tf.data.Dataset can be created in two different ways:

(1) Create a Dataset directly from tensors
    (for example, Dataset.from_tensor_slices());
    NumPy arrays work too; TensorFlow converts them to tensors automatically.
(2) Create a Dataset by applying a transformation
    (for example, Dataset.zip) to one or more tf.data.Dataset objects.

A Dataset object contains multiple elements, all with the same structure. Each element consists of one or more tf.Tensor objects, called components.
The properties of a Dataset are derived from the properties of its elements; an element can be a single tensor, a tuple of tensors, or a nested tuple of tensors.
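
A quick way to check an element's structure is the element_spec property. The sketch below is a minimal illustration of both a single-tensor element and a tuple element:

import tensorflow as tf

# element_spec describes the component structure of each element
ds_single = tf.data.Dataset.from_tensor_slices([1, 2, 3])
ds_tuple = tf.data.Dataset.from_tensor_slices(([1, 2, 3], [4, 5, 6]))
print(ds_single.element_spec)  # a single TensorSpec
print(ds_tuple.element_spec)   # a tuple of two TensorSpecs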

Creating a Dataset from a 1-D list

import tensorflow as tf

# Create a Dataset from a 1-D list
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3, 3, 43, 43, 43, 43, 54, 5, 34])
# Iterate over the elements
for ele in dataset:
    print(ele)
    print(ele.numpy())

Creating a Dataset from a 2-D list

import tensorflow as tf

# Create a Dataset from a 2-D list
dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [3, 43], [43, 34]])
# Iterate over the elements
for ele in dataset:
    print(ele)
    print(ele.numpy())

Every slice along the first dimension must have the same length; slices of unequal length raise an error.
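
For instance, the following minimal sketch triggers the error, since slices of unequal length cannot be packed into one rectangular tensor:

import tensorflow as tf

# Unequal slice lengths cannot form a rectangular tensor,
# so this raises a ValueError
try:
    tf.data.Dataset.from_tensor_slices([[1, 2], [3, 43, 34]])
except ValueError as e:
    print(e)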

Creating a Dataset from a dictionary

import tensorflow as tf

# Create a Dataset from a dictionary
dataset = tf.data.Dataset.from_tensor_slices({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8], 'c': [12, 23, 23, 34]})
for ele in dataset:
    print(ele)
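
Each element of a dict-based Dataset is itself a dictionary of tensors, so individual components can be read by key. A small usage sketch:

import tensorflow as tf

# Read each component of the element by its key
dataset = tf.data.Dataset.from_tensor_slices({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8], 'c': [12, 23, 23, 34]})
for ele in dataset:
    print(ele['a'].numpy(), ele['b'].numpy(), ele['c'].numpy())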

Creating a Dataset from an np.array

The result is the same as in the first example.

import tensorflow as tf
import numpy as np
# Create a Dataset from an np.array
dataset = tf.data.Dataset.from_tensor_slices(np.array([1, 2, 3, 3, 43, 43, 43, 43, 54, 5, 34]))
# Iterate over the elements
for ele in dataset:
    print(ele)
    print(ele.numpy())

Retrieving data with the take method

import tensorflow as tf
import numpy as np

# Create a Dataset from an np.array
dataset = tf.data.Dataset.from_tensor_slices(np.array([1, 2, 3, 3, 43, 43, 43, 43, 54, 5, 34]))
# Iterate over the first four elements
for ele in dataset.take(4):
    print(ele)
    print(ele.numpy())
# Use take to pull out just the first element
print(next(iter(dataset.take(1))).numpy())

Shuffling data

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [4, 5], [6, 7]])
for ele in dataset:
    print(ele.numpy())
# shuffle randomizes the element order; to shuffle the entire dataset, pass the dataset's length as the buffer size
dataset = dataset.shuffle(3)
for ele in dataset:
    print(ele.numpy())
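
The argument to shuffle is the size of an internal buffer: elements are drawn at random from that buffer as it is refilled, so a buffer smaller than the dataset only shuffles locally. A minimal sketch, using tf.data.Dataset.range for brevity:

import tensorflow as tf

# A buffer as large as the dataset gives a uniform shuffle;
# a tiny buffer leaves the data mostly in its original order
fully_shuffled = tf.data.Dataset.range(10).shuffle(10)
partly_shuffled = tf.data.Dataset.range(10).shuffle(2)
print([int(e) for e in fully_shuffled])
print([int(e) for e in partly_shuffled])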

Repeating data

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [4, 5], [6, 7]])
# repeat duplicates the dataset; count is the number of repetitions
dataset = dataset.repeat(count=3)
for ele in dataset:
    print(ele.numpy())
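
Calling repeat() without a count repeats the data indefinitely, so the iteration has to be bounded, for example with take. A minimal sketch:

import tensorflow as tf

# repeat() with no count is endless; take(7) bounds the loop
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 3])
for ele in dataset.repeat().take(7):
    print(ele.numpy())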

Shuffling and repeating data

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([[1, 2], [4, 5], [6, 7]])
for ele in dataset:
    print(ele.numpy())
print('-------------------')
# Chain shuffle and repeat so the generated data is both repeated and shuffled
dataset = dataset.shuffle(3).repeat(count=3)
for ele in dataset:
    print(ele.numpy())
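
The order of the two calls matters. The following minimal sketch shows the difference: shuffle-then-repeat reshuffles within each pass over the data, while repeat-then-shuffle can mix elements from different passes together.

import tensorflow as tf

# shuffle before repeat: each pass is a permutation of 0..3
within_epochs = tf.data.Dataset.range(4).shuffle(4).repeat(2)
# repeat before shuffle: elements from both passes intermix
across_epochs = tf.data.Dataset.range(4).repeat(2).shuffle(8)
print([int(e) for e in within_epochs])
print([int(e) for e in across_epochs])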

Retrieving data in batches

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 4, 5, 6, 7])
for ele in dataset:
    print(ele.numpy())
print('-------------------')
# Generate plenty of data to feed the batch method
dataset = dataset.shuffle(3).repeat(10)
# batch groups a fixed number of elements into each retrieved batch
dataset = dataset.batch(batch_size=3)
for ele in dataset:
    print(ele.numpy())
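
By default the last batch may be smaller than batch_size when the element count does not divide evenly; drop_remainder=True discards that partial batch. A minimal sketch:

import tensorflow as tf

# 7 elements in batches of 3: the leftover 7th element is dropped
dataset = tf.data.Dataset.range(7).batch(batch_size=3, drop_remainder=True)
for ele in dataset:
    print(ele.numpy())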

Applying an operation to every element

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices([1, 2, 4, 5, 6, 7])
# Square every element of the dataset
dataset = dataset.map(tf.square)
for ele in dataset:
    print(ele.numpy())
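
map also accepts an arbitrary function, such as a lambda. This sketch casts each element to float and rescales it, the same kind of preprocessing used in the full example below:

import tensorflow as tf

# Cast each element to float32 and scale it into [0, 1]
dataset = tf.data.Dataset.from_tensor_slices([1, 2, 4, 5, 6, 7])
dataset = dataset.map(lambda x: tf.cast(x, tf.float32) / 255.0)
for ele in dataset:
    print(ele.numpy())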

A complete example

import gzip

import numpy as np
import tensorflow as tf


def get_data():
    # Paths to the local Fashion-MNIST archive files
    train_image = r"../../dataset/fashion-mnist/train-images-idx3-ubyte.gz"
    test_image = r"../../dataset/fashion-mnist/t10k-images-idx3-ubyte.gz"
    train_label = r"../../dataset/fashion-mnist/train-labels-idx1-ubyte.gz"
    test_label = r"../../dataset/fashion-mnist/t10k-labels-idx1-ubyte.gz"
    paths = [train_label, train_image, test_label, test_image]

    with gzip.open(paths[0], 'rb') as lbpath:
        # Skip the 8-byte IDX header (magic number + item count)
        y_train = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[1], 'rb') as imgpath:
        # Skip the 16-byte IDX header, then reshape to (N, 28, 28)
        x_train = np.frombuffer(
            imgpath.read(), np.uint8, offset=16).reshape(len(y_train), 28, 28)

    with gzip.open(paths[2], 'rb') as lbpath:
        y_test = np.frombuffer(lbpath.read(), np.uint8, offset=8)

    with gzip.open(paths[3], 'rb') as imgpath:
        x_test = np.frombuffer(
            imgpath.read(), np.uint8, offset=16).reshape(len(y_test), 28, 28)

    return (x_train, y_train), (x_test, y_test)


# Load the data
(train_image, train_label), (test_image, test_label) = get_data()

# Normalize pixel values to the range [0, 1]
train_image = train_image / 255
test_image = test_image / 255

ds_train_img = tf.data.Dataset.from_tensor_slices(train_image)
ds_train_lab = tf.data.Dataset.from_tensor_slices(train_label)

# Use zip to pair ds_train_img and ds_train_lab into (image, label) elements
ds_train = tf.data.Dataset.zip((ds_train_img, ds_train_lab))
ds_test = tf.data.Dataset.from_tensor_slices((test_image, test_label))
ds_train = ds_train.shuffle(10000).repeat().batch(64)
ds_test = ds_test.batch(64)
model = tf.keras.Sequential([tf.keras.layers.Flatten(input_shape=(28, 28)),
                             tf.keras.layers.Dense(128, activation='relu'),
                             tf.keras.layers.Dense(10, activation='softmax')])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
# ds_train repeats indefinitely, so steps_per_epoch must be given;
# the first dimension of train_image is the total number of images
steps_per_epoch = train_image.shape[0] // 64
validation_steps = test_image.shape[0] // 64
model.fit(ds_train, epochs=5, steps_per_epoch=steps_per_epoch, validation_data=ds_test,
          validation_steps=validation_steps)
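
As an aside, if the local .gz files are not available, the same data can be loaded through Keras; this sketch assumes network access on the first call:

import tensorflow as tf

# Alternative loader: downloads Fashion-MNIST on first use and returns
# the same (train, test) split as get_data() above
(train_image, train_label), (test_image, test_label) = \
    tf.keras.datasets.fashion_mnist.load_data()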