An example of custom datasets usable for deep learning training and testing

TensorFlow 2.x:

from tensorflow.keras.utils import Sequence
import numpy as np


class createDataset(Sequence):
    # To plug into model.fit() later, a dataset generally subclasses tensorflow.keras.utils.Sequence
    # The three magic methods __init__, __len__, and __getitem__ are required
    def __init__(self, x, y):  # do any initialization and preprocessing here
        self.x, self.y = x, y
        self.preprocess()

    def __len__(self):  # the size of the dataset must be known
        return min(len(self.x), len(self.y))

    def __getitem__(self, idx):  # defines how to fetch the element at index idx, i.e. self[key], so the dataset can be read in a loop
        bx = self.x[idx]
        by = self.y[idx]
        return bx, by  # must return the sample; a yield here would quietly turn __getitem__ into a generator function (see the pitfall below)

    def on_epoch_end(self):
        # Called after every epoch; optional. Here it shuffles the dataset.
        seed = np.random.randint(1000)
        np.random.seed(seed)  # seed before shuffling x ...
        np.random.shuffle(self.x)
        np.random.seed(seed)  # ... then reseed with the same seed, so y is shuffled in the same order as x
        np.random.shuffle(self.y)

    def preprocess(self):  # define some preprocessing
        "---do something---"
        self.x = [2 * i for i in self.x]
        self.y = [300 * i for i in self.y]


if __name__ == "__main__":
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    y = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    train_dataset = createDataset(x, y)
    # model.fit(train_dataset, epochs=2, steps_per_epoch=10)
    # test
    for i in train_dataset:
        for j in i:
            print(i)
            print(j)
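
Note that when a Sequence is passed to model.fit(), Keras treats each __getitem__ call as one whole batch and __len__ as the number of batches per epoch, whereas the toy class above works one sample at a time. A minimal batched sketch, assuming an illustrative batch size of 2, float32 inputs, and a permutation-based shuffle in on_epoch_end, might look like this:

import numpy as np
from tensorflow.keras.utils import Sequence


class BatchedDataset(Sequence):
    def __init__(self, x, y, batch_size=2):
        self.x = np.asarray(x, dtype="float32")
        self.y = np.asarray(y, dtype="float32")
        self.batch_size = batch_size

    def __len__(self):  # number of batches per epoch, not number of samples
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):  # return the idx-th batch as a pair of arrays
        lo = idx * self.batch_size
        hi = lo + self.batch_size
        return self.x[lo:hi], self.y[lo:hi]

    def on_epoch_end(self):  # shuffle x and y with one shared permutation
        perm = np.random.permutation(len(self.x))
        self.x, self.y = self.x[perm], self.y[perm]

# model.fit(BatchedDataset(x, y), epochs=2) would then consume one batch per step.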

PyTorch:

import torch
from torch.utils import data


class createDataset(data.Dataset):
    # To plug into torch.utils.data.DataLoader() later, a dataset generally subclasses torch.utils.data.Dataset
    # The three magic methods __init__, __len__, and __getitem__ are required
    def __init__(self, x, y):  # do any initialization and preprocessing here
        self.x, self.y = x, y
        self.preprocess()

    def __len__(self):  # the size of the dataset must be known
        return min(len(self.x), len(self.y))

    def __getitem__(self, idx):  # defines how to fetch the element at index idx, i.e. self[key], so the dataset can be read in a loop
        bx = self.x[idx]
        by = self.y[idx]
        return bx, by

    def preprocess(self):  # define some preprocessing
        "---do something---"
        self.x = [2 * i for i in self.x]
        self.y = [300 * i for i in self.y]

if __name__ == "__main__":
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    y = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    BATCH_SIZE = 2
    NUM_EPOCHS = 100
    train_dataset = createDataset(x, y)
    train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    for epoch in range(NUM_EPOCHS):
        for batch_idx, (bx, by) in enumerate(train_dataloader):
            # the DataLoader already yields tensors; torch.autograd.Variable has been deprecated since PyTorch 0.4
            print(bx, by)
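
For a toy case like this, the same pipeline can also be built without a custom class: torch.utils.data.TensorDataset wraps pre-built tensors and yields (x, y) tuples just as createDataset does. A minimal sketch, assuming float32 tensors and the same illustrative preprocessing:

import torch
from torch.utils.data import TensorDataset, DataLoader

x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
y = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
xs = torch.tensor([2 * i for i in x], dtype=torch.float32)  # same transform as preprocess()
ys = torch.tensor([300 * i for i in y], dtype=torch.float32)
loader = DataLoader(TensorDataset(xs, ys), batch_size=2, shuffle=True)
for bx, by in loader:  # each iteration yields one shuffled batch of tensors
    print(bx, by)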

While testing the train_dataset I wrote, I ran into the following problem, which struck me as very strange:

class createDataset1():
    # The three magic methods __init__, __len__, and __getitem__ are required
    def __init__(self, x, y):  # do any initialization and preprocessing here
        self.x, self.y = x, y
        self.preprocess()

    def __len__(self):  # the size of the dataset must be known
        return min(len(self.x), len(self.y))

    def __getitem__(self, idx):  # defines how to fetch the element at index idx, i.e. self[key]
        bx = self.x[idx]
        by = self.y[idx]
        yield bx, by  # note: yield

    def preprocess(self):  # define some preprocessing
        "---do something---"
        self.x = [2 * i for i in self.x]
        self.y = [300 * i for i in self.y]


class createDataset2():
    # The three magic methods __init__, __len__, and __getitem__ are required
    def __init__(self, x, y):  # do any initialization and preprocessing here
        self.x, self.y = x, y
        self.preprocess()

    def __len__(self):  # the size of the dataset must be known
        return min(len(self.x), len(self.y))

    def __getitem__(self, idx):  # defines how to fetch the element at index idx, i.e. self[key]
        bx = self.x[idx]
        by = self.y[idx]
        return bx, by  # note: return

    def preprocess(self):  # define some preprocessing
        "---do something---"
        self.x = [2 * i for i in self.x]
        self.y = [300 * i for i in self.y]



if __name__ == "__main__":
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    y = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
    # train_dataset = createDataset1(x, y)
    train_dataset = createDataset2(x, y)
    print(type(train_dataset))
    for i in train_dataset:
        for j in i:
            print(i)
            print(j)

Even though the two createDataset classes look identical, the outputs are completely different!

When I use train_dataset = createDataset1(x, y) and comment out train_dataset = createDataset2(x, y),

the output is:

Traceback (most recent call last):
  File "/.../3.py", line 48, in <module>
    for j in i:
  File "/.../3.py", line 11, in __getitem__
    bx = self.x[idx]
IndexError: list index out of range
<class '__main__.createDataset1'>
<generator object createDataset1.__getitem__ at 0x7fd058e8ef20>
(2, 0)
<generator object createDataset1.__getitem__ at 0x7fd058e8eeb0>
(4, 2700)
<generator object createDataset1.__getitem__ at 0x7fd058e8ef20>
(6, 2400)
<generator object createDataset1.__getitem__ at 0x7fd058e8eeb0>
(8, 2100)
<generator object createDataset1.__getitem__ at 0x7fd058e8ef20>
(10, 1800)
<generator object createDataset1.__getitem__ at 0x7fd058e8eeb0>
(12, 1500)
<generator object createDataset1.__getitem__ at 0x7fd058e8ef20>
(14, 1200)
<generator object createDataset1.__getitem__ at 0x7fd058e8eeb0>
(16, 900)
<generator object createDataset1.__getitem__ at 0x7fd058e8ef20>
(18, 600)
<generator object createDataset1.__getitem__ at 0x7fd058e8eeb0>
(0, 300)

When I use train_dataset = createDataset2(x, y) and comment out train_dataset = createDataset1(x, y),

the output is:

<class '__main__.createDataset2'>
(2, 0)
2
(2, 0)
0
(4, 2700)
4
(4, 2700)
2700
(6, 2400)
6
(6, 2400)
2400
(8, 2100)
8
(8, 2100)
2100
(10, 1800)
10
(10, 1800)
1800
(12, 1500)
12
(12, 1500)
1500
(14, 1200)
14
(14, 1200)
1200
(16, 900)
16
(16, 900)
900
(18, 600)
18
(18, 600)
600
(0, 300)
0
(0, 300)
300

Why does this happen?

I haven't figured it out yet; if anyone knows, please tell me. Maybe I'm just going blind and can't see where the problem is.

Leaving a marker here for now! If anyone knows, please do let me know!
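
For anyone who lands here with the same question, one plausible explanation, sketched with two hypothetical toy classes below: a def whose body contains yield is a generator function, so calling createDataset1's __getitem__ never runs its body at all; self[idx] just returns a generator object. The inner for j in i loop is what finally executes the body, which is also why the IndexError at idx == 10 escapes from the inner loop as a traceback, instead of quietly terminating the outer loop the way it does for createDataset2.

class WithYield:
    def __getitem__(self, idx):
        yield idx  # a yield anywhere makes this a generator function; the body does not run at call time

class WithReturn:
    def __getitem__(self, idx):
        return idx

print(WithYield()[0])   # <generator object WithYield.__getitem__ at 0x...>
print(WithReturn()[0])  # 0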
