【PyTorch】从0开始的数据集制作:从numpy数组到Dataset,再到DataLoader
所需import的库
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
1.首先收集想要解决问题的场景下的数据,这里我是自己生成了一组数据(1000个点的sin函数带有噪声)
'''
首先生成一个sin函数,作为伪时间序列数据
'''
# Build a pseudo time series: T samples of a sine wave with additive noise.
T = 1000
x = torch.arange(1, T + 1, dtype=torch.float32)
# torch.normal(0, 0.1, (T,)) draws T Gaussian samples with mean 0 and standard
# deviation 0.1, so the noise is not confined to a fixed interval such as
# [0, 0.2).
y = torch.sin(0.01 * x) + torch.normal(0, 0.1, (T,))
# Plot the noisy series against the sample positions.
plt.plot(x, y)
plt.show()
数据绘制的图为:
2.我的场景是利用序列的前8个数,预测第9个数,所以在上面数据的基础之上,我要自己切分数据,形成数据集的基本特点(features-target对)
def data_prediction(data, num_features):
    """Slice a series into sliding-window (features, target) pairs.

    Sample ``i`` uses ``data[i:i + num_features]`` as its features and
    ``data[i + num_features]`` as its target.

    Args:
        data: Sequence of values; anything ``np.asarray`` can convert
            (for example a list, a NumPy array or a CPU tensor).
        num_features: Number of consecutive values used as features.

    Returns:
        A tuple ``(features, target)`` of NumPy arrays. ``features`` has
        shape ``(n_samples, num_features)`` and ``target`` has ``n_samples``
        entries, where ``n_samples = max(len(data) - num_features, 0)``.

    Raises:
        ValueError: If ``num_features`` is negative.
    """
    if num_features < 0:
        raise ValueError("num_features must be non-negative")

    series = np.asarray(data)
    n_samples = max(len(series) - num_features, 0)

    # Row i of the index grid is [i, i + 1, ..., i + num_features - 1].
    # Indexing with an integer array returns a new array, so the result does
    # not alias the input, and it keeps the (0, num_features) shape when the
    # series is too short to produce any sample.
    window_index = np.arange(n_samples)[:, None] + np.arange(num_features)[None, :]
    features = series[window_index]

    # Copy so the targets do not share memory with the caller's data.
    target = series[num_features:].copy()
    return features, target
下面就是直接调用写好的函数,返回features和target数据包
'''
第一步,创建好数据集,即对应好训练数据与目标数据#
我们的目标是用前面序列前面八个值的大小预测第9个值
第一条feature-target应该为 features:(y0, y1, y2...,y7),target:(y8)
最后一条feature-target应该为 features:(y991, y992, y993...,y998),target:(y999)
'''
# Step 1: build sliding-window samples from the series y. Every 8 consecutive
# values form the features and the value that follows them is the target.
dataset_features, dataset_target = data_prediction(y, 8)
3.数据集的拆分,我是精确地拆成了四个数组,分别是训练集的特征,训练集的target,测试集的特征,测试集的target
def dataset_split(data_features, data_target, ratio=0.8):
    """Split aligned features and targets into a training and a test part.

    The split is positional (no shuffling): the first
    ``int(ratio * len(data_features))`` samples form the training part and
    the remaining samples form the test part.

    Args:
        data_features: Sliceable sequence of feature rows.
        data_target: Sliceable sequence of targets, one per feature row.
        ratio: Fraction of the samples assigned to the training part.

    Returns:
        ``(train_features, train_target, test_features, test_target)``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]`` or the two inputs
            have different lengths.
    """
    # A ratio outside [0, 1] would produce a negative or oversized index and
    # silently slice from the wrong end.
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1")

    n_samples = len(data_features)
    if len(data_target) != n_samples:
        raise ValueError("data_features and data_target must have the same length")

    split_index = int(ratio * n_samples)
    return (
        data_features[:split_index],
        data_target[:split_index],
        data_features[split_index:],
        data_target[split_index:],
    )
然后将步骤2的输出调用拆分成我们需要的几个数据包:
'''
第二步,将数据集进行拆分,分成训练集和测试集
'''
# Step 2: split the arrays into training and test parts using dataset_split's
# default ratio of 0.8.
# NOTE(review): "trian_features" looks like a typo for "train_features"; the
# name is kept because the later Dataset construction refers to it.
trian_features, train_target, test_features, test_target = dataset_split(dataset_features, dataset_target)
4.然后就是将上面准备好的数据写成Dataset类,因为DataLoader接收dataset参数时,传入的对象必须是Dataset类的实例。这一步是难点,因为我们需要自己写一个Dataset类:根据官方的定义,只要让自己的类继承Dataset,并在其中定义`__getitem__`和`__len__`这两个魔术方法,就能正确地创建一个Dataset的子类。更详细的内容请参考这篇:有关Dataset和DataLoader
class dataset_prediction(Dataset):
    """Map-style Dataset pairing each feature row with its target.

    Instances can be passed to a DataLoader; indexing returns a
    ``(features, target)`` tuple of tensors.
    """

    def __init__(self, data_features, data_target):
        """Store the features and targets as tensors.

        Args:
            data_features: Feature rows; a NumPy array, or any other input
                accepted by ``torch.as_tensor`` (list, tensor, ...).
            data_target: Targets, one per feature row, same kinds of input.

        Raises:
            ValueError: If the two inputs have different lengths.
        """
        if len(data_features) != len(data_target):
            raise ValueError("data_features and data_target must have the same length")
        super().__init__()
        self.len = len(data_features)
        # torch.as_tensor shares memory with a NumPy array just like
        # torch.from_numpy does, but also accepts lists and tensors.
        self.features = torch.as_tensor(data_features)
        self.target = torch.as_tensor(data_target)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.features[index], self.target[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len
实例化我们所创建的类:
'''
第三步,将刚才的数据集转换成Dataset类
'''
# Step 3: wrap each split in dataset_prediction so it can be handed to a
# DataLoader.
train_set = dataset_prediction(data_features=trian_features, data_target=train_target)
test_set = dataset_prediction(data_features=test_features, data_target=test_target)
# Print the number of samples in each split.
print(len(train_set))
print(len(test_set))
5.将步骤4处理好的数据转换成DataLoader类,下一步就可以送入训练函数了
# Step 4: wrap both datasets in DataLoaders and iterate over the mini-batches.
BATCH_SIZE = 16

# Training batches are shuffled, and the last incomplete batch is dropped so
# that every training batch holds exactly BATCH_SIZE samples.
train_set_iter = DataLoader(dataset=train_set,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            drop_last=True)
print('开始训练')
# The loop variables are deliberately not called x and y, so they do not
# overwrite the module-level series of the same names.
for batch_index, (batch_features, batch_target) in enumerate(train_set_iter):
    print('batch_index:', batch_index)
    print(batch_features.shape)
    print(batch_target.shape)

# Evaluation should visit every test sample once, in a fixed order: no
# shuffling, and the final partial batch is kept instead of being discarded.
test_set_iter = DataLoader(dataset=test_set,
                           batch_size=BATCH_SIZE,
                           shuffle=False,
                           drop_last=False)
print('开始测试')
for batch_index, (batch_features, batch_target) in enumerate(test_set_iter):
    print('batch_index:', batch_index)
    print(batch_features.shape)
    print(batch_target.shape)
6.下面是整个py文件,可以直接运行
# 用户:Ejemplarr
# 编写时间:2022/3/24 22:11
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import matplotlib.pyplot as plt
# Generate a noisy sine wave to serve as a pseudo time series.
T = 1000
x = torch.arange(1, T + 1, dtype=torch.float32)
# The noise is Gaussian with mean 0 and standard deviation 0.1; it is not
# confined to a fixed interval such as [0, 0.2).
y = torch.sin(0.01 * x) + torch.normal(0, 0.1, (T,))

# Dataset preparation pipeline used by this script:
# 1. data_prediction slices the series into (features, target) NumPy arrays.
# 2. dataset_split cuts those arrays, in order, into a training part and a
#    test part (torch.utils.data.random_split is not used).
# 3. dataset_prediction wraps each part in a Dataset.
# 4. DataLoader then provides mini-batching and shuffling.
def data_prediction(data, num_features):
    """Slice a series into sliding-window (features, target) pairs.

    Sample ``i`` uses ``data[i:i + num_features]`` as its features and
    ``data[i + num_features]`` as its target.

    Args:
        data: Sequence of values; anything ``np.asarray`` can convert
            (for example a list, a NumPy array or a CPU tensor).
        num_features: Number of consecutive values used as features.

    Returns:
        A tuple ``(features, target)`` of NumPy arrays. ``features`` has
        shape ``(n_samples, num_features)`` and ``target`` has ``n_samples``
        entries, where ``n_samples = max(len(data) - num_features, 0)``.

    Raises:
        ValueError: If ``num_features`` is negative.
    """
    if num_features < 0:
        raise ValueError("num_features must be non-negative")

    series = np.asarray(data)
    n_samples = max(len(series) - num_features, 0)

    # Row i of the index grid is [i, i + 1, ..., i + num_features - 1].
    # Indexing with an integer array returns a new array, so the result does
    # not alias the input, and it keeps the (0, num_features) shape when the
    # series is too short to produce any sample.
    window_index = np.arange(n_samples)[:, None] + np.arange(num_features)[None, :]
    features = series[window_index]

    # Copy so the targets do not share memory with the caller's data.
    target = series[num_features:].copy()
    return features, target
class dataset_prediction(Dataset):
    """Map-style Dataset pairing each feature row with its target.

    Instances can be passed to a DataLoader; indexing returns a
    ``(features, target)`` tuple of tensors.
    """

    def __init__(self, data_features, data_target):
        """Store the features and targets as tensors.

        Args:
            data_features: Feature rows; a NumPy array, or any other input
                accepted by ``torch.as_tensor`` (list, tensor, ...).
            data_target: Targets, one per feature row, same kinds of input.

        Raises:
            ValueError: If the two inputs have different lengths.
        """
        if len(data_features) != len(data_target):
            raise ValueError("data_features and data_target must have the same length")
        super().__init__()
        self.len = len(data_features)
        # torch.as_tensor shares memory with a NumPy array just like
        # torch.from_numpy does, but also accepts lists and tensors.
        self.features = torch.as_tensor(data_features)
        self.target = torch.as_tensor(data_target)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.features[index], self.target[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len
def dataset_split(data_features, data_target, ratio=0.8):
    """Split aligned features and targets into a training and a test part.

    The split is positional (no shuffling): the first
    ``int(ratio * len(data_features))`` samples form the training part and
    the remaining samples form the test part.

    Args:
        data_features: Sliceable sequence of feature rows.
        data_target: Sliceable sequence of targets, one per feature row.
        ratio: Fraction of the samples assigned to the training part.

    Returns:
        ``(train_features, train_target, test_features, test_target)``.

    Raises:
        ValueError: If ``ratio`` is outside ``[0, 1]`` or the two inputs
            have different lengths.
    """
    # A ratio outside [0, 1] would produce a negative or oversized index and
    # silently slice from the wrong end.
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1")

    n_samples = len(data_features)
    if len(data_target) != n_samples:
        raise ValueError("data_features and data_target must have the same length")

    split_index = int(ratio * n_samples)
    return (
        data_features[:split_index],
        data_target[:split_index],
        data_features[split_index:],
        data_target[split_index:],
    )
if __name__ == '__main__':
    # Show the raw noisy series before building the dataset.
    plt.plot(x, y)
    plt.show()

    # Step 1: build (features, target) pairs from the series. Each sample
    # uses 8 consecutive values to predict the value that follows them:
    #   first sample: features (y0, ..., y7),     target y8
    #   last sample:  features (y991, ..., y998), target y999
    dataset_features, dataset_target = data_prediction(y, 8)

    # Step 2: split the arrays, in order, into a training and a test part
    # (dataset_split's default ratio of 0.8).
    train_features, train_target, test_features, test_target = dataset_split(
        dataset_features, dataset_target)

    # Step 3: wrap each part in a Dataset so a DataLoader can consume it.
    train_set = dataset_prediction(data_features=train_features, data_target=train_target)
    test_set = dataset_prediction(data_features=test_features, data_target=test_target)

    # Step 4: build the DataLoaders and iterate over the mini-batches.
    BATCH_SIZE = 16

    # Training batches are shuffled, and the last incomplete batch is dropped
    # so that every training batch holds exactly BATCH_SIZE samples.
    train_set_iter = DataLoader(dataset=train_set,
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                drop_last=True)
    print('开始训练')
    # The loop variables are deliberately not called x and y, so they do not
    # overwrite the module-level series of the same names.
    for batch_index, (batch_features, batch_target) in enumerate(train_set_iter):
        print('batch_index:', batch_index)
        print(batch_features.shape)
        print(batch_target.shape)

    # Evaluation should visit every test sample once, in a fixed order: no
    # shuffling, and the final partial batch is kept instead of discarded.
    test_set_iter = DataLoader(dataset=test_set,
                               batch_size=BATCH_SIZE,
                               shuffle=False,
                               drop_last=False)
    print('开始测试')
    for batch_index, (batch_features, batch_target) in enumerate(test_set_iter):
        print('batch_index:', batch_index)
        print(batch_features.shape)
        print(batch_target.shape)
谢谢阅读!!!欢迎交流!!!