使用pytorch训练自己的数据集,用到Dataset和DataLoader
1、定义GetLoader类,继承Dataset类,并重写__getitem__()和__len__()方法。
import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from torchsummary import summary
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
# 定义GetLoader类,继承Dataset类,并重写__getitem__()和__len__()方法
class GetLoader(torch.utils.data.Dataset):
    """Minimal map-style Dataset wrapping pre-loaded samples and labels.

    Stores references to two index-aligned sequences (e.g. numpy arrays)
    and serves (sample, label) pairs to a DataLoader.
    """

    def __init__(self, data_root, data_label):
        # Keep references only; the caller owns the underlying arrays.
        self.data = data_root
        self.label = data_label

    def __getitem__(self, index):
        # Return the (sample, label) pair at the requested position.
        return self.data[index], self.label[index]

    def __len__(self):
        # DataLoader needs the total length to plan batching.
        return len(self.data)
# Load the heart-disease CSV; each row is one sample, 'target' is the label.
csv_file = 'data/heart.csv'
df = pd.read_csv(csv_file)
# Encode the object-typed 'thal' column as discrete integer category codes.
df['thal'] = pd.Categorical(df['thal'])
df['thal'] = df.thal.cat.codes
# Separate label from features, then hold out a test set so the model can be
# trained on one split and honestly evaluated on the other.
source_label = df['target']
source_data = df[[x for x in df.columns if x != 'target']]
# 80% / 20% train/test split; a fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(source_data, source_label, test_size=0.2,
                                                    random_state=2020)
# Wrap the numpy arrays in Dataset objects so DataLoader can batch them.
torch_train_data = GetLoader(x_train.values, y_train.values)
torch_test_data = GetLoader(x_test.values, y_test.values)
batch_size = 2
# Shuffle only the training data; keep the test order deterministic.
train_dataloader = DataLoader(torch_train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(torch_test_data, batch_size=batch_size, shuffle=False)
# Peek at one batch to sanity-check tensor shapes and dtypes.
for X, y in train_dataloader:
    print("X shape:{} {}".format(X.shape, X.dtype))
    print("y shape:{} {}".format(y.shape, y.dtype))
    break
# Train on GPU when available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
# 自定义网络结构
class LinearNet(nn.Module):
    """Small MLP over 13 input features that emits one raw logit per sample."""

    def __init__(self):
        super(LinearNet, self).__init__()
        # Normalize the raw features, then map 13 -> 20 -> 10 -> 1.
        self.linear_relu = nn.Sequential(
            nn.BatchNorm1d(13),
            nn.Linear(13, 20),
            nn.ReLU(),
            nn.Linear(20, 10),
            nn.ReLU(),
            nn.Linear(10, 1),
        )

    def forward(self, x):
        # No sigmoid here: the loss (BCEWithLogitsLoss) consumes raw logits.
        return self.linear_relu(x)
model = LinearNet().to(device)
# Print the layer structure for a quick sanity check.
print(model)
# summary(LinearNet(), (13,))
# Binary classification on raw logits -> BCEWithLogitsLoss (sigmoid built in).
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
def data_train(dataloader, model, loss_fn, optimizer):
    """Run one training epoch over *dataloader*.

    Args:
        dataloader: yields (X, y) batches; X are features, y binary labels.
        model: network to optimize; its parameters determine the device.
        loss_fn: loss applied to raw logits (e.g. nn.BCEWithLogitsLoss).
        optimizer: torch optimizer built over model.parameters().
    """
    size = len(dataloader.dataset)
    # Derive the device from the model instead of relying on a module-level
    # global, so the function also works when imported from other code.
    device = next(model.parameters()).device
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        # BCEWithLogitsLoss expects targets shaped (N, 1) to match the logits.
        y = y.view(-1, 1)
        # Compute prediction error (inputs cast to float32 for the linear layers).
        pred = model(X.float())
        loss = loss_fn(pred, y.float())
        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            # Don't rebind `loss` to a float; keep the tensor name intact.
            current = batch * len(X)
            print(f"loss: {loss.item():>7f} [{current:>5d}/{size:>5d}]")
def data_test(dataloader, model, loss_fn):
    """Evaluate the model on *dataloader*; returns (avg_loss, accuracy).

    Bug fixes vs. the original:
    - `pred.argmax(1)` on a (N, 1) single-logit output is always 0, so the
      old "accuracy" only counted 0-labels. A single-logit binary classifier
      must threshold the sigmoid of the logit at 0.5 instead.
    - Accuracy divided by `size * batch_size`, double-counting the batch
      size; the denominator is simply the dataset size.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Derive the device from the model instead of a module-level global.
    device = next(model.parameters()).device
    model.eval()
    test_loss, correct = 0.0, 0.0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            # Match the (N, 1) logit shape expected by BCEWithLogitsLoss.
            y = y.view(-1, 1)
            pred = model(X.float())
            test_loss += loss_fn(pred, y.float()).item()
            # Threshold sigmoid(logit) at 0.5 to get the predicted class.
            predicted = (torch.sigmoid(pred) >= 0.5).float()
            correct += (predicted == y.float()).sum().item()
    test_loss /= num_batches
    acc = correct / size
    print(f"Test Error: \n Accuracy: {(100 * acc):>0.4f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss, acc
# Train for a fixed number of epochs, evaluating on the test set after each.
epoch = 5
for t in range(epoch):
    print("Epoch {}------------".format(t + 1))
    data_train(train_dataloader, model, loss_fn, optimizer)
    data_test(test_dataloader, model, loss_fn)
print("Done!")
# # 保存模型
# # 保存网络中的参数, 速度快,占空间少
# path = 'model/model01.pth'
# torch.save(model.state_dict(), path)
# print("saved model!")
# # 加载模型
# model = LinearNet()
# model.load_state_dict(torch.load(path))
#
# model.eval()
# x, y = torch_test_data[0][0], torch_test_data[0][1]
# x = torch.tensor(x, dtype=torch.float)
# print(x)
# with torch.no_grad():
# pred = model(x)
# predicted, actual = pred[0].argmax(0), y
# print("Predicted: {}, Actual: {}".format(predicted, actual))
使用flake8进行检测:
flake8 项目名字 --max-line-length=120