使用自己的数据利用pytorch搭建全连接神经网络进行分类预测
这段代码实现了一个神经网络模型在数据集上的训练和测试。具体流程如下:
1、导入库
引入必要的库,包括PyTorch、Pandas等。
2、自定义函数
(1)定义数据预处理函数zscore(),用于将数据进行标准化处理。
(2)定义MyDataset类,继承自pytorch的Dataset类,用于加载数据集并封装为可迭代对象。
(3)定义神经网络模型Net类,继承自nn.Module类,包含若干个全连接层和Dropout层,以及ReLU激活函数。
(4)定义训练模型函数train_model(),用于对模型进行训练,并返回每次训练的损失值和精度。
(5)定义测试模型函数test_model(),用于对模型进行测试,并返回每次测试的损失值和精度。
3、定义主函数main()
(1)定义超参数,包括输入数据的维度、输出类别数、学习率、训练代数、批次大小、dropout比例等。
(2)加载数据集,提取特征和标签,将数据集分割为训练集和测试集,并进行标准化处理。
(3)创建tensor数据集,创建训练集和测试集的DataLoader对象,用于批量读取数据。
(4)初始化模型、损失函数和优化器。
(5)对模型进行训练,并输出每次训练和测试的损失值和精度。
4、执行
主函数被调用以执行整个程序。
5、完整代码部分
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
def zscore(train, test):
    """Standardize both splits using statistics fitted on the training split.

    Fitting only on `train` and reusing the same scaler for `test`
    avoids leaking test-set information into the preprocessing step.
    Returns the (standardized_train, standardized_test) pair.
    """
    scaler = preprocessing.StandardScaler()
    scaled_train = scaler.fit_transform(train)
    scaled_test = scaler.transform(test)
    return scaled_train, scaled_test
class MyDataset(Dataset):
    """Wrap parallel arrays of samples and labels as a map-style PyTorch Dataset."""

    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Convert lazily on access: float32 features and int64 targets are
        # the dtypes expected by nn.Linear and nn.CrossEntropyLoss.
        features = torch.tensor(self.data[index], dtype=torch.float32)
        target = torch.tensor(self.labels[index], dtype=torch.long)
        return features, target
# Fully connected classification network
class Net(nn.Module):
    """MLP for tabular classification.

    Architecture: input -> 128 -> 512 -> 256 -> 128 -> output, with ReLU
    and Dropout after every hidden layer. The last layer emits raw logits
    (CrossEntropyLoss applies log-softmax itself).
    """

    def __init__(self, input_size, output_size, dropout_prob):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, output_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Every hidden layer uses the same linear -> ReLU -> dropout pipeline.
        for hidden in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.dropout(self.relu(hidden(x)))
        return self.fc5(x)
# One training epoch
def train_model(model, train_loader, criterion, optimizer):
    """Train `model` for one full pass over `train_loader`.

    Returns:
        tuple: (mean per-batch loss, accuracy as a percentage) for the epoch.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()
        loss_sum += batch_loss.item()
        # Predicted class = index of the largest logit per row.
        predictions = logits.detach().argmax(dim=1)
        n_seen += targets.size(0)
        n_correct += (predictions == targets).sum().item()
    epoch_loss = loss_sum / len(train_loader)
    epoch_acc = 100 * n_correct / n_seen
    return epoch_loss, epoch_acc
# 测试模型
def test_model(model, test_loader, criterion):
model.eval()
running_loss = 0.0
correct = 0
total = 0
with torch.no_grad():
for data in test_loader:
inputs, labels = data
outputs = model(inputs)
loss = criterion(outputs, labels)
running_loss += loss.item()
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
test_loss = running_loss / len(test_loader)
test_acc = 100 * correct / total
return test_loss, test_acc
# Entry point: load the CSV dataset, train the MLP, report per-epoch metrics
def main():
    """Load 'dataset.csv', split/standardize it, then train and evaluate
    the Net classifier, printing loss and accuracy every epoch."""
    # Hyperparameters
    input_size = 9  # input feature dimension; must equal len(feature_select)
    output_size = 4  # number of target classes
    learning_rate = 0.005
    num_epochs = 300  # number of training epochs
    batch_size = 128
    dropout_prob = 0.2  # dropout probability in the hidden layers
    data_path = 'dataset.csv'  # path to the data file
    feature_select = ['timestamp', 'day_of_week', 'is_weekend', 'is_holiday', 'temperature',
                      'is_start_of_semester', 'is_during_semester', 'month', 'hour']
    # Image-only preprocessing; not used for this tabular data
    # transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(0, 1)])
    # load data
    data = pd.read_csv(data_path, engine='python')
    # Extract feature columns and the label column
    # NOTE(review): assumes the CSV has a "label" column and all feature_select
    # columns — verify against the actual dataset file.
    x_data, y_data = data[feature_select], data["label"]
    test_size = 0.25
    # 75%/25% train/test split (the original comment claimed 20%, but
    # test_size is 0.25)
    x_train, x_test, y_train, y_test = train_test_split(x_data.values, y_data.values, test_size=test_size, random_state=0)
    # Standardize using statistics from the training split only
    x_train, x_test = zscore(x_train, x_test)
    # Wrap the splits as tensor-producing datasets
    train_dataset = MyDataset(x_train, y_train)
    test_dataset = MyDataset(x_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    # Initialize the model, loss function and optimizer
    model = Net(input_size, output_size, dropout_prob)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Train and evaluate once per epoch
    for epoch in range(num_epochs):
        train_loss, train_acc = train_model(model, train_loader, criterion, optimizer)
        test_loss, test_acc = test_model(model, test_loader, criterion)
        print('Epoch [{}/{}], Train Loss: {:.4f}, Train Acc: {:.2f}%, Test Loss: {:.4f}, Test Acc: {:.2f}%'.format(
            epoch + 1, num_epochs, train_loss, train_acc, test_loss, test_acc))
# Run the training pipeline only when executed as a script, not on import
if __name__ == '__main__':
    main()