PyTorch深度学习实践概论笔记8练习-kaggle的Titanic数据集预测（三）构建模型（使用DataLoader类）

本文链接：https://blog.csdn.net/csdn_xmj/article/details/122586999

接着文章PyTorch深度学习实践概论笔记8练习-kaggle的Titanic数据集预测（一）数据分析，我们构建模型来预测人员是否存活，然后提交到 kaggle的Titanic - Machine Learning from Disaster | Kaggle，查看成绩。

1 模型1

3个线性层43-64-16-2。

1.1 使用Dataset和DataLoader类读取数据

导入相应库。

import pandas as pd
import time
import torch
from torch.utils.data import Dataset,DataLoader

#test其实也有label属性，定义Dataset类的时候同理，不是pd的那种处理方式！

定义TitanicDataset类，代码如下：

class TitanicDataset(Dataset):
    def __init__(self,path):
        xy_data = pd.read_csv(path)
        self.len = xy_data.shape[0]
        titanic_data_X = xy_data.drop(['Survived'], axis=1)
        titanic_data_Y = xy_data['Survived']
        # 转化tensor格式
        self.xy_data = torch.from_numpy(titanic_data_X.values).float()
        self.xy_label = torch.from_numpy(titanic_data_Y.values).float()

    def __getitem__(self,index):
            return self.xy_data[index],self.xy_label[index]
    
    def __len__(self):
        return self.len

titanic_train = TitanicDataset("./titanic/titanic_train.csv")
train_loader = DataLoader(dataset=titanic_train,batch_size=16,
                          shuffle=True,num_workers=1)
titanic_test = TitanicDataset("./titanic/titanic_test.csv")
test_loader = DataLoader(dataset=titanic_test,batch_size=16,
                          shuffle=False,num_workers=1)

1.2 构造模型

class Net1(torch.nn.Module):
    def __init__(self):
        super(Net1, self).__init__()
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(43, 64),  #43  
            torch.nn.ReLU(),
            torch.nn.Linear(64, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 2),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, x):
#         print(x.shape)  #torch.Size([16, 43])
        return self.fc(x)

net1 = Net1()
print(net1)

输出结果如下：

Net1(
  (fc): Sequential(
    (0): Linear(in_features=43, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=2, bias=True)
    (5): Softmax(dim=1)
  )
)

1.3 损失函数和优化器

criterion = torch.nn.CrossEntropyLoss()  # 定义损失函数
optimizer = torch.optim.Adam(net1.parameters(), lr=0.001)  # 定义优化器

1.4 训练模型

if __name__ == '__main__':
    start = time.time()
    for epoch in range(5):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data  # 获取数据
            optimizer.zero_grad()  # 清空梯度缓存

            outputs = net1(inputs)
            outputs = outputs.squeeze(-1)
            loss = criterion(outputs, labels.long())
            loss.backward()  
            optimizer.step()  
            running_loss += loss.item()
            if i % 20 == 19:
                # 每 20 次迭代打印一次信息
                print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss/2000))
                running_loss = 0.0
    print('Finish Training! Total cost time: ', time.time()-start)

输出如下：

[1,    20] loss: 0.007
[1,    40] loss: 0.006
[2,    20] loss: 0.006
[2,    40] loss: 0.005
[3,    20] loss: 0.005
[3,    40] loss: 0.005
[4,    20] loss: 0.005
[4,    40] loss: 0.005
[5,    20] loss: 0.005
[5,    40] loss: 0.005
Finish Training! Total cost time:  0.8926229476928711

1.5 验证模型

#初始化数值
val_correct = 0
total = 0
with torch.no_grad():
    for data in train_loader:
        inputs, labels = data
        outputs = net1(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        val_correct += (predicted == labels).sum().item()
print('Accuracy of the network: %d %%' %
      (val_correct / total * 100))

输出如下：

Accuracy of the network: 85 %

1.6 测试模型

测试模型，作者的最终提交准确率 78.708%。

with torch.no_grad():
    for data in test_loader:
        inputs,labels = data  # 获取数据
        outputs = torch.max(net1(inputs),1)[1]

submission = pd.read_csv('./titanic/gender_submission.csv')
submission['Survived'] = output
submission.to_csv('./titanic/gender_submission_result1.csv', index=False)

模型1的提交准确率为78.708%。

于是，继续优化。

2 模型2

4个线性层加上dropout，42-64-32(p=0.1)-32-16-2

2.2 构造模型

class Net2(nn.Module):
    def __init__(self):
        super(Net2, self).__init__()
        self.fc = nn.Sequential(
            torch.nn.Linear(43, 64),  #43
            torch.nn.ReLU(),
            torch.nn.Linear(64, 32),
            torch.nn.Dropout(p=0.1),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 2),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.fc(x)


net2 = Net2()
print(net2)

输出模型如下：

Net2(
  (fc): Sequential(
    (0): Linear(in_features=43, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=32, bias=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=16, bias=True)
    (6): ReLU()
    (7): Linear(in_features=16, out_features=2, bias=True)
    (8): Softmax(dim=1)
  )
)

2.4 训练模型

#训练模型
if __name__ == '__main__':
    start = time.time()
    for epoch in range(10):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data  # 获取数据
            optimizer.zero_grad()  # 清空梯度缓存

            outputs = net2(inputs)
            outputs = outputs.squeeze(-1)
            loss = criterion(outputs, labels.long())
            loss.backward()  
            optimizer.step()  
            running_loss += loss.item()
            if i % 20 == 19:
                # 每 20 次迭代打印一次信息
                print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss/2000))
                running_loss = 0.0
    print('Finish Training! Total cost time: ', time.time()-start)

输出结果：

[1,    20] loss: 0.007
[1,    40] loss: 0.007
[2,    20] loss: 0.006
[2,    40] loss: 0.005
[3,    20] loss: 0.005
[3,    40] loss: 0.005
[4,    20] loss: 0.004
[4,    40] loss: 0.005
[5,    20] loss: 0.005
[5,    40] loss: 0.005
Finish Training! Total cost time:  2.1204745769500732

2.5 验证模型

Accuracy of the network: 86 %。

2.6 测试模型

模型2的提交准确率为78.947%，提交的时候排名1376，继续优化。

3 模型3

5个线性层加上2个dropout，42-64-128（p=0.1）-32（p=0.1）-32-16-2

3.2 构造模型

class Net3(nn.Module):
    def __init__(self):
        super(Net3, self).__init__()
        self.fc = nn.Sequential(
            torch.nn.Linear(43, 64),  #43
            torch.nn.ReLU(),
            torch.nn.Linear(64, 128),
            torch.nn.Dropout(p=0.1),
            torch.nn.ReLU(),
            torch.nn.Linear(128, 32),
            torch.nn.Dropout(p=0.1),
            torch.nn.ReLU(),
            torch.nn.Linear(32, 16),
            torch.nn.ReLU(),
            torch.nn.Linear(16, 2),
            torch.nn.Softmax(dim=1)
        )

    def forward(self, x):
        return self.fc(x)


net3 = Net3()
print(net3)

输出模型如下：

Net3(
  (fc): Sequential(
    (0): Linear(in_features=43, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): Dropout(p=0.1, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=128, out_features=32, bias=True)
    (6): Dropout(p=0.1, inplace=False)
    (7): ReLU()
    (8): Linear(in_features=32, out_features=16, bias=True)
    (9): ReLU()
    (10): Linear(in_features=16, out_features=2, bias=True)
    (11): Softmax(dim=1)
  )
)

3.4 训练模型

#训练模型
if __name__ == '__main__':
    start = time.time()
    for epoch in range(10):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data  # 获取数据
            optimizer.zero_grad()  # 清空梯度缓存

            outputs = net3(inputs)
            outputs = outputs.squeeze(-1)
            loss = criterion(outputs, labels.long())
            loss.backward()  
            optimizer.step()  
            running_loss += loss.item()
            if i % 20 == 19:
                # 每 20 次迭代打印一次信息
                print('[%d, %5d] loss: %.3f' % (epoch+1, i+1, running_loss/2000))
                running_loss = 0.0
    print('Finish Training! Total cost time: ', time.time()-start)

输出结果：

[1,    20] loss: 0.007
[1,    40] loss: 0.007
[2,    20] loss: 0.005
[2,    40] loss: 0.005
[3,    20] loss: 0.005
[3,    40] loss: 0.005
[4,    20] loss: 0.005
[4,    40] loss: 0.005
[5,    20] loss: 0.005
[5,    40] loss: 0.004
Finish Training! Total cost time:  0.9200258255004883