对数据集的分析涉及特征工程的知识了,网上有详细教程。本文的主要目的是对pytorch进行练习。我主要挑选一些重要的信息作为特征进行训练。
- PassengerID(ID)
- Survived(存活与否)
- Pclass(客舱等级,较为重要)
- Name(姓名,可提取出更多信息)
- Sex(性别,较为重要)
- Age(年龄,较为重要)
- Parch(直系亲友)
- SibSp(旁系)
- Ticket(票编号)
- Fare(票价)
- Cabin(客舱编号)
- Embarked(上船的港口编号)
主要思路:
主要选取的特征:客舱等级、性别、旁系、直系亲友、票价和年龄。年龄是有数据缺失的,我做了最简单的处理,就是取平均值填充(但是我发现不加年龄最后 Kaggle 得分更高)。最终的 Kaggle 得分是 0.76 左右。
下面是详细代码,代码使用GPU加速训练了:
import torch
import pandas as pd
from torch.utils.data import Dataset,DataLoader
import numpy as np
# Select the compute device: the first CUDA GPU if available, otherwise the CPU.
# All tensors and the model are moved to this device before training.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# prepare dataset
# 训练集
class Train_TitanicDataset(Dataset):
    """Training split (the first 80% of rows) of the Titanic CSV file.

    Yields `(features, label)` pairs as float32 tensors; the features are
    Pclass, SibSp, Parch, Fare, Age plus the one-hot encoded Sex (7 values),
    the label is the Survived column.
    """

    def __init__(self, filepath):
        # Load the raw CSV; missing ages are imputed with the column mean.
        # NOTE: the mean is computed over the whole file, before the split.
        frame = pd.read_csv(filepath)
        label_cols = ["Survived"]
        feature_cols = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]
        frame["Age"] = frame["Age"].fillna(frame["Age"].mean())
        self.len = frame.shape[0]  # total row count of the file
        # First 80% of the rows form the training split.
        self.train_data = frame[:int(self.len * 0.8)]
        # One-hot encode (only "Sex" is categorical here), then go
        # DataFrame -> ndarray -> float32 tensor for matrix computation.
        encoded = pd.get_dummies(self.train_data[feature_cols])
        self.x_data = torch.from_numpy(np.array(encoded).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.train_data[label_cols]).astype(np.float32))
        self.train_len = self.train_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.train_len
# Build the training dataset and loader: mini-batches of 32, reshuffled
# every epoch, loaded by 2 worker processes.
train_dataset = Train_TitanicDataset('train.csv')
train_loader = DataLoader(dataset = train_dataset, batch_size = 32, shuffle = True, num_workers = 2)
# 验证集
class Dev_TitanicDataset(Dataset):
    """Validation split (the last 20% of rows) of the Titanic CSV file.

    Same feature/label layout as the training split: 7 float32 feature
    values (one-hot Sex included) and the Survived label.
    """

    def __init__(self, filepath):
        # Load the raw CSV; missing ages are imputed with the column mean,
        # computed over the whole file so train and dev use the same value.
        frame = pd.read_csv(filepath)
        label_cols = ["Survived"]
        feature_cols = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]
        frame["Age"] = frame["Age"].fillna(frame["Age"].mean())
        self.len = frame.shape[0]  # total row count of the file
        # Last 20% of the rows form the validation split.
        self.dev_data = frame[int(self.len * 0.8):]
        # One-hot encode, then DataFrame -> ndarray -> float32 tensor.
        encoded = pd.get_dummies(self.dev_data[feature_cols])
        self.x_data = torch.from_numpy(np.array(encoded).astype(np.float32))
        self.y_data = torch.from_numpy(np.array(self.dev_data[label_cols]).astype(np.float32))
        self.dev_len = self.dev_data.shape[0]

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.dev_len
# Build the validation dataset and loader from the same file (last 20% of
# rows); no shuffling so evaluation order is stable across epochs.
dev_dataset = Dev_TitanicDataset('train.csv')
dev_loader = DataLoader(dataset = dev_dataset, batch_size = 8, shuffle = False, num_workers = 2)
# design model using class
class Model(torch.nn.Module):
    """Feed-forward binary classifier for the Titanic features.

    The six raw features (Pclass, Sex, SibSp, Parch, Fare, Age) become 7
    inputs after one-hot encoding, because "Sex" expands into two columns.
    Five fully connected layers taper 7 -> 6 -> 6 -> 3 -> 2 -> 1 with ReLU
    in between; the final sigmoid maps the output to a survival
    probability in [0, 1], suitable for BCELoss.
    """

    def __init__(self, in_features=7):
        # in_features defaults to 7 (the one-hot encoded width) so existing
        # callers are unchanged, but is parameterized so the same model can
        # be reused if the feature set changes.
        super(Model, self).__init__()
        self.linear1 = torch.nn.Linear(in_features, 6)
        self.linear2 = torch.nn.Linear(6, 6)
        self.linear3 = torch.nn.Linear(6, 3)
        self.linear4 = torch.nn.Linear(3, 2)
        self.linear5 = torch.nn.Linear(2, 1)
        self.relu = torch.nn.ReLU()
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, in_features) tensor to (batch, 1) probabilities."""
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.relu(self.linear3(x))
        x = self.relu(self.linear4(x))
        x = self.sigmoid(self.linear5(x))
        return x
# Instantiate the model and move its parameters to the selected device.
model = Model()
model.to(device)
# construct loss and optimizer
# Binary cross-entropy averaged over the batch; valid because the model's
# final sigmoid already outputs probabilities in [0, 1].
criterion = torch.nn.BCELoss(reduction = 'mean')
# Adam with lr=0.01; the remaining arguments are the library defaults,
# spelled out explicitly.
optimizer = torch.optim.Adam(model.parameters(),
lr=0.01,
betas=(0.9, 0.999),
eps=1e-08,
weight_decay=0,
amsgrad=False)
# training cycle forward, backward, update
def train(epoch):
    """Run one optimization epoch over train_loader and print the mean loss.

    Uses the module-level model, criterion, optimizer, device and
    train_loader. `epoch` is the 0-based epoch index (printed 1-based).
    """
    train_loss = 0.0
    batch_count = 0
    for inputs, labels in train_loader:  # loader shuffles, then mini-batches
        # Move the batch to the training device (GPU when available).
        inputs, labels = inputs.to(device), labels.to(device)
        y_pred = model(inputs)
        loss = criterion(y_pred, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        batch_count += 1
    # Average over the NUMBER of batches. The original divided by the last
    # batch index (count = i), which over-counts by one batch and raises
    # ZeroDivisionError when there is a single batch; it also added a
    # spurious 1e-6 to every batch loss, inflating the reported value.
    print('epoch:', epoch+1, 'train loss:', train_loss/max(batch_count, 1), end = ',')
# 验证
def dev():
    """Evaluate the model on dev_loader; print mean loss and accuracy.

    Runs under torch.no_grad() so no gradients are tracked. Uses the
    module-level model, criterion, device and dev_loader.
    """
    with torch.no_grad():
        correct = 0.0
        total = 0.0
        dev_mean_loss = 0.0
        batch_count = 0
        for inputs, labels in dev_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            dev_mean_loss += criterion(outputs, labels).item()
            total += labels.size(0)
            # Threshold at 0.5 by rounding the sigmoid output to 0/1.
            # (Directly on the tensor — no need for the original's
            # np.round + CPU/GPU round trip.)
            correct += (outputs.round() == labels).sum().item()
            batch_count += 1
        acc = correct / total
        # Average over the NUMBER of batches; the original divided by the
        # last batch index and added a spurious 1e-6 per batch.
        print('dev loss:', dev_mean_loss/max(batch_count, 1), 'Accuracy on dev set:', acc)
if __name__ == '__main__':
    # Train for 100 epochs, validating after each one.
    for epoch in range(100):
        train(epoch)
        dev()
    # Predict on the test set and save the result as a Kaggle submission CSV.
    test_data = pd.read_csv('test.csv')
    features = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Age"]
    # The Kaggle test set has missing Age values AND one missing Fare.
    # A NaN input propagates through the network to a NaN probability,
    # which the threshold silently maps to 0 — so both columns are
    # mean-imputed (the original only filled Age).
    test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())
    test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())
    test = torch.from_numpy(np.array(pd.get_dummies(test_data[features])).astype(np.float32)).to(device)
    with torch.no_grad():
        y_pred = model(test)
    # A probability >= 0.5 is classified as survived (1), otherwise 0.
    y = [1 if p >= 0.5 else 0 for p in y_pred]
    output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y})
    output.to_csv('my_predict.csv', index=False)