泰坦尼克号宇宙飞船(深度学习)

背景

欢迎来到2912年，这里需要你的数据科学技能来解决一个宇宙之谜。我们收到了来自四光年外的信号，情况不妙。

泰坦尼克号宇宙飞船是一个月前发射的一艘星际客轮。载着近1.3万名乘客,这艘船开始了它的处女航,将移民从我们的太阳系运送到围绕附近恒星运行的三颗新适宜居住的系外行星。

在环绕半人马座阿尔法星前往第一个目的地——炽热的巨蟹座55 e的途中，粗心的“泰坦尼克”号飞船与隐藏在尘埃云中的时空异常物相撞。可悲的是，它遭遇了与1000年前的同名者相似的命运。虽然飞船完好无损，但几乎一半的乘客被送到了另一个维度！
在这里插入图片描述
为了帮助救援人员和找回失踪的乘客,你面临的挑战是从飞船受损的计算机系统中恢复的记录中预测哪些乘客被异常运输了。


数据集预处理与深度学习模型建构如下:

import numpy as np
import pandas as pd
import torch
from torch.utils import data as data_loader
from torch import nn
from matplotlib import pyplot as plt
import seaborn as sns
from wk import utils

from sklearn.preprocessing import MinMaxScaler, StandardScaler

sns.set(style='darkgrid')
pd.set_option('display.max_columns', None)

# Load the Kaggle "Spaceship Titanic" splits from local disk.
train = pd.read_csv('F:\\deeplearning\\database\\spaceship-titanic\\train.csv')
test = pd.read_csv('F:\\deeplearning\\database\\spaceship-titanic\\test.csv')

train_num = train.shape[0]
# Concatenate train + test so imputation and encoding are applied consistently
# to both splits; the first `train_num` rows are the training data.
data = pd.concat([train, test], ignore_index=True)

# Passenger name is dropped as a non-predictive identifier.
data.drop(['Name'], axis=1, inplace=True)

# Categorical imputation.
# NOTE: assign the filled column back instead of the original chained
# `data.col.fillna(..., inplace=True)` — chained-assignment inplace is
# unreliable (SettingWithCopy-prone, a no-op under pandas copy-on-write).
# The 'Uknow' (sic) sentinel is kept byte-for-byte: downstream encoding
# (get_dummies / the VIP map) depends on this exact string.
data['HomePlanet'] = data['HomePlanet'].fillna('Uknow')
data['Destination'] = data['Destination'].fillna('TRAPPIST-1e')  # negligible effect; TRAPPIST-1e is the mode
data['CryoSleep'] = data['CryoSleep'].fillna(False)  # negligible effect; False is the mode
data['VIP'] = data['VIP'].fillna('Uknow')  # large effect; missing count comparable to VIP count

# Numeric imputation with the column mean (computed over train + test combined,
# as in the original).
for _col in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    data[_col] = data[_col].fillna(data[_col].mean())

# Extract deck / room number / side from 'Cabin' (format "deck/num/side").
# Raw-string regex literals avoid the invalid-escape SyntaxWarning.
data['Cabin_Room'] = data['Cabin'].str.extract(r'([A-Z])/', expand=False)
data['Cabin_Room'] = data['Cabin_Room'].fillna('U')  # has an effect
data['Cabin_Num'] = data['Cabin'].str.extract(r'/([0-9]+)/', expand=False)
data['Cabin_Side'] = data['Cabin'].str.extract(r'/([A-Z])', expand=False)
data['Cabin_Side'] = data['Cabin_Side'].fillna('P')  # small effect; 299 missing

# PassengerId is "gggg_pp": travel-group id and member index.
# Family size = size of the travel group. groupby(...).transform('size')
# aligns on the original index, so the original sort_values/sort_index
# round-trip was redundant and is removed.
data['Family_id'] = data['PassengerId'].str.extract(r'(\d+)_', expand=False)
data['Family_num'] = data['PassengerId'].str.extract(r'_(\d+)', expand=False)
data['Family'] = data.groupby('Family_id')['Family_num'].transform('size')
data.drop(['Family_id', 'Family_num', 'PassengerId', 'Cabin'], axis=1, inplace=True)

# print(data.info())
#
# sns.histplot(x="HomePlanet", hue="Transported", data=data, multiple="stack", palette=['red', 'blue'])
# plt.show()
#
# sns.histplot(x="CryoSleep", hue="Transported", data=data, multiple="stack", palette=['red', 'blue'])
# plt.show()
#
# sns.histplot(x="Destination", hue="Transported", data=data, multiple="stack", palette=['red', 'blue'])
# plt.show()
#
# sns.histplot(x="VIP", hue="Transported", data=data, multiple="stack", palette=['red', 'blue'])
# plt.show()
#
# sns.histplot(x="Cabin_Side", hue="Transported", data=data, multiple="stack", palette=['red', 'blue'])
# plt.show()
#
# sns.displot(x="Cabin_Num", data=data)
# plt.show()

# def CabinNum_classification(age):
#     return

# --- Feature encoding ---
# One-hot encode the multi-class categoricals; prefixes keep column names
# self-describing (e.g. Destination_TRAPPIST-1e, HomePlanet_Uknow).
data = pd.concat([data, pd.get_dummies(data.Destination, prefix='Destination')], axis=1)
data = pd.concat([data, pd.get_dummies(data.HomePlanet, prefix='HomePlanet')], axis=1)
# Binary columns to {0, 1}. Transported is NaN for test rows and stays NaN,
# which is what the dropna() below relies on.
data.Transported = data.Transported.map({False: 0, True: 1})
data.CryoSleep = data.CryoSleep.map({False: 0, True: 1})
data.Cabin_Side = data.Cabin_Side.map({'P': 0, 'S': 1})
# Impute missing cabin numbers with the integer mean, then bucket by hundreds.
# NOTE: assign back instead of chained `data.Cabin_Num.fillna(..., inplace=True)`,
# which is unreliable under pandas copy-on-write.
data['Cabin_Num'] = data['Cabin_Num'].fillna(data['Cabin_Num'].astype('float').mean().astype('int'))
data.Cabin_Num = (data.Cabin_Num.astype('int') / 100).astype('int')
# Ordinal-encode the deck letter ('U' = unknown sentinel from imputation).
data.Cabin_Room = data.Cabin_Room.map({'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'U': 7, 'T': 8})
# VIP is both ordinal-encoded AND one-hot encoded (the int column is kept),
# reproducing the original 22-feature layout expected by the model.
data.VIP = data.VIP.map({False: 0, True: 1, 'Uknow': 2})
data = pd.concat([data, pd.get_dummies(data.VIP, prefix='VIP')], axis=1)
data.drop(['Destination', 'HomePlanet'], axis=1, inplace=True)

# --- Train / test split ---
features_all, labels_all = data.drop('Transported', axis=1), data['Transported']
features = features_all.loc[:train_num - 1, :]  # .loc slicing is end-inclusive
features_test = features_all.loc[train_num:, :]
labels = labels_all.dropna()  # only train rows have labels

# --- Standardization (fit on train, apply to test) ---
std = StandardScaler()
features = std.fit_transform(features)
features_test = std.transform(features_test)

# --- To tensors ---
features = torch.tensor(features, dtype=torch.float)
labels = torch.tensor(labels, dtype=torch.long).reshape(-1)
# Name kept as-is ("predidct", sic) for downstream compatibility.
features_predidct = torch.tensor(features_test, dtype=torch.float)

def load_array(data_arrays, batch_size, is_train=True):
    """Wrap in-memory tensors in a PyTorch data iterator.

    Args:
        data_arrays: tuple of tensors (e.g. ``(features, labels)``) sharing
            the same first dimension; unpacked into a TensorDataset.
        batch_size: number of samples per mini-batch.
        is_train: shuffle every epoch when True (training); preserve order
            when False (evaluation).

    Returns:
        A DataLoader yielding mini-batches of the given tensors.
    """
    return data_loader.DataLoader(
        data_loader.TensorDataset(*data_arrays),
        batch_size,
        shuffle=is_train,
    )


# 初始化模型参数
# def init_parameters(m):
#     if type(m) == nn.Linear:
#         nn.init.normal_(m.weight, std=0.01)
#         # nn.init.zeros_(m.bias)

class BinaryModel(nn.Module):
    """MLP classifier for the binary 'Transported' target.

    Maps 22 input features to 2 class logits through three ReLU hidden
    layers. The loss function and SGD optimizer are bundled on the module
    so the training loop only needs the model object.
    """

    def __init__(self, lr):
        super().__init__()
        layers = [
            nn.Linear(22, 15), nn.ReLU(),
            nn.Linear(15, 6), nn.ReLU(),
            nn.Linear(6, 6), nn.ReLU(),
            nn.Linear(6, 2),
        ]
        # NOTE: attribute name 'modle' (sic) is kept — callers access it directly.
        self.modle = nn.Sequential(*layers)
        self.loss = nn.CrossEntropyLoss(reduction='mean')
        self.optim = torch.optim.SGD(self.parameters(), lr=lr)

    def forward(self, X, y):
        """Return (mean cross-entropy loss, raw logits) for a labelled batch."""
        logits = self.modle(X)
        return self.loss(logits, y), logits

# --- Training ---
batch_size = 256
# NOTE(review): the same iterator is passed as both train and test loader
# below, so the reported "test" accuracy is really training accuracy — there
# is no held-out validation split.
data_iter_1 = load_array((features, labels), batch_size)

num_epochs, lr = 200, 0.2
net = BinaryModel(lr)
# utils.train_model is a project helper (wk.utils); presumably it moves `net`
# to the GPU itself, since inference below runs on utils.get_gpu() without an
# explicit net.to(...) — TODO confirm.
utils.train_model(num_epochs, net, data_iter_1, test_iter=data_iter_1, train_with_gpu=True)

# --- Inference on the test split and submission file ---
# net.to(utils.get_gpu())
# argmax over the 2 logits gives the predicted class per passenger.
y = net.modle(features_predidct.to(utils.get_gpu())).argmax(axis=1).cpu()
# Map class ids back to the boolean labels the competition expects.
predict = pd.Series(y.detach().numpy()).map({0:False, 1:True}).to_numpy().reshape(-1)
passenger_id = test.loc[:, 'PassengerId']
predDf = pd.DataFrame({'PassengerId':passenger_id, 'Transported':predict})
predDf.to_csv('sample_submission.csv', index=False)

print('over.')


在模型训练过200个周期后,训练准确率和测试准确率都达到了82%.
在这里插入图片描述

提交预测结果后的得分为79分.
在这里插入图片描述

  • 2
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值