【kaggle】Spaceship Titanic - 预测哪些乘客被运送到另一个维度【CatBoost - 10%】

一、赛题

Spaceship Titanic - 预测哪些乘客被运送到另一个维度:https://www.kaggle.com/competitions/spaceship-titanic

结果:
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

二、代码(可以直接放到kaggle运行)

# K折
from sklearn.model_selection import KFold
# 基础包
import pandas as pd
import numpy as np

# 模型
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
# 评价指标

from sklearn.metrics import mean_squared_error
# Load the competition files from the Kaggle input directory.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')
sample = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
# Fill each missing value with the value from the previous row (forward fill).
# DataFrame.ffill is the direct replacement for fillna(method='pad'), whose
# `method` argument is deprecated in recent pandas releases.
# NOTE: a missing value in the very first row has no predecessor and stays NaN.
train.ffill(axis=0, inplace=True)
test.ffill(axis=0, inplace=True)
# Cast the two identifier columns to str so they can be split on separators.
train['Cabin'] = train['Cabin'].astype(str)
train['PassengerId'] = train['PassengerId'].astype(str)
cabin = train['Cabin']
PassengerId = train['PassengerId']

# NOTE(review): `null` is not referenced anywhere in the visible script; the
# import is kept only so the set of module-level names stays unchanged.
from sqlalchemy import null

# Split every cabin string on '/' into three parts: deck, number and side.
# The number part is converted to int; deck and side stay as strings.
cabin_list = [i.split('/') for i in cabin]
deck_list = [i_1[0] for i_1 in cabin_list]
num_list = [int(i_1[1]) for i_1 in cabin_list]
side_list = [i_1[2] for i_1 in cabin_list]

# Split every passenger id on '_' into its two integer parts.
PassengerId_list = [j.split('_') for j in PassengerId]
# Fix: the original loop indexed `j` (the last raw id string left over from
# the previous loop) instead of the per-row split parts, so every row received
# the same two single-digit values. Index each row's own parts instead.
Passenger_list = [int(j_1[0]) for j_1 in PassengerId_list]
Id_list = [int(j_1[1]) for j_1 in PassengerId_list]

# Insert the derived features as the first five columns of the frame.
train.insert(0, 'deck', deck_list)
train.insert(1, 'num', num_list)
train.insert(2, 'side', side_list)
train.insert(3, 'Passenger', Passenger_list)
train.insert(4, 'Id', Id_list)
# Cast the two identifier columns to str so they can be split on separators.
test['Cabin'] = test['Cabin'].astype(str)
test['PassengerId'] = test['PassengerId'].astype(str)
cabin = test['Cabin']
PassengerId = test['PassengerId']

# NOTE(review): `null` is not referenced anywhere in the visible script; the
# import is kept only so the set of module-level names stays unchanged.
from sqlalchemy import null

# Split every cabin string on '/' into three parts: deck, number and side.
# The number part is converted to int; deck and side stay as strings.
cabin_list = [i.split('/') for i in cabin]
deck_list = [i_1[0] for i_1 in cabin_list]
num_list = [int(i_1[1]) for i_1 in cabin_list]
side_list = [i_1[2] for i_1 in cabin_list]

# Split every passenger id on '_' into its two integer parts.
PassengerId_list = [j.split('_') for j in PassengerId]
# Fix: the original loop indexed `j` (the last raw id string left over from
# the previous loop) instead of the per-row split parts, so every row received
# the same two single-digit values. Index each row's own parts instead.
Passenger_list = [int(j_1[0]) for j_1 in PassengerId_list]
Id_list = [int(j_1[1]) for j_1 in PassengerId_list]

# Insert the derived features as the first five columns of the frame.
test.insert(0, 'deck', deck_list)
test.insert(1, 'num', num_list)
test.insert(2, 'side', side_list)
test.insert(3, 'Passenger', Passenger_list)
test.insert(4, 'Id', Id_list)
# Columns removed from both frames before modelling: the raw identifier
# columns (already split into separate features) plus the name and the two
# planet/destination columns, which this script does not use as features.
drop_columns = ['Name', 'HomePlanet', 'Destination', 'Cabin', 'PassengerId']
for _frame in (train, test):
    _frame.drop(columns=drop_columns, inplace=True)
# Encode the single-character 'deck' and 'side' labels as integer code points.
# Fix: the original looped over hard-coded row counts (8693 / 4277) and wrote
# through chained indexing (`df[col][i] = ...`), which breaks on any other
# dataset size and is not guaranteed to modify the frame (it does not under
# pandas copy-on-write). Mapping the whole column is size-independent and
# assigns the result back explicitly.
train['deck'] = train['deck'].map(ord)
train['side'] = train['side'].map(ord)

# Same encoding for the test frame.
test['deck'] = test['deck'].map(ord)
test['side'] = test['side'].map(ord)
# Cast the flag columns and the encoded cabin labels to integers in both
# frames, so every feature handed to the model is numeric.
for _frame in (train, test):
    for _col in ('CryoSleep', 'VIP', 'deck', 'side'):
        _frame[_col] = _frame[_col].astype('int')
# The target column exists only in the training frame.
train['Transported'] = train['Transported'].astype('int')
# Feature columns fed to the model, and the integer-encoded target.
c = ['deck', 'num', 'side', 'Passenger', 'Id', 'CryoSleep', 'Age', 'VIP', 'RoomService',
     'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
target = train['Transported']
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows for validation. The split is random, so
# the printed score varies between runs.
train_data, test_data, train_target, test_target = train_test_split(train[c], target, test_size=0.3)
clf = CatBoostClassifier()
clf.fit(train_data, train_target)
test_pred = clf.predict(test_data)
# Fix: the original reported mean_squared_error on class labels, which for 0/1
# predictions is the misclassification rate (lower is better), not a score.
# Report classification accuracy instead.
score = accuracy_score(test_target, test_pred)
print(score)  # original author's note: 0.80547
# Predict on the competition test frame with the same feature columns used for
# training.
# Fix: the original passed `zuhe_test`, a name that is never defined in this
# script, so this line raised NameError.
last_pred = clf.predict(test[c])
# Convert the predictions to booleans for the submission column.
last_pred = np.array(last_pred, dtype=bool)
# Kaggle expects a CSV file as the final submission, so write one out.
# NOTE(review): assumes the rows of `sample` are in the same order as the rows
# of `test` — confirm against the competition files.
sample['Transported'] = last_pred

sample.to_csv('submission.csv', index=False)
  • 3
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Bessie_Lee_gogogo

你的鼓励是我最大的动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值