Kaggle-泰坦尼克号-GBDT-快速分析

import pandas as pd
import numpy as np

#-------------------
# Read the training split, keeping only the columns this simplified model uses.
fileName = 'E:/kaggle/Titanic/train.csv'
df = pd.read_csv(
    fileName,
    header=0,
    usecols=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare'],
)
# Row count and column count of the loaded frame.
m, n = df.shape
# Alias, not a copy: in-place edits made through df_init are visible through df too.
df_init = df
#-------------------------
# Encode Sex as a binary feature: 'male' -> 0, every other value -> 1.
# Done as one vectorised column assignment instead of a per-row .loc loop, so it
# does not depend on the index labels being 0..m-1 and avoids m scalar writes.
df_init['Sex'] = np.where(df_init['Sex'] == 'male', 0, 1)
# Put Age first and Survived last: the code below slices columns by position
# (column 0 = Age, columns 1-4 = remaining features, column 5 = Survived).
df_init = df_init.reindex(columns=['Age','Pclass','Sex','SibSp','Fare','Survived'])
#----------------------------
# Impute missing Age values with a random-forest regressor fitted on the rows
# whose Age is known. Relies on the column order
# Age, Pclass, Sex, SibSp, Fare, Survived established by the reindex above.
from sklearn.ensemble import RandomForestRegressor

df_init_fillage = df_init
# Compute the null mask once so the train/predict split and the write-back agree.
age_missing = df_init_fillage['Age'].isnull()
# DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and new
# versions. The float cast avoids handing an object-dtype array to scikit-learn.
age_train = df_init_fillage[~age_missing].values.astype(float)
age_null = df_init_fillage[age_missing].values.astype(float)
randomf1 = RandomForestRegressor()
# Column 0 is the target (Age); columns 1-4 are Pclass, Sex, SibSp, Fare.
randomf1.fit(age_train[:, 1:5], age_train[:, 0])
# Skip prediction when nothing is missing: predicting on zero rows would raise.
if len(age_null):
    age_predict = randomf1.predict(age_null[:, 1:5])
    df_init.loc[age_missing, 'Age'] = age_predict
#------------------------------

# Build the feature/label arrays. Columns 0-4 are the features
# (Age, Pclass, Sex, SibSp, Fare) and column 5 is the Survived label.
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0; the
# explicit float cast keeps the feature matrix numeric rather than object-dtype.
x_train = df_init.iloc[:, :5].values.astype(float)
y_train = df_init.iloc[:, 5].values.astype(int)
# NOTE(review): the evaluation rows (500 onward) are also contained in the
# training arrays above, so the scores printed later are optimistic. The full
# training set is kept on purpose here because the fitted models are reused for
# the final prediction; use rows [:500] for training if an honest hold-out
# score is wanted.
x_test = df_init.iloc[500:, :5].values.astype(float)
y_test = df_init.iloc[500:, 5].values.astype(int)

#---------------------------------
# Fit several classifiers on the same training arrays and print each one's
# accuracy on the evaluation rows. The model variables stay at module level
# because gbdt is reused by the prediction step further down.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

lr = LogisticRegression()
vm = SVC()
randomf = RandomForestClassifier()
gbdt = GradientBoostingClassifier()
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(6, 5), random_state=1)

# Fit and score in one loop so each printed label is always paired with its own
# model. The original printed randomf.score() under the "GBDT" label.
for label, model in (
    ('LogisticRegression', lr),
    ('SVM', vm),
    ('RandomForest', randomf),
    ('GBDT', gbdt),
    ('MLPC', mlp),
):
    model.fit(x_train, y_train)
    print('%s score = %f' % (label, model.score(x_test, y_test)))

#-------------------predict_process--------------------
# Prediction step: load the test split, apply the same preprocessing as the
# training data, predict Survived with the fitted GBDT model and write a CSV.
df_test = pd.read_csv('E:/kaggle/Titanic/test.csv',usecols=['Pclass','Sex','Age','SibSp','Fare'])
m_t, n_t = df_test.shape
df_init_test = df_test
# Same encoding as for training: 'male' -> 0, every other value -> 1
# (vectorised instead of a per-row .loc loop).
df_init_test['Sex'] = np.where(df_init_test['Sex'] == 'male', 0, 1)
# Same feature order as the training matrix: Age, Pclass, Sex, SibSp, Fare.
df_init_test = df_init_test.reindex(columns=['Age','Pclass','Sex','SibSp','Fare'])
# Fill missing Fare values with the median so no NaN reaches the estimators.
# NOTE(review): the public Kaggle test split reportedly has a missing Fare;
# confirm the median is the preferred fill value. This is a no-op when Fare is complete.
df_init_test['Fare'] = df_init_test['Fare'].fillna(df_init_test['Fare'].median())
# Impute missing ages with the regressor fitted on the training data (randomf1).
age_missing_test = df_init_test['Age'].isnull()
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
age_null_test = df_init_test[age_missing_test].values.astype(float)
# Skip prediction when nothing is missing: predicting on zero rows would raise.
if len(age_null_test):
    age_predict_test = randomf1.predict(age_null_test[:, 1:5])
    df_init_test.loc[age_missing_test, 'Age'] = age_predict_test
test_data = df_init_test.values.astype(float)
predict_test = gbdt.predict(test_data)
df_init_test['Survived'] = predict_test
# The frame's index is written as the first CSV column (to_csv default).
df_init_test.to_csv('E:/kaggle/Titanic/test2.csv')


以上为源代码:删除了部分列以简化模型,使用 GBDT 作为最终模型,缺失的年龄用随机森林回归预测后填补。


  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值