kaggle之泰坦尼克号

思路:这是一个二分类问题。先对数据进行预处理并提取特征,得到可用于训练的数据,最后进行模型融合(Bagging)。




import pandas as pd
import handleData as hd
import numpy as np
import sklearn.linear_model as linear_model
import sklearn.preprocessing as preprocessing
from sklearn.externals import joblib
from sklearn.ensemble import BaggingRegressor


def preprocessions(data_train):
    """Add min-max scaled copies of the ``Age`` and ``Fare`` columns.

    Each column is rescaled to the [0, 1] range with its own
    ``MinMaxScaler`` and stored as ``Age_scaler`` / ``Fare_scaler``.

    Args:
        data_train: DataFrame containing numeric ``Age`` and ``Fare`` columns.
            It is modified in place.

    Returns:
        The same DataFrame object, with the two scaled columns added.
    """
    for source, target in (('Age', 'Age_scaler'), ('Fare', 'Fare_scaler')):
        # A fresh scaler per column: fit_transform re-fits anyway, and the
        # original passed the fitted scaler as the (ignored) ``y`` argument.
        scaler = preprocessing.MinMaxScaler()
        # Double brackets give the 2-D (n_samples, 1) input scikit-learn
        # expects; Series.reshape is not available in current pandas.
        scaled = scaler.fit_transform(data_train[[source]].values)
        data_train[target] = scaled.ravel()
    return data_train






def quantity(data_train):
    """One-hot encode the categorical columns of *data_train*.

    ``Cabin``, ``Embarked``, ``Sex`` and ``Pclass`` are expanded into
    indicator columns prefixed with the source column name. The raw
    ``Pclass``, ``Name``, ``Sex``, ``Ticket``, ``Cabin`` and ``Embarked``
    columns are then removed.

    Args:
        data_train: DataFrame containing all six columns listed above.

    Returns:
        A new DataFrame; the input frame is left unchanged.
    """
    encoded_parts = [
        pd.get_dummies(data_train[column], prefix=column)
        for column in ('Cabin', 'Embarked', 'Sex', 'Pclass')
    ]
    combined = pd.concat([data_train] + encoded_parts, axis=1)
    raw_columns = ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']
    return combined.drop(raw_columns, axis=1)




def buildmodle(df):
    """Fit a bagged ensemble of logistic-regression models.

    Columns are selected from *df* by regular expression. The first
    selected column is used as the target and the rest as features, so
    ``Survived`` must come before every feature column in *df*.

    Args:
        df: Preprocessed training DataFrame containing ``Survived`` and the
            engineered feature columns.

    Returns:
        A tuple ``(Bagging_clf, train_df)``: the fitted ensemble and the
        filtered DataFrame it was trained on.
    """
    train_df = df.filter(regex='Survived|Age_scaler|SibSp|Parch|Fare_scaler*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent that works on old and new versions alike.
    matrix = train_df.values

    y = matrix[:, 0]   # target: first selected column (Survived)
    X = matrix[:, 1:]  # features: every remaining selected column

    clf = linear_model.LogisticRegression(C=1.0, tol=1e-6, dual=False)
    # NOTE: BaggingRegressor averages the 0/1 labels predicted by the base
    # models, so predict() returns the fraction of estimators voting 1.
    # Callers must threshold that value rather than truncate it.
    Bagging_clf = BaggingRegressor(
        clf,
        n_estimators=20,
        max_samples=0.8,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        n_jobs=-1,
    )
    Bagging_clf.fit(X, y)
    return Bagging_clf, train_df
def test_datas(rfrs):
    """Load ``test.csv`` and apply the training-time feature engineering.

    Args:
        rfrs: Fitted regressor exposing ``predict``; used to impute missing
            ages from the ``Fare``, ``Parch``, ``SibSp`` and ``Pclass``
            columns. In this script it is the model returned by
            ``hd.set_miss_age``.

    Returns:
        The transformed test DataFrame, including the ``Age_scaler`` and
        ``Fare_scaler`` columns.
    """
    data_test = pd.read_csv("test.csv")

    # Collapse Age into a child flag (1 for age <= 12, 0 otherwise), as is
    # done for the training data. This must act on the test frame: the
    # original code modified the global ``data_train`` here instead.
    data_test.loc[data_test.Age <= 12, 'Age'] = 1
    data_test.loc[data_test.Age > 12, 'Age'] = 0
    data_test.loc[data_test.Fare.isnull(), 'Fare'] = 0

    # Fill the remaining missing ages with the supplied regressor. Skip the
    # call when nothing is missing, since predict() rejects an empty array.
    missing_age = data_test.Age.isnull()
    if missing_age.any():
        age_features = data_test.loc[
            missing_age, ['Fare', 'Parch', 'SibSp', 'Pclass']
        ].values
        data_test.loc[missing_age, 'Age'] = rfrs.predict(age_features)

    # Same Cabin handling and one-hot encoding as the training pipeline.
    data_test = quantity(hd.set_Cabin(data_test))

    # NOTE(review): the scalers are re-fitted on the test data, so the test
    # features are not scaled with the training min/max. Reusing the training
    # scalers would need them to be passed in.
    for source, target in (('Age', 'Age_scaler'), ('Fare', 'Fare_scaler')):
        scaler = preprocessing.MinMaxScaler()
        # Double brackets give the 2-D input scikit-learn expects;
        # Series.reshape is not available in current pandas.
        scaled = scaler.fit_transform(data_test[[source]].values)
        data_test[target] = scaled.ravel()

    return data_test
def train_datas(rfr):
    """Load ``Train.csv``, one-hot encode it and scale ``Age`` and ``Fare``.

    Args:
        rfr: Unused; kept so the signature stays compatible with callers.

    Returns:
        The processed training DataFrame. The original computed this frame
        and then discarded it, implicitly returning ``None``.
    """
    data_train = pd.read_csv('Train.csv')
    data_train = quantity(data_train)
    return preprocessions(data_train)


def get(clf):
    """Return *clf* unchanged."""
    return clf
if __name__=='__main__':
    # ---- training data -------------------------------------------------
    data_train = pd.read_csv('Train.csv')
    # Collapse Age into a child flag: 1 for age <= 12, 0 otherwise.
    # Missing ages stay NaN and are filled in the next step.
    data_train.loc[(data_train.Age<=12),'Age']=1
    data_train.loc[(data_train.Age>12), 'Age']=0
    data_train,rfr=hd.set_miss_age(data_train)  # fill missing ages; also returns the fitted model
    data_train=hd.set_Cabin(data_train)         # Cabin handling from the handleData helper module
    data_train= quantity(data_train)            # one-hot encode the categorical columns
    df = preprocessions(data_train)             # add Age/Fare columns scaled to [0, 1]
    df.to_csv('new.csv',index=False)

    # ---- model ---------------------------------------------------------
    Bagging_clf, train_df= buildmodle(df)

    # ---- test data and prediction --------------------------------------
    test_df=test_datas(rfr)  # rfr is reused to fill missing ages in the test set
    test=test_df.filter(regex='Survived|Age_scaler|SibSp|Parch|Fare_scaler*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')
    prediction=Bagging_clf.predict(test)
    # The ensemble may return the averaged vote of its members, which is a
    # fraction. Threshold at 0.5 instead of truncating with astype(int);
    # truncation would label a passenger 1 only on a unanimous vote.
    survived=(prediction>=0.5).astype(np.int32)
    # DataFrame/Series.as_matrix() was removed in pandas 1.0; use .values.
    result=pd.DataFrame({'PassengerId':test_df['PassengerId'].values,'Survived':survived})
    result.to_csv('prediction.csv',index=False)

    # Persist the fitted ensemble for later reuse.
    joblib.dump(Bagging_clf, "train_model.m")

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值