Kaggle Prediction Example 1

import os, warnings
import numpy as np, pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from hyperopt import hp, fmin, tpe, space_eval, Trials, STATUS_OK # Bayesian optimization
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
C_Random_Seed = 22
os.environ["HYPEROPT_FMIN_SEED"] = f'{C_Random_Seed}'
warnings.filterwarnings('ignore')

C_Selected_Model = 'xgb'
C_Debug = False

C_Tunable_Params = {
    'rnf': {'max_depth': hp.choice('rnf.max_depth', np.arange(2, 5, dtype = int)),
            'n_estimators': hp.choice('rnf.n_estimators', np.arange(50, 400, dtype = int)),          
           },

    'xgb': {'max_depth': hp.choice('xgb.max_depth', np.arange(2, 5, dtype = int)),
            'learning_rate': hp.quniform('xgb.learning_rate', 0.01, 0.05, 0.01),
            'n_estimators': hp.choice('xgb.n_estimators', np.arange(50, 400, dtype = int)),
            'subsample': hp.quniform('xgb.subsample', 0.1, 1.0, 0.1),
            'gamma': hp.quniform('xgb.gamma', 0.0, 0.5, 0.1),
            'min_child_weight': hp.quniform('xgb.min_child_weight', 1, 10, 1),          
           },
    
    'gbc': {'loss': hp.choice('gbc.loss', ['log_loss', 'exponential']),
            'max_depth': hp.choice('gbc.max_depth', np.arange(2, 5, dtype = int)),
            'learning_rate': hp.quniform('gbc.learning_rate', 0.05, 0.4, 0.01),
            'n_estimators': hp.choice('gbc.n_estimators', np.arange(50, 400, dtype = int)),
            'subsample': hp.quniform('gbc.subsample', 0.1, 1.0, 0.1),
            # 'mse' is a deprecated alias of 'squared_error' and was removed in sklearn 1.2
            'criterion': hp.choice('gbc.criterion', ['friedman_mse', 'squared_error']),
            },

    'svc': {'C': hp.quniform('svc.C', 0.1, 1.0, 0.1),
            # 'precomputed' is excluded: it needs a kernel matrix, not raw features
            'kernel': hp.choice('svc.kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
            'degree': hp.choice('svc.degree', np.arange(2, 4, dtype = int)),
            'gamma': hp.choice('svc.gamma', ['scale', 'auto']),
            },    
}            
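To get a feel for what these hp.choice / hp.quniform expressions produce, you can draw one random configuration from a space with hyperopt's stochastic sampler; a quick sketch (the printed values are illustrative and will differ on every draw):

from hyperopt.pyll.stochastic import sample

print(sample(C_Tunable_Params['xgb']))
# e.g. {'gamma': 0.2, 'learning_rate': 0.03, 'max_depth': 3,
#       'min_child_weight': 6.0, 'n_estimators': 212, 'subsample': 0.7}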
# Start
def get_data():
    """
    1. Getting the data from the input path
    2. Split the training and test data into features and targets
    """
    train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
    test_df = pd.read_csv('../input/spaceship-titanic/test.csv')
    X_train = train_df.drop(['Transported'], axis = 1)
    y_train = train_df['Transported']
    X_test = test_df.copy()
    return X_train, y_train, X_test
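A quick sanity check right after loading (row counts are from the competition's data description):

print(X_train.shape, y_train.shape, X_test.shape)
# (8693, 13) (8693,) (4277, 13)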


def preprocess(df):
    """
    1. Handle missing values
      a. Cabin: Fill a dummy value in the given format: Deck/Num/Side
      b. Age: Assume passengers with a missing age are adults
      c. Side: Assume missing values are Port
      d. CryoSleep: Passengers not in CryoSleep need to spend money (FoodCourt is
         the key signal), so infer missing CryoSleep from whether total spend is zero.
      
    2. Encode categorical variables
      a. Ordinal Encoding:
          - Deck: People on lower decks have a lower chance of escaping; decks are ordered ABCDEFGT, bottom-up.
      b. One Hot Encoding:
          - Side (Port / Starboard)
          - Age (Child / Adult)
          - CryoSleep (True / False)
          - VIP (True / False)
          - HomePlanet
          - Destination
          
    3. Feature Engineering
      a. Regular, Luxury and Total Spends.
      b. Remove columns that do not provide any useful information.
    """
    
    df['Age'] = df['Age'].fillna(19)
    df['Age'] = df['Age'].apply(lambda x: 1 if x <= 18 else 0)
    
    df['Cabin'] = df['Cabin'].fillna('0/0/0')
    df['Deck'] = df['Cabin'].apply(lambda x: str(x).split('/')[0])
    df['Deck'] = df['Deck'].apply(lambda x: '0ABCDEFGT'.index(x))
    df['Side'] = df['Cabin'].apply(lambda x: str(x).split('/')[2])
    df['Side'] = df['Side'].replace({'0': 'P'})
    
    money_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in money_cols:
        df[col] = df[col].fillna(0)
    df['Regular'] =  df[['FoodCourt', 'ShoppingMall']].sum(axis = 1)  
    df['Luxury'] =  df[['RoomService', 'Spa', 'VRDeck']].sum(axis = 1)  
    df['Total_Spent'] = df[money_cols].sum(axis = 1)
    
    df.loc[(df.CryoSleep.isnull()) & (df.Total_Spent == 0), 'CryoSleep'] = True
    df.loc[(df.CryoSleep.isnull()) & (df.Total_Spent != 0), 'CryoSleep'] = False
    
    df['Id'] = df.PassengerId.str[:4]
    df['Group'] = df.Id.duplicated(keep = False).astype(int)
    
    df.drop(money_cols + ['Name', 'Cabin', 'PassengerId', 'Id'], axis = 1, inplace = True)
    df = pd.get_dummies(df, columns = ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'Side'], drop_first = True)
    return df
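A minimal smoke test of preprocess on two made-up rows (not taken from the dataset) shows the derived columns; note that the exact dummy columns depend on which category values are present:

demo = pd.DataFrame({
    'PassengerId': ['0001_01', '0001_02'], 'Name': ['A B', 'C D'],
    'HomePlanet': ['Earth', 'Europa'], 'CryoSleep': [np.nan, False],
    'Cabin': ['F/1/P', np.nan], 'Destination': ['TRAPPIST-1e', '55 Cancri e'],
    'Age': [12.0, np.nan], 'VIP': [False, True],
    'RoomService': [0.0, 100.0], 'FoodCourt': [0.0, 50.0],
    'ShoppingMall': [np.nan, 0.0], 'Spa': [0.0, 0.0], 'VRDeck': [0.0, 30.0],
})
print(preprocess(demo).T)
# Row 0: Age -> 1 (child), Deck -> 6 ('F'), CryoSleep imputed to True (zero spend)
# Row 1: Age -> 0 (imputed 19, adult), Deck -> 0 (missing cabin), Luxury -> 130.0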


def feature_transformations(train, test):
    """
    1. Feature Transformation of continuous features.
      a. Deck is distributed across a narrow range. MinMax Scaling would be suitable.
      b. Expenditures are distributed across a wide range. Log Transformations would be ideal.
    2. Necessary if the algorithm used is not tree based.
    """
              
    for col in ['Total_Spent', 'Regular', 'Luxury']:
        train[col] = np.log1p(train[col])
        test[col] = np.log1p(test[col])
        
    for col_ in ['Deck', 'Total_Spent', 'Regular', 'Luxury']:
        sc_X = MinMaxScaler(feature_range = (0, 1))
        train.loc[:, col_] = sc_X.fit_transform(train.loc[:, col_].values.reshape(-1, 1))
        test.loc[:, col_] = sc_X.transform(test.loc[:, col_].values.reshape(-1, 1))         
    return train, test
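A small illustration of why the expenditures get a log transform before scaling, on a toy array:

spend = np.array([0.0, 10.0, 100.0, 10000.0])
print(np.log1p(spend))  # [0.   2.40  4.62  9.21] - the wide range is compressed
print(MinMaxScaler().fit_transform(np.log1p(spend).reshape(-1, 1)).ravel())
# [0.   0.26  0.50  1.  ] - then mapped onto [0, 1]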
def get_model_instance(mod_type_, params):
    """
    Create a model instance with the provided parameters.
    """
    if mod_type_ == 'rnf':
        selected_model = RandomForestClassifier(**params, random_state = C_Random_Seed)
    elif mod_type_ == 'xgb':
        selected_model = XGBClassifier(**params, random_state = C_Random_Seed)
    elif mod_type_ == 'gbc':
        selected_model = GradientBoostingClassifier(**params, random_state = C_Random_Seed)
    elif mod_type_ == 'svc':
        selected_model = SVC(**params, random_state = C_Random_Seed)
    else:
        raise ValueError(f'Unknown model type: {mod_type_}')
    return selected_model
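Usage is straightforward, e.g.:

clf = get_model_instance('xgb', {'max_depth': 3, 'n_estimators': 100})
print(type(clf).__name__, clf.get_params()['max_depth'])  # XGBClassifier 3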


def fine_tune_model(X_train, y_train, mod_type_):
    """
    Tune the hyperparameters for the model selected.
    """
    def objective(params):
        model = get_model_instance(mod_type_, params)
        # With 0/1 labels, mean absolute error equals the misclassification rate,
        # so this effectively minimizes the 10-fold CV error.
        loss_metric = -1 * cross_val_score(model, X_train, y_train, cv = 10, scoring = 'neg_mean_absolute_error')
        print(mod_type_, np.mean(loss_metric), STATUS_OK)
        return {'loss': np.mean(loss_metric), 'loss_on_folds': loss_metric, 'status': STATUS_OK}

    fmin_trials = Trials()
    search_space = hp.choice('model_type', [C_Tunable_Params[mod_type_]])
    # fmin arguments: fn, space, algo, trials, max_evals, show_progressbar, verbose, rstate
    best_params = fmin(fn = objective, space = search_space, algo = tpe.suggest, trials = fmin_trials, max_evals = 100, 
                       show_progressbar = False, verbose = False, rstate = np.random.default_rng(C_Random_Seed))    
    best = fmin_trials.best_trial['result']
    best['params'] = space_eval(search_space, best_params)
    best['type'] = mod_type_
    return best    
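Note that the space_eval call above is not optional: for hp.choice parameters, fmin returns the index of the chosen option, not the option itself. A toy run makes this visible (the losses and the chosen index will vary):

toy_space = {'n': hp.choice('n', [50, 100, 200])}
toy_best = fmin(fn = lambda p: p['n'] / 1000.0, space = toy_space,
                algo = tpe.suggest, max_evals = 5, show_progressbar = False)
print(toy_best)                         # e.g. {'n': 0} - an index, not 50
print(space_eval(toy_space, toy_best))  # {'n': 50}     - the actual value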
    
    
def model_selection_01(X_train, y_train, X_test):
    """
    1. Select the appropriate model and tune the hyperparameters
    2. Return feature importances and predictions
    
    """
    if C_Selected_Model:
        model_pool = [C_Selected_Model]
    else:
        model_pool = list(C_Tunable_Params.keys()) 
    # model_pool = ['xgb']
                       
    model_summary_list = []    
    for mod_type in model_pool:
        # The tuning loop prints one line per evaluation (max_evals = 100), e.g.:
        # xgb 0.19325278097429996 ok
        # Since only 'xgb' is tuned here, there is a single best entry.
        best = fine_tune_model(X_train, y_train, mod_type)
        model_summary_list.append(best)
    
    
    print('model_summary_list:', model_summary_list)
    model_summary_df = pd.DataFrame(model_summary_list)

    best_model_summary = model_summary_df.iloc[model_summary_df.loss.argmin()]
    print('best_model_summary:', best_model_summary)
    best_model = get_model_instance(best_model_summary['type'], best_model_summary['params'])
    print('best_model:', best_model)

    best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    # feature_importances_ exists only for the tree-based models, not for SVC
    if hasattr(best_model, 'feature_importances_'):
        feat_imp = sorted(zip(best_model.feature_names_in_, best_model.feature_importances_),
                          key = lambda x: x[1], reverse = True)
    else:
        feat_imp = []
    
    return predictions, feat_imp
def finalize(y_pred):
    """
    1. Get the template from the sample submission file.
    2. Add predictions to the template.
    3. Export to submission.csv
    """
    submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
    submission['Transported'] = y_pred
    submission['Transported'] = submission['Transported'].replace({1: 'True', 0: 'False'})
    submission.to_csv('./submission.csv', index = False)
print('Getting the data..')
X_train, y_train, X_test = get_data()

print('Preprocessing..')
X_train = preprocess(X_train)
X_test = preprocess(X_test)
X_train, X_test = feature_transformations(X_train, X_test)

if not C_Debug:
    print('Model training..')
    y_pred, feature_importances = model_selection_01(X_train, y_train, X_test)
    print('Finalizing..')
    finalize(y_pred)
Getting the data..
Preprocessing..
Model training..
xgb 0.1967047603931061 ok
xgb 0.1960136502519741 ok
xgb 0.20740433580678017 ok
... (100 evaluations in total; the remaining log lines are omitted) ...
xgb 0.20027035964181317 ok
xgb 0.19659034694390437 ok
xgb 0.19658902424506963 ok
model_summary_list: [{'loss': 0.19279208497017314, 'loss_on_folds': array([0.1954023 , 0.22298851, 0.2183908 , 0.19102417, 0.19447641,
       0.17951669, 0.16915995, 0.17951669, 0.17836594, 0.1990794 ]), 'status': 'ok', 'params': {'gamma': 0.1, 'learning_rate': 0.04, 'max_depth': 4, 'min_child_weight': 7.0, 'n_estimators': 398, 'subsample': 0.9}, 'type': 'xgb'}]
best_model_summary: loss                                                      0.192792
loss_on_folds    [0.19540229885057472, 0.22298850574712645, 0.2...
status                                                          ok
params           {'gamma': 0.1, 'learning_rate': 0.04, 'max_dep...
type                                                           xgb
Name: 0, dtype: object
best_model: XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=0.1,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.04, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=4,
              max_leaves=None, min_child_weight=7.0, missing=nan,
              monotone_constraints=None, n_estimators=398, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=22,
              reg_alpha=None, reg_lambda=None, ...)
Finalizing..
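After the run you can also inspect which engineered features the tuned model relies on (not shown in the original output; the names come from the dummy columns created in preprocess):

for name, imp in feature_importances[:5]:
    print(f'{name}: {imp:.3f}')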
