Machine Learning: Model Stacking (Part 1)

The script below walks through a two-level stacking pipeline on the Kaggle House Prices data: five base regressors (XGBoost, ExtraTrees, RandomForest, Ridge, Lasso) each produce out-of-fold predictions, and those predictions become the features of a second-level XGBoost model.

import pandas as pd  
import numpy as np  
from scipy.stats import skew  
import xgboost as xgb  
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn.ensemble import ExtraTreesRegressor  
from sklearn.ensemble import RandomForestRegressor  
from sklearn.metrics import mean_squared_error  
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso  
from math import sqrt  
  
  
TARGET = 'SalePrice'  
NFOLDS = 5  
SEED = 0  
NROWS = None  
SUBMISSION_FILE = '../input/sample_submission.csv'  
  
  
## Load the data ##  
train = pd.read_csv("../input/train.csv")  
test = pd.read_csv("../input/test.csv")  
  
ntrain = train.shape[0]  
ntest = test.shape[0]  
  
## Preprocessing ##  
  
y_train = np.log1p(train[TARGET])  # model log(1 + price); np.log1p(x) == np.log(x + 1)
  
  
train.drop([TARGET], axis=1, inplace=True)  
  
  
# concatenate train and test so that get_dummies later produces identical columns for both
all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
  
  
#log transform skewed numeric features:  
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index  
  
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # skewness measured on the training rows only
skewed_feats = skewed_feats[skewed_feats > 0.75].index                 # keep features beyond the 0.75 threshold
  
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])  
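For intuition, here is a toy example (made-up numbers, not competition data) of what the 0.75 skew threshold and log1p do to a long right tail:

toy = pd.Series([1, 2, 2, 3, 50])
print(skew(toy))      # roughly 1.5, well above the 0.75 cutoff
print(np.log1p(toy))  # the outlier 50 maps to ~3.93, pulled back toward the rest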
  
all_data = pd.get_dummies(all_data)  
  
#filling NA's with the mean of the column:  
all_data = all_data.fillna(all_data.mean())  
  
#creating matrices for sklearn:  
  
x_train = np.array(all_data[:train.shape[0]])  
x_test = np.array(all_data[train.shape[0]:])  
  
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)
  
  
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params = dict(params or {})  # copy so the caller's dict is not mutated
        params['random_state'] = seed
        self.clf = clf(**params)
  
    def train(self, x_train, y_train):  
        self.clf.fit(x_train, y_train)  
  
    def predict(self, x):  
        return self.clf.predict(x)  
  
  
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = dict(params or {})  # copy so the caller's dict is not mutated
        self.param['seed'] = seed
        self.nrounds = self.param.pop('nrounds', 250)
  
    def train(self, x_train, y_train):  
        dtrain = xgb.DMatrix(x_train, label=y_train)  
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)  
  
    def predict(self, x):  
        return self.gbdt.predict(xgb.DMatrix(x))  
  
  
def get_oof(clf):
    # Out-of-fold scheme: each training row is predicted by a fold model
    # that never saw it; test-set predictions are averaged over the folds.
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]  
        y_tr = y_train[train_index]  
        x_te = x_train[test_index]  
  
        clf.train(x_tr, y_tr)  
  
        oof_train[test_index] = clf.predict(x_te)  
        oof_test_skf[i, :] = clf.predict(x_test)  
  
    oof_test[:] = oof_test_skf.mean(axis=0)  
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)  
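To see the shape contract of get_oof in isolation, here is a minimal self-contained sketch of the same out-of-fold idea (dummy data and underscore names are my own):

_X = np.random.RandomState(0).rand(20, 3)
_y = _X @ np.array([1.0, 2.0, 3.0])
_oof = np.zeros(20)
for _tr, _te in KFold(n_splits=5, shuffle=True, random_state=0).split(_X):
    _oof[_te] = Ridge(alpha=1.0).fit(_X[_tr], _y[_tr]).predict(_X[_te])
# _oof now holds exactly one leak-free prediction per row of _X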
  
  
et_params = {  
    'n_jobs': 16,  
    'n_estimators': 100,  
    'max_features': 0.5,  
    'max_depth': 12,  
    'min_samples_leaf': 2,  
}  
  
rf_params = {  
    'n_jobs': 16,  
    'n_estimators': 100,  
    'max_features': 0.2,  
    'max_depth': 12,  
    'min_samples_leaf': 2,  
}  
  
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'verbosity': 0,                    # replaces the old 'silent' flag in XGBoost >= 1.0
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'reg:squarederror',   # 'reg:linear' is deprecated in recent XGBoost
    'max_depth': 4,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
    'nrounds': 500                     # consumed by XgbWrapper, not passed to xgb.train
}
  
  
  
rd_params = {
    'alpha': 10
}

ls_params = {
    'alpha': 0.005
}
  
  
xg = XgbWrapper(seed=SEED, params=xgb_params)  
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)  
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)  
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)  
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)  
  
xg_oof_train, xg_oof_test = get_oof(xg)  
et_oof_train, et_oof_test = get_oof(et)  
rf_oof_train, rf_oof_test = get_oof(rf)  
rd_oof_train, rd_oof_test = get_oof(rd)  
ls_oof_train, ls_oof_test = get_oof(ls)  
  
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))  
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))  
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))  
print("RD-CV: {}".format(sqrt(mean_squared_error(y_train, rd_oof_train))))  
print("LS-CV: {}".format(sqrt(mean_squared_error(y_train, ls_oof_train))))  
  
  
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train), axis=1)  
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test), axis=1)  
  
print("{},{}".format(x_train.shape, x_test.shape))  
  
dtrain = xgb.DMatrix(x_train, label=y_train)  
dtest = xgb.DMatrix(x_test)  
  
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.8,
    'verbosity': 0,                    # replaces the old 'silent' flag in XGBoost >= 1.0
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:squarederror',   # 'reg:linear' is deprecated in recent XGBoost
    'max_depth': 1,                    # stumps: the meta-model stays deliberately simple
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}
  
res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, stratified=False,  
             early_stopping_rounds=25, verbose_eval=10, show_stdv=True)  
  
best_nrounds = res.shape[0]               # rows kept by early stopping = rounds to train
cv_mean = res['test-rmse-mean'].iloc[-1]  # name the columns; positional access is brittle
cv_std = res['test-rmse-std'].iloc[-1]

print('Ensemble-CV: {0} (+/- {1})'.format(cv_mean, cv_std))
  
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)
submission['SalePrice'] = np.expm1(submission['SalePrice'])  # invert the log1p target transform
submission.to_csv('xgstacker_starter.sub.csv', index=False)
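The second level does not have to be XGBoost. A common lower-variance alternative is a linear meta-learner over the same stacked matrices; a minimal sketch (the alpha grid is arbitrary, and RidgeCV is already imported above):

meta = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0])
meta.fit(x_train, y_train)                   # x_train here is the stacked OOF matrix
ridge_pred = np.expm1(meta.predict(x_test))  # back to the original price scale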