泰坦尼克号
https://www.kaggle.com/arthurtok/introduction-to-ensembling-stacking-in-python
代码理解
1. Preprocessing
1.库加载
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
-
对于feature中含有缺失值(缺失值比例较大时),把缺失的和未缺失的来看
train['Has_Cabin'] = train["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
-
分桶处理
train['CategoricalFare'] = pd.qcut(train['Fare'], 4) dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0 dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1 dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2 dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3 dataset['Fare'] = dataset['Fare'].astype(int)
-
填充缺失值(通过均值和标准差来填充)
for dataset in full_data: age_avg = dataset['Age'].mean() age_std = dataset['Age'].std() age_null_count = dataset['Age'].isnull().sum() age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list dataset['Age'] = dataset['Age'].astype(int)
-
替换
dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
-
编码mapping
dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
-
热力图(去除冗余变量)
colormap = plt.cm.RdBu plt.figure(figsize=(14,12)) plt.title('Pearson Correlation of Features', y=1.05, size=15) sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)
2. Ensembling & Stacking models
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
# Class to extend the Sklearn classifier
class SklearnHelper(object):
def __init__(self, clf, seed=0, params=None):
params['random_state'] = seed
self.clf = clf(**params)
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
def fit(self,x,y):
return self.clf.fit(x,y)
def feature_importa