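# Migration notes for newer scikit-learn releases:
#   "ModuleNotFoundError: No module named 'sklearn.cross_validation'"
#       -> change the import to: from sklearn.model_selection import KFold
#   "ModuleNotFoundError: No module named 'sklearn.grid_search'"
#       -> change the import to: from sklearn.model_selection import GridSearchCV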
import numpy as np
import pandas as pd
import datetime
# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time
from sklearn import preprocessing
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Ridge, LassoCV, LassoLarsCV, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from scipy.stats import skew
def create_submission(prediction, score):
    """Write the predictions to a timestamped submission CSV.

    The file is named ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv`` and is
    created in the current working directory with two columns, ``Id`` and
    ``SalePrice``.

    Args:
        prediction: Predicted sale prices, one per row of the module-level
            ``test`` frame.
        score: Validation score; only used to build the file name.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    sub_file = f"submission_{score!s}_{timestamp}.csv"
    print('Creating submission: ', sub_file)

    # NOTE: the Ids are read from the module-level ``test`` frame. To dump
    # predictions made on the training rows instead (historically written to
    # 'prediction_training.csv'), the Ids must come from ``train``.
    submission = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': prediction})
    submission.to_csv(sub_file, index=False)
def data_preprocess(train, test):
    """Turn the raw train/test frames into model-ready matrices.

    Processing steps, in order:

    1. Drop training rows whose ``GrLivArea`` exceeds 4000 (treated as
       outliers).
    2. Stack the ``MSSubClass`` .. ``SaleCondition`` columns of both frames so
       that every transformation is applied identically to train and test.
    3. Remove the columns listed in ``to_delete`` (mostly missing values).
    4. Apply ``log1p`` to the target and to every numeric feature whose
       skewness, measured on the training rows, is greater than 0.75.
    5. One-hot encode the categorical columns and fill the remaining missing
       values with the column mean.

    Args:
        train: Training frame; must contain ``GrLivArea``, ``SalePrice`` and
            the ``MSSubClass`` .. ``SaleCondition`` column range.
        test: Test frame with the same feature columns (no ``SalePrice``).

    Returns:
        A tuple ``(X_train, X_test, y)`` where ``y`` is ``log1p(SalePrice)``
        for the retained training rows. Neither input frame is modified.
    """
    # Rows with a very large living area are known outliers for this data.
    train = train.drop(train[train['GrLivArea'] > 4000].index)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))

    # Attributes for which more than a third of the values are missing.
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    # The target is modelled on a log scale; callers undo it with expm1.
    y = np.log1p(train["SalePrice"])

    # Select genuinely numeric columns. Comparing dtypes against "object"
    # would also pick up category (and dedicated string) columns, which
    # cannot be passed to skew()/log1p.
    numeric_feats = all_data.select_dtypes(include="number").columns
    skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    if len(skewed_feats):
        all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    # One-hot encode categorical features into numeric indicator columns.
    all_data = pd.get_dummies(all_data)
    # Whatever is still missing is replaced by the column mean.
    all_data = all_data.fillna(all_data.mean())

    # The stacked frame keeps train rows first, so split it back by position.
    n_train = train.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]
    return X_train, X_test, y
def mean_squared_error_(ground_truth, predictions):
    """Return the root of the mean squared error of the predictions.

    Args:
        ground_truth: True target values.
        predictions: Predicted target values.
    """
    mse = mean_squared_error(ground_truth, predictions)
    return pow(mse, 0.5)
# Scorer wrapping mean_squared_error_ for use with GridSearchCV. It is declared
# with greater_is_better=False (lower error is better), so scikit-learn reports
# the negated value; fit_predict negates grid.best_score_ again to show the error.
RMSE = make_scorer(mean_squared_error_, greater_is_better=False)
class ensemble(object):
    """Two-level stacking regressor.

    Every base model is trained with K-fold cross-validation. Its predictions
    on the held-out folds become one feature column for the second-level
    ``stacker``, whose ``alpha`` hyper-parameter is tuned by grid search.

    Attributes:
        n_folds: Number of folds used to build the out-of-fold predictions.
        stacker: Second-level estimator; must accept an ``alpha`` parameter.
        base_models: First-level estimators exposing ``fit`` and ``predict``.
    """

    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def _base_model_predictions(self, X, y, T, folds):
        """Build the second-level feature matrices from the base models.

        Args:
            X: Training features, shape ``(n_train, n_features)``.
            y: Training targets, shape ``(n_train,)``.
            T: Test features, shape ``(n_test, n_features)``.
            folds: Sequence of ``(train_idx, holdout_idx)`` index pairs.

        Returns:
            ``(S_train, S_test)``. Column ``i`` of ``S_train`` holds the
            out-of-fold predictions of base model ``i``; column ``i`` of
            ``S_test`` holds its test predictions averaged over the folds.
        """
        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))
        # Use the models given to this instance, not a module-level global.
        for i, reg in enumerate(self.base_models):
            print("Fitting the base model...")
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, holdout_idx) in enumerate(folds):
                reg.fit(X[train_idx], y[train_idx])
                # Each training row is predicted by a model that never saw it.
                S_train[holdout_idx, i] = reg.predict(X[holdout_idx])
                S_test_i[:, j] = reg.predict(T)
            S_test[:, i] = S_test_i.mean(axis=1)
        return S_train, S_test

    def fit_predict(self, train, test, ytr):
        """Fit the stacked ensemble and predict the test rows.

        Args:
            train: Training feature DataFrame.
            test: Test feature DataFrame with the same columns as ``train``.
            ytr: Training target Series aligned with ``train``.

        Returns:
            ``(y_pred, score)``: the stacker's predictions for ``test`` and
            the best cross-validated RMSE found by the grid search.

        Raises:
            ValueError: If a frame holds values that cannot be cast to float.
        """
        # Cast explicitly: a frame mixing bool indicator columns with numeric
        # ones would otherwise come out of ``.values`` as an object array.
        X = train.to_numpy(dtype=float)
        y = ytr.to_numpy(dtype=float)
        T = test.to_numpy(dtype=float)

        # Honour the fold count requested at construction time.
        kf = KFold(n_splits=self.n_folds, random_state=42, shuffle=True)
        folds = list(kf.split(X))
        S_train, S_test = self._base_model_predictions(X, y, T, folds)

        print("Stacking base models...")
        param_grid = {'alpha': [1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 0.2, 0.3, 0.4, 0.5, 0.8, 1e0, 3, 5, 7, 1e1, 2e1, 5e1]}
        grid = GridSearchCV(estimator=self.stacker, param_grid=param_grid, n_jobs=1, cv=5, scoring=RMSE)
        grid.fit(S_train, y)

        print('Param grid:')
        print(param_grid)
        print('Best Params:')
        print(grid.best_params_)
        print('Best CV Score:')
        # RMSE is a "lower is better" scorer, so the stored score is negated.
        print(-grid.best_score_)
        print('Best estimator:')
        print(grid.best_estimator_)

        y_pred = grid.predict(S_test)
        return y_pred, -grid.best_score_
if __name__ == '__main__':
    # Load the raw tables. ``test`` has to remain a module-level name because
    # create_submission reads the Ids from it.
    train = pd.read_csv("./input/train.csv")
    test = pd.read_csv("./input/test.csv")

    # First-level learners whose out-of-fold predictions feed the stacker.
    random_forest = RandomForestRegressor(
        n_estimators=500, max_depth=11, max_features=18,
        random_state=0, n_jobs=1,
    )
    extra_trees = ExtraTreesRegressor(
        n_estimators=500, max_features=20,
        random_state=0, n_jobs=1,
    )
    gradient_boosting = GradientBoostingRegressor(
        n_estimators=500, max_depth=6, max_features=10,
        learning_rate=0.05, subsample=0.8,
        random_state=0,
    )
    xgb_model = XGBRegressor(
        n_estimators=500, max_depth=7,
        learning_rate=0.05, subsample=0.8, colsample_bytree=0.75,
        seed=0,
    )
    base_models = [random_forest, extra_trees, gradient_boosting, xgb_model]

    # A ridge regression combines the base-model predictions.
    ensem = ensemble(n_folds=5, stacker=Ridge(), base_models=base_models)

    X_train, X_test, y_train = data_preprocess(train, test)
    print(X_train.head())

    log_prediction, cv_score = ensem.fit_predict(X_train, X_test, y_train)
    # The target was trained on a log1p scale, so map it back before saving.
    create_submission(np.expm1(log_prediction), cv_score)