机器学习预测房价 (2)

参考这里

ModuleNotFoundError: No module named 'sklearn.cross_validation'

修改为
from sklearn.model_selection import KFold

ModuleNotFoundError: No module named 'sklearn.grid_search'

修改为
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd
import datetime
# from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import time
from sklearn import preprocessing
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
# from sklearn.grid_search import GridSearchCV
from sklearn.model_selection  import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Ridge, LassoCV, LassoLarsCV, ElasticNet
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from scipy.stats import skew


def create_submission(prediction, score):
    """Write the predictions to a timestamped submission CSV.

    The file is created in the current working directory and named
    ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv``. The ``Id`` column is
    taken from the module-level ``test`` DataFrame, so that global must be
    defined before this function is called.

    Args:
        prediction: Values written to the ``SalePrice`` column.
        score: Value embedded in the file name via ``str(score)``.
    """
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    sub_file = '_'.join(['submission', str(score), str(timestamp)]) + '.csv'
    print('Creating submission: ', sub_file)
    submission = pd.DataFrame({'Id': test['Id'].values, 'SalePrice': prediction})
    submission.to_csv(sub_file, index=False)


# train need to be test when do test prediction
def data_preprocess(train, test):
    """Turn the raw train/test frames into feature matrices and a log target.

    Steps, in order:
      1. Drop training rows whose ``GrLivArea`` is greater than 4000.
      2. Stack the ``MSSubClass`` .. ``SaleCondition`` columns of both frames.
      3. Remove the five columns listed in ``sparse_columns``.
      4. Replace the training ``SalePrice`` with ``log1p(SalePrice)``.
      5. Apply ``log1p`` to every non-object column whose skewness, measured
         on the remaining training rows, exceeds 0.75.
      6. One-hot encode with ``pd.get_dummies`` and fill the remaining NaNs
         with the column means of the stacked frame.

    Args:
        train: Training DataFrame; must contain ``GrLivArea``, ``SalePrice``
            and the ``MSSubClass`` .. ``SaleCondition`` column range.
        test: Test DataFrame with the same feature column range.

    Returns:
        Tuple ``(X_train, X_test, y)`` where ``y`` is the log1p-transformed
        ``SalePrice`` of the retained training rows.
    """
    # The reference material reports obvious outliers in this attribute.
    oversized = train[train['GrLivArea'] > 4000].index
    kept = train.drop(oversized)

    feature_frames = (
        kept.loc[:, 'MSSubClass':'SaleCondition'],
        test.loc[:, 'MSSubClass':'SaleCondition'],
    )
    combined = pd.concat(feature_frames)

    # Attributes that the earlier analysis found missing in more than a
    # third of the samples.
    sparse_columns = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    combined = combined.drop(sparse_columns, axis=1)

    # The target is modelled in log space.
    kept["SalePrice"] = np.log1p(kept["SalePrice"])

    # Skewness is measured on training rows only, but the transform is
    # applied to the stacked train+test frame.
    column_types = combined.dtypes
    numeric_cols = column_types[column_types != "object"].index
    skewness = kept[numeric_cols].apply(lambda col: skew(col.dropna()))
    skewed_cols = skewness[skewness > 0.75].index
    combined[skewed_cols] = np.log1p(combined[skewed_cols])

    # Categorical features become one-hot dummy columns.
    combined = pd.get_dummies(combined)
    # Whatever is still missing is replaced by the column mean.
    combined = combined.fillna(combined.mean())

    n_train = kept.shape[0]
    return combined[:n_train], combined[n_train:], kept.SalePrice


def mean_squared_error_(ground_truth, predictions):
    """Return the square root of ``mean_squared_error(ground_truth, predictions)``."""
    mse = mean_squared_error(ground_truth, predictions)
    return mse ** 0.5


# Scorer built from mean_squared_error_. It is declared with
# greater_is_better=False, and the code that reports GridSearchCV's
# best_score_ negates it to print a positive error value.
RMSE = make_scorer(mean_squared_error_, greater_is_better=False)


class ensemble(object):
    """Two-level stacking regressor.

    Each base model is fitted on K-fold splits of the training data. Its
    out-of-fold predictions become one column of the level-1 training
    matrix, and the mean of its per-fold predictions on the test data
    becomes the matching column of the level-1 test matrix. The stacker is
    then tuned on the level-1 training matrix with a grid search over
    ``alpha`` and used to predict the test set.

    Attributes are plain copies of the constructor arguments.
    """

    def __init__(self, n_folds, stacker, base_models):
        """Store the configuration.

        Args:
            n_folds: Number of K-fold splits used for the base models.
            stacker: Level-1 estimator; it must expose an ``alpha``
                hyper-parameter because the grid search tunes it.
            base_models: Sequence of level-0 estimators with ``fit`` and
                ``predict`` methods.
        """
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, train, test, ytr):
        """Fit the stack on the training data and predict the test data.

        Args:
            train: Training features; ``.values`` is used as the matrix.
            test: Test features; ``.values`` is used as the matrix.
            ytr: Training target; ``.values`` is used as the vector.

        Returns:
            Tuple ``(y_pred, score)``: the stacker's predictions for
            ``test`` and the negated ``best_score_`` of the grid search.
        """
        X = train.values
        y = ytr.values
        T = test.values

        # Honour the configured fold count instead of a hard-coded 5.
        kf = KFold(n_splits=self.n_folds, random_state=42, shuffle=True)
        folds = list(kf.split(y))

        n_models = len(self.base_models)
        S_train = np.zeros((X.shape[0], n_models))
        S_test = np.zeros((T.shape[0], n_models))

        # Iterate the instance's own models, not a module-level global.
        for i, reg in enumerate(self.base_models):
            print("Fitting the base model...")
            S_test_i = np.zeros((T.shape[0], len(folds)))
            for j, (train_idx, test_idx) in enumerate(folds):
                reg.fit(X[train_idx], y[train_idx])
                # Out-of-fold predictions fill this model's level-1 column.
                S_train[test_idx, i] = reg.predict(X[test_idx])
                S_test_i[:, j] = reg.predict(T)
            # Average this model's test predictions across folds.
            S_test[:, i] = S_test_i.mean(axis=1)

        print("Stacking base models...")
        param_grid = {'alpha': [1e-3, 5e-3, 1e-2, 5e-2, 1e-1, 0.2, 0.3, 0.4, 0.5, 0.8, 1e0, 3, 5, 7, 1e1, 2e1, 5e1]}
        grid = GridSearchCV(estimator=self.stacker, param_grid=param_grid, n_jobs=1, cv=5, scoring=RMSE)
        grid.fit(S_train, y)

        # Report the search result. The earlier version also printed an
        # undefined name and hid the resulting NameError behind a bare
        # except; that statement is removed, the printed output is the same.
        print('Param grid:')
        print(param_grid)
        print('Best Params:')
        print(grid.best_params_)
        print('Best CV Score:')
        print(-grid.best_score_)
        print('Best estimator:')
        print(grid.best_estimator_)

        y_pred = grid.predict(S_test)
        return y_pred, -grid.best_score_


if __name__ == '__main__':
    # Raw data. `test` stays a module-level name because
    # create_submission() reads its 'Id' column.
    train = pd.read_csv("./input/train.csv")
    test = pd.read_csv("./input/test.csv")

    # Level-0 learners, each built with a fixed seed.
    random_forest = RandomForestRegressor(
        n_estimators=500, max_depth=11, max_features=18,
        random_state=0, n_jobs=1,
    )
    extra_trees = ExtraTreesRegressor(
        n_estimators=500, max_features=20,
        random_state=0, n_jobs=1,
    )
    gradient_boosting = GradientBoostingRegressor(
        n_estimators=500, max_depth=6, max_features=10,
        learning_rate=0.05, subsample=0.8,
        random_state=0,
    )
    xgb_regressor = XGBRegressor(
        n_estimators=500, max_depth=7,
        learning_rate=0.05, subsample=0.8, colsample_bytree=0.75,
        seed=0,
    )
    base_models = [random_forest, extra_trees, gradient_boosting, xgb_regressor]

    # Ridge is the level-1 model combining the base predictions.
    ensem = ensemble(n_folds=5, stacker=Ridge(), base_models=base_models)

    X_train, X_test, y_train = data_preprocess(train, test)
    print(X_train.head())
    y_pred, score = ensem.fit_predict(X_train, X_test, y_train)

    # data_preprocess() log1p-transforms SalePrice, so undo that with expm1
    # before writing the submission.
    create_submission(np.expm1(y_pred), score)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

风口上的传奇

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值