kaggle住房预测项目——第2部分(bagging)

最新推荐文章于 2022-02-18 19:43:25 发布

BernadetteDi

最新推荐文章于 2022-02-18 19:43:25 发布

阅读量1.4k

点赞数

分类专栏： python machine learning 文章标签：机器学习 python

本文链接：https://blog.csdn.net/weixin_45004761/article/details/114845107

版权

python 同时被 2 个专栏收录

14 篇文章 1 订阅

订阅专栏

machine learning

11 篇文章 2 订阅

订阅专栏

kaggle住房预测项目——第2部分(bagging)

基线模型

import xgboost as xgb
import copy
import datetime,time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
4

from sklearn.metrics import make_scorer
def xgb_eval(data):
    def my_error_func(y_ture, y_pred):
        error = np.sqrt(mean_squared_log_error(y_ture, y_pred))
        return error
    my_score = make_scorer(my_error_func, greater_is_better=False)
    
    start = datetime.datetime.now()
    
    train_df = copy.deepcopy(data)    
    X_train=train_df.drop(['SalePrice'],axis=1)
    y_train=train_df.loc[:,'SalePrice'].values
    
    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        model, 
        param_grid=param_grid, 
        scoring=my_score,
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)
    
    print('参数的最佳取值：{0}'.format(gridsearch.best_params_))
    print('最佳模型得分:{0}'.format(-gridsearch.best_score_))
    
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return -gridsearch.best_score_, gridsearch.cv_results_

数据预处理

data = pd.concat([data_train, data_test], axis=0)

data.shape

(2919, 81)

缺失值处理

missing_data(data)

	Total	Percent
PoolQC	2909	99.657417
MiscFeature	2814	96.402878
Alley	2721	93.216855
Fence	2348	80.438506
SalePrice	1459	49.982871
FireplaceQu	1420	48.646797
LotFrontage	486	16.649538
GarageQual	159	5.447071
GarageYrBlt	159	5.447071
GarageFinish	159	5.447071
GarageCond	159	5.447071
GarageType	157	5.378554
BsmtExposure	82	2.809181
BsmtCond	82	2.809181
BsmtQual	81	2.774923
BsmtFinType2	80	2.740665
BsmtFinType1	79	2.706406
MasVnrType	24	0.822199
MasVnrArea	23	0.787941
MSZoning	4	0.137033
Utilities	2	0.068517
Functional	2	0.068517
BsmtFullBath	2	0.068517
BsmtHalfBath	2	0.068517
GarageArea	1	0.034258
BsmtFinSF2	1	0.034258
Exterior1st	1	0.034258
TotalBsmtSF	1	0.034258
GarageCars	1	0.034258
BsmtUnfSF	1	0.034258
Electrical	1	0.034258
BsmtFinSF1	1	0.034258
KitchenQual	1	0.034258
SaleType	1	0.034258
Exterior2nd	1	0.034258
Street	0	0.000000
RoofMatl	0	0.000000
MSSubClass	0	0.000000
LotArea	0	0.000000
OverallCond	0	0.000000
RoofStyle	0	0.000000
YearRemodAdd	0	0.000000
YearBuilt	0	0.000000
OverallQual	0	0.000000
HouseStyle	0	0.000000
BldgType	0	0.000000
Condition2	0	0.000000
Condition1	0	0.000000
LandSlope	0	0.000000
LotShape	0	0.000000
LandContour	0	0.000000
LotConfig	0	0.000000
Neighborhood	0	0.000000
HeatingQC	0	0.000000
ExterQual	0	0.000000
TotRmsAbvGrd	0	0.000000
YrSold	0	0.000000
MoSold	0	0.000000
MiscVal	0	0.000000
PoolArea	0	0.000000
ScreenPorch	0	0.000000
3SsnPorch	0	0.000000
EnclosedPorch	0	0.000000
OpenPorchSF	0	0.000000
WoodDeckSF	0	0.000000
PavedDrive	0	0.000000
Fireplaces	0	0.000000
KitchenAbvGr	0	0.000000
ExterCond	0	0.000000
BedroomAbvGr	0	0.000000
HalfBath	0	0.000000
FullBath	0	0.000000
GrLivArea	0	0.000000
LowQualFinSF	0	0.000000
2ndFlrSF	0	0.000000
1stFlrSF	0	0.000000
CentralAir	0	0.000000
SaleCondition	0	0.000000
Heating	0	0.000000
Foundation	0	0.000000
Id	0	0.000000

直接删除处理

# 删除属性
def delete_feature(df):
    N = df.shape[0]  # 样本数
    no_nan_count = df.count().to_frame().T  # 每一维特征非缺失值的数量
    del_feature, save_feature = [], []
    for col in no_nan_count.columns.tolist():
        loss_rate = (N - no_nan_count[col].values[0])/N  # 缺失率
        # print(loss_rate)
        if loss_rate >= 0.8:  # 缺失率大于 80% 时，将这一维特征删除
            del_feature.append(col)
        else:
            save_feature.append(col)
    return del_feature, df[save_feature]

del_feature, data = delete_feature(data)
print(del_feature)
data.head()

['Alley', 'PoolQC', 'Fence', 'MiscFeature']

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	LotShape	LandContour	Utilities	LotConfig	LandSlope	Neighborhood	Condition1	Condition2	BldgType	HouseStyle	OverallQual	OverallCond	YearBuilt	YearRemodAdd	RoofStyle	RoofMatl	Exterior1st	Exterior2nd	MasVnrType	MasVnrArea	ExterQual	ExterCond	Foundation	BsmtQual	BsmtCond	BsmtExposure	BsmtFinType1	BsmtFinSF1	BsmtFinType2	BsmtUnfSF	TotalBsmtSF	Heating	HeatingQC	CentralAir	Electrical	1stFlrSF	2ndFlrSF	GrLivArea	BsmtFullBath	BsmtHalfBath	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	KitchenQual	TotRmsAbvGrd	Functional	Fireplaces	FireplaceQu	GarageType	GarageYrBlt	GarageFinish	GarageCars	GarageArea	GarageQual	GarageCond	PavedDrive	WoodDeckSF	OpenPorchSF	EnclosedPorch	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	Reg	Lvl	AllPub	Inside	Gtl	CollgCr	Norm	Norm	1Fam	2Story	7	5	2003	2003	Gable	CompShg	VinylSd	VinylSd	BrkFace	196.0	Gd	TA	PConc	Gd	TA	No	GLQ	706.0	Unf	150.0	856.0	GasA	Ex	Y	SBrkr	856	854	1710	1.0	0.0	2	1	3	1	Gd	8	Typ	0	NaN	Attchd	2003.0	RFn	2.0	548.0	TA	TA	Y	0	61	0	2	2008	WD	Normal	208500.0
1	2	20	RL	80.0	9600	Pave	Reg	Lvl	AllPub	FR2	Gtl	Veenker	Feedr	Norm	1Fam	1Story	6	8	1976	1976	Gable	CompShg	MetalSd	MetalSd	None	0.0	TA	TA	CBlock	Gd	TA	Gd	ALQ	978.0	Unf	284.0	1262.0	GasA	Ex	Y	SBrkr	1262	0	1262	0.0	1.0	2	0	3	1	TA	6	Typ	1	TA	Attchd	1976.0	RFn	2.0	460.0	TA	TA	Y	298	0	0	5	2007	WD	Normal	181500.0
2	3	60	RL	68.0	11250	Pave	IR1	Lvl	AllPub	Inside	Gtl	CollgCr	Norm	Norm	1Fam	2Story	7	5	2001	2002	Gable	CompShg	VinylSd	VinylSd	BrkFace	162.0	Gd	TA	PConc	Gd	TA	Mn	GLQ	486.0	Unf	434.0	920.0	GasA	Ex	Y	SBrkr	920	866	1786	1.0	0.0	2	1	3	1	Gd	6	Typ	1	TA	Attchd	2001.0	RFn	2.0	608.0	TA	TA	Y	0	42	0	9	2008	WD	Normal	223500.0
3	4	70	RL	60.0	9550	Pave	IR1	Lvl	AllPub	Corner	Gtl	Crawfor	Norm	Norm	1Fam	2Story	7	5	1915	1970	Gable	CompShg	Wd Sdng	Wd Shng	None	0.0	TA	TA	BrkTil	TA	Gd	No	ALQ	216.0	Unf	540.0	756.0	GasA	Gd	Y	SBrkr	961	756	1717	1.0	0.0	1	0	3	1	Gd	7	Typ	1	Gd	Detchd	1998.0	Unf	3.0	642.0	TA	TA	Y	0	35	272	2	2006	WD	Abnorml	140000.0
4	5	60	RL	84.0	14260	Pave	IR1	Lvl	AllPub	FR2	Gtl	NoRidge	Norm	Norm	1Fam	2Story	8	5	2000	2000	Gable	CompShg	VinylSd	VinylSd	BrkFace	350.0	Gd	TA	PConc	Gd	TA	Av	GLQ	655.0	Unf	490.0	1145.0	GasA	Ex	Y	SBrkr	1145	1053	2198	1.0	0.0	2	1	4	1	Gd	9	Typ	1	TA	Attchd	2000.0	RFn	3.0	836.0	TA	TA	Y	192	84	0	12	2008	WD	Normal	250000.0

类别数据处理

序号编码

通常用来处理类别间具有大小关系的数据，比如成绩(高中低)

独热编码

通常用于处理类别间不具有大小关系的特征，比如血型(A型血、B型血、AB型血、O型血)

提示

(1)在独热编码下，特征向量只有某一维取值为1，其余值均为0，因此可以利用向量的稀疏来节省空间
(2)如果类别型的唯一类别元素较多，可能会造成维度灾难，因此需要利用特征选择来降低维度。

import copy
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
def data_class_processing(cls, data, columns):
    for column in columns:
        if cls == 'ohe':
            ohe_data = pd.get_dummies(data[column], prefix=column)
            data.drop(column, axis=1, inplace=True)
            data = pd.concat([ohe_data, data], axis=1)
        if cls == 'label':
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
    return data


columns = [
    'MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'Electrical', 
    'KitchenQual', 
    'Functional', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 
    'YrSold', 'SaleType', 'SaleCondition'        
]
data = data_class_processing('ohe',data, columns)

# CentralAir	
data['CentralAir'] = data['CentralAir'].map(lambda x: 1 if x == 'Y' else 0)

data.shape

(2919, 295)

data.head()

	SaleCondition_Abnorml	SaleCondition_Normal	SaleType_WD	YrSold_2006	YrSold_2007	YrSold_2008	PavedDrive_Y	GarageCond_TA	GarageQual_TA	GarageFinish_RFn	GarageFinish_Unf	GarageType_Attchd	GarageType_Detchd	FireplaceQu_Gd	FireplaceQu_TA	Functional_Typ	KitchenQual_Gd	KitchenQual_TA	Electrical_SBrkr	HeatingQC_Ex	HeatingQC_Gd	Heating_GasA	BsmtFinType2_Unf	BsmtFinType1_ALQ	BsmtFinType1_GLQ	BsmtExposure_Av	BsmtExposure_Gd	BsmtExposure_Mn	BsmtExposure_No	BsmtCond_Gd	BsmtCond_TA	BsmtQual_Gd	BsmtQual_TA	Foundation_BrkTil	Foundation_CBlock	Foundation_PConc	ExterCond_TA	ExterQual_Gd	ExterQual_TA	MasVnrType_BrkFace	MasVnrType_None	Exterior2nd_MetalSd	Exterior2nd_VinylSd	Exterior2nd_Wd Shng	Exterior1st_MetalSd	Exterior1st_VinylSd	Exterior1st_Wd Sdng	RoofMatl_CompShg	RoofStyle_Gable	HouseStyle_1Story	HouseStyle_2Story	BldgType_1Fam	Condition2_Norm	Condition1_Feedr	Condition1_Norm	Neighborhood_CollgCr	Neighborhood_Crawfor	Neighborhood_NoRidge	Neighborhood_Veenker	LandSlope_Gtl	LotConfig_Corner	LotConfig_FR2	LotConfig_Inside	Utilities_AllPub	LandContour_Lvl	LotShape_IR1	LotShape_Reg	Street_Pave	MSZoning_RL	MSSubClass_20	MSSubClass_60	MSSubClass_70	Id	LotFrontage	LotArea	OverallQual	OverallCond	YearBuilt	YearRemodAdd	MasVnrArea	BsmtFinSF1	BsmtUnfSF	TotalBsmtSF	CentralAir	1stFlrSF	2ndFlrSF	GrLivArea	BsmtFullBath	BsmtHalfBath	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	TotRmsAbvGrd	Fireplaces	GarageYrBlt	GarageCars	GarageArea	WoodDeckSF	OpenPorchSF	EnclosedPorch	MoSold	SalePrice
0	0	1	1	0	0	1	1	1	1	1	0	1	0	0	0	1	1	0	1	1	0	1	1	0	1	0	0	0	1	0	1	1	0	0	0	1	1	1	0	1	0	0	1	0	0	1	0	1	1	0	1	1	1	0	1	1	0	0	0	1	0	0	1	1	1	0	1	1	1	0	1	0	1	65.0	8450	7	5	2003	2003	196.0	706.0	150.0	856.0	1	856	854	1710	1.0	0.0	2	1	3	1	8	0	2003.0	2.0	548.0	0	61	0	2	208500.0
1	0	1	1	0	1	0	1	1	1	1	0	1	0	0	1	1	0	1	1	1	0	1	1	1	0	0	1	0	0	0	1	1	0	0	1	0	1	0	1	0	1	1	0	0	1	0	0	1	1	1	0	1	1	1	0	0	0	0	1	1	0	1	0	1	1	0	1	1	1	1	0	0	2	80.0	9600	6	8	1976	1976	0.0	978.0	284.0	1262.0	1	1262	0	1262	0.0	1.0	2	0	3	1	6	1	1976.0	2.0	460.0	298	0	0	5	181500.0
2	0	1	1	0	0	1	1	1	1	1	0	1	0	0	1	1	1	0	1	1	0	1	1	0	1	0	0	1	0	0	1	1	0	0	0	1	1	1	0	1	0	0	1	0	0	1	0	1	1	0	1	1	1	0	1	1	0	0	0	1	0	0	1	1	1	1	0	1	1	0	1	0	3	68.0	11250	7	5	2001	2002	162.0	486.0	434.0	920.0	1	920	866	1786	1.0	0.0	2	1	3	1	6	1	2001.0	2.0	608.0	0	42	0	9	223500.0
3	1	0	1	1	0	0	1	1	1	0	1	0	1	1	0	1	1	0	1	0	1	1	1	1	0	0	0	0	1	1	0	0	1	1	0	0	1	0	1	0	1	0	0	1	0	0	1	1	1	0	1	1	1	0	1	0	1	0	0	1	1	0	0	1	1	1	0	1	1	0	0	1	4	60.0	9550	7	5	1915	1970	0.0	216.0	540.0	756.0	1	961	756	1717	1.0	0.0	1	0	3	1	7	1	1998.0	3.0	642.0	0	35	272	2	140000.0
4	0	1	1	0	0	1	1	1	1	1	0	1	0	0	1	1	1	0	1	1	0	1	1	0	1	1	0	0	0	0	1	1	0	0	0	1	1	1	0	1	0	0	1	0	0	1	0	1	1	0	1	1	1	0	1	0	0	1	0	1	0	1	0	1	1	1	0	1	1	0	1	0	5	84.0	14260	8	5	2000	2000	350.0	655.0	490.0	1145.0	1	1145	1053	2198	1.0	0.0	2	1	4	1	9	1	2000.0	3.0	836.0	192	84	0	12	250000.0

缺失值处理

missing_data(data)

	Total	Percent
SalePrice	1459	49.982871
LotFrontage	486	16.649538
GarageYrBlt	159	5.447071
MasVnrArea	23	0.787941
BsmtFullBath	2	0.068517
BsmtHalfBath	2	0.068517
GarageCars	1	0.034258
BsmtFinSF2	1	0.034258
BsmtFinSF1	1	0.034258
BsmtUnfSF	1	0.034258
TotalBsmtSF	1	0.034258
GarageArea	1	0.034258
BsmtQual_Gd	0	0.000000
BsmtQual_TA	0	0.000000
Foundation_BrkTil	0	0.000000
BsmtQual_Ex	0	0.000000
BsmtCond_TA	0	0.000000
BsmtCond_Po	0	0.000000
Foundation_CBlock	0	0.000000
Foundation_PConc	0	0.000000
BsmtCond_Gd	0	0.000000
BsmtCond_Fa	0	0.000000
Foundation_Slab	0	0.000000
Foundation_Stone	0	0.000000
Foundation_Wood	0	0.000000
BsmtQual_Fa	0	0.000000
BsmtExposure_Mn	0	0.000000
BsmtExposure_No	0	0.000000
BsmtFinType2_Unf	0	0.000000
Heating_OthW	0	0.000000
Heating_Wall	0	0.000000
BsmtFinType2_ALQ	0	0.000000
BsmtFinType2_BLQ	0	0.000000
BsmtFinType2_GLQ	0	0.000000
BsmtFinType2_LwQ	0	0.000000
BsmtFinType2_Rec	0	0.000000
BsmtFinType1_ALQ	0	0.000000
ExterCond_Fa	0	0.000000
BsmtFinType1_BLQ	0	0.000000
BsmtFinType1_GLQ	0	0.000000
BsmtFinType1_LwQ	0	0.000000
BsmtFinType1_Rec	0	0.000000
BsmtFinType1_Unf	0	0.000000
BsmtExposure_Av	0	0.000000
BsmtExposure_Gd	0	0.000000
ExterCond_Ex	0	0.000000
ExterQual_Gd	0	0.000000
ExterCond_Gd	0	0.000000
Exterior2nd_Wd Shng	0	0.000000
Exterior2nd_Other	0	0.000000
Exterior2nd_Plywood	0	0.000000
Exterior2nd_Stone	0	0.000000
Exterior2nd_Stucco	0	0.000000
Exterior2nd_VinylSd	0	0.000000
Exterior2nd_Wd Sdng	0	0.000000
Exterior1st_AsbShng	0	0.000000
Exterior2nd_ImStucc	0	0.000000
Exterior1st_AsphShn	0	0.000000
Exterior1st_BrkComm	0	0.000000
Exterior1st_BrkFace	0	0.000000
Exterior1st_CBlock	0	0.000000
Exterior1st_CemntBd	0	0.000000
Exterior1st_HdBoard	0	0.000000
Exterior2nd_MetalSd	0	0.000000
Exterior2nd_HdBoard	0	0.000000
ExterCond_Po	0	0.000000
MasVnrType_BrkFace	0	0.000000
ExterCond_TA	0	0.000000
ExterQual_Ex	0	0.000000
ExterQual_Fa	0	0.000000
Heating_GasW	0	0.000000
ExterQual_TA	0	0.000000
MasVnrType_BrkCmn	0	0.000000
MasVnrType_None	0	0.000000
Exterior2nd_CmentBd	0	0.000000
MasVnrType_Stone	0	0.000000
Exterior2nd_AsbShng	0	0.000000
Exterior2nd_AsphShn	0	0.000000
Exterior2nd_Brk Cmn	0	0.000000
Exterior2nd_BrkFace	0	0.000000
Exterior2nd_CBlock	0	0.000000
Heating_Grav	0	0.000000
HeatingQC_Fa	0	0.000000
Heating_GasA	0	0.000000
GarageCond_Gd	0	0.000000
YrSold_2010	0	0.000000
PavedDrive_N	0	0.000000
PavedDrive_P	0	0.000000
PavedDrive_Y	0	0.000000
GarageCond_Ex	0	0.000000
GarageCond_Fa	0	0.000000
GarageCond_Po	0	0.000000
YrSold_2008	0	0.000000
GarageCond_TA	0	0.000000
GarageQual_Ex	0	0.000000
GarageQual_Fa	0	0.000000
GarageQual_Gd	0	0.000000
GarageQual_Po	0	0.000000
GarageQual_TA	0	0.000000
YrSold_2009	0	0.000000
YrSold_2007	0	0.000000
Heating_Floor	0	0.000000
SaleType_CWD	0	0.000000
SaleCondition_AdjLand	0	0.000000
SaleCondition_Alloca	0	0.000000
SaleCondition_Family	0	0.000000
SaleCondition_Normal	0	0.000000
SaleCondition_Partial	0	0.000000
SaleType_COD	0	0.000000
SaleType_Con	0	0.000000
YrSold_2006	0	0.000000
SaleType_ConLD	0	0.000000
SaleType_ConLI	0	0.000000
SaleType_ConLw	0	0.000000
SaleType_New	0	0.000000
SaleType_Oth	0	0.000000
SaleType_WD	0	0.000000
GarageFinish_Fin	0	0.000000
GarageFinish_RFn	0	0.000000
GarageFinish_Unf	0	0.000000
Electrical_FuseP	0	0.000000
KitchenQual_Ex	0	0.000000
KitchenQual_Fa	0	0.000000
KitchenQual_Gd	0	0.000000
KitchenQual_TA	0	0.000000
Electrical_FuseA	0	0.000000
Electrical_FuseF	0	0.000000
Electrical_Mix	0	0.000000
GarageType_2Types	0	0.000000
Electrical_SBrkr	0	0.000000
HeatingQC_Ex	0	0.000000
Exterior1st_MetalSd	0	0.000000
HeatingQC_Gd	0	0.000000
HeatingQC_Po	0	0.000000
HeatingQC_TA	0	0.000000
Functional_Typ	0	0.000000
Functional_Sev	0	0.000000
Functional_Mod	0	0.000000
Functional_Min2	0	0.000000
Functional_Min1	0	0.000000
Functional_Maj2	0	0.000000
Functional_Maj1	0	0.000000
FireplaceQu_TA	0	0.000000
FireplaceQu_Po	0	0.000000
FireplaceQu_Gd	0	0.000000
FireplaceQu_Fa	0	0.000000
FireplaceQu_Ex	0	0.000000
GarageType_Detchd	0	0.000000
GarageType_CarPort	0	0.000000
GarageType_BuiltIn	0	0.000000
GarageType_Basment	0	0.000000
GarageType_Attchd	0	0.000000
Exterior1st_ImStucc	0	0.000000
Exterior1st_WdShing	0	0.000000
Exterior1st_Plywood	0	0.000000
MSZoning_RH	0	0.000000
LotShape_IR3	0	0.000000
LotShape_Reg	0	0.000000
Street_Grvl	0	0.000000
Street_Pave	0	0.000000
MSZoning_C (all)	0	0.000000
MSZoning_FV	0	0.000000
MSZoning_RL	0	0.000000
LotShape_IR1	0	0.000000
MSZoning_RM	0	0.000000
MSSubClass_20	0	0.000000
MSSubClass_30	0	0.000000
MSSubClass_40	0	0.000000
MSSubClass_45	0	0.000000
MSSubClass_50	0	0.000000
LotShape_IR2	0	0.000000
LandContour_Lvl	0	0.000000
Neighborhood_Somerst	0	0.000000
LotConfig_CulDSac	0	0.000000
Neighborhood_Timber	0	0.000000
Neighborhood_Veenker	0	0.000000
LandSlope_Gtl	0	0.000000
LandSlope_Mod	0	0.000000
LandSlope_Sev	0	0.000000
LotConfig_Corner	0	0.000000
LotConfig_FR2	0	0.000000
LandContour_Low	0	0.000000
LotConfig_FR3	0	0.000000
LotConfig_Inside	0	0.000000
Utilities_AllPub	0	0.000000
Utilities_NoSeWa	0	0.000000
LandContour_Bnk	0	0.000000
LandContour_HLS	0	0.000000
MSSubClass_60	0	0.000000
MSSubClass_70	0	0.000000
MSSubClass_75	0	0.000000
Fireplaces	0	0.000000
GrLivArea	0	0.000000
FullBath	0	0.000000
HalfBath	0	0.000000
BedroomAbvGr	0	0.000000
KitchenAbvGr	0	0.000000
TotRmsAbvGrd	0	0.000000
WoodDeckSF	0	0.000000
MSSubClass_80	0	0.000000
OpenPorchSF	0	0.000000
EnclosedPorch	0	0.000000
3SsnPorch	0	0.000000
ScreenPorch	0	0.000000
PoolArea	0	0.000000
MiscVal	0	0.000000
LowQualFinSF	0	0.000000
2ndFlrSF	0	0.000000
1stFlrSF	0	0.000000
CentralAir	0	0.000000
YearRemodAdd	0	0.000000
YearBuilt	0	0.000000
OverallCond	0	0.000000
OverallQual	0	0.000000
LotArea	0	0.000000
Id	0	0.000000
MSSubClass_190	0	0.000000
MSSubClass_180	0	0.000000
MSSubClass_160	0	0.000000
MSSubClass_150	0	0.000000
MSSubClass_120	0	0.000000
MSSubClass_90	0	0.000000
MSSubClass_85	0	0.000000
Neighborhood_StoneBr	0	0.000000
Neighborhood_SawyerW	0	0.000000
Exterior1st_Stone	0	0.000000
HouseStyle_SFoyer	0	0.000000
HouseStyle_1.5Fin	0	0.000000
HouseStyle_1.5Unf	0	0.000000
HouseStyle_1Story	0	0.000000
HouseStyle_2.5Fin	0	0.000000
HouseStyle_2.5Unf	0	0.000000
HouseStyle_2Story	0	0.000000
HouseStyle_SLvl	0	0.000000
RoofStyle_Mansard	0	0.000000
BldgType_1Fam	0	0.000000
BldgType_2fmCon	0	0.000000
BldgType_Duplex	0	0.000000
BldgType_Twnhs	0	0.000000
BldgType_TwnhsE	0	0.000000
Condition2_Artery	0	0.000000
RoofStyle_Shed	0	0.000000
RoofStyle_Hip	0	0.000000
Neighborhood_Sawyer	0	0.000000
RoofMatl_Membran	0	0.000000
Exterior1st_Stucco	0	0.000000
Exterior1st_VinylSd	0	0.000000
Exterior1st_Wd Sdng	0	0.000000
MoSold	0	0.000000
RoofMatl_ClyTile	0	0.000000
RoofMatl_CompShg	0	0.000000
RoofMatl_Metal	0	0.000000
RoofStyle_Gambrel	0	0.000000
RoofMatl_Roll	0	0.000000
RoofMatl_Tar&Grv	0	0.000000
RoofMatl_WdShake	0	0.000000
RoofMatl_WdShngl	0	0.000000
RoofStyle_Flat	0	0.000000
RoofStyle_Gable	0	0.000000
Condition2_Feedr	0	0.000000
Condition2_Norm	0	0.000000
Condition2_PosA	0	0.000000
Neighborhood_Mitchel	0	0.000000
Neighborhood_CollgCr	0	0.000000
Neighborhood_Crawfor	0	0.000000
Neighborhood_Edwards	0	0.000000
Neighborhood_Gilbert	0	0.000000
Neighborhood_IDOTRR	0	0.000000
Neighborhood_MeadowV	0	0.000000
Neighborhood_NAmes	0	0.000000
Condition2_PosN	0	0.000000
Neighborhood_NPkVill	0	0.000000
Neighborhood_NWAmes	0	0.000000
Neighborhood_NoRidge	0	0.000000
Neighborhood_NridgHt	0	0.000000
Neighborhood_OldTown	0	0.000000
Neighborhood_SWISU	0	0.000000
Neighborhood_ClearCr	0	0.000000
Neighborhood_BrkSide	0	0.000000
Neighborhood_BrDale	0	0.000000
Neighborhood_Blueste	0	0.000000
Neighborhood_Blmngtn	0	0.000000
Condition1_RRNn	0	0.000000
Condition1_RRNe	0	0.000000
Condition1_RRAn	0	0.000000
Condition1_RRAe	0	0.000000
Condition1_PosN	0	0.000000
Condition1_PosA	0	0.000000
Condition1_Norm	0	0.000000
Condition1_Feedr	0	0.000000
Condition1_Artery	0	0.000000
Condition2_RRNn	0	0.000000
Condition2_RRAn	0	0.000000
Condition2_RRAe	0	0.000000
SaleCondition_Abnorml	0	0.000000

# 众数填充
def mode_fill(df,columns):
    for col in columns:
        if df[col].isnull().sum() > 0:  # 有缺失值就进行众数填充
            print(df[col].mode()[0])
            df[col].fillna(df[col].mode()[0], inplace=True)
    return df

columns = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea','BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'BsmtFinSF2',
        'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']
data = mode_fill(data,columns)

60.0
2005.0
0.0
0.0
0.0
2.0
0.0
0.0
0.0
0.0
0.0

data.shape

(2919, 295)

data.head()

	SaleCondition_Abnorml	SaleCondition_Normal	SaleType_WD	YrSold_2006	YrSold_2007	YrSold_2008	PavedDrive_Y	GarageCond_TA	GarageQual_TA	GarageFinish_RFn	GarageFinish_Unf	GarageType_Attchd	GarageType_Detchd	FireplaceQu_Gd	FireplaceQu_TA	Functional_Typ	KitchenQual_Gd	KitchenQual_TA	Electrical_SBrkr	HeatingQC_Ex	HeatingQC_Gd	Heating_GasA	BsmtFinType2_Unf	BsmtFinType1_ALQ	BsmtFinType1_GLQ	BsmtExposure_Av	BsmtExposure_Gd	BsmtExposure_Mn	BsmtExposure_No	BsmtCond_Gd	BsmtCond_TA	BsmtQual_Gd	BsmtQual_TA	Foundation_BrkTil	Foundation_CBlock	Foundation_PConc	ExterCond_TA	ExterQual_Gd	ExterQual_TA	MasVnrType_BrkFace	MasVnrType_None	Exterior2nd_MetalSd	Exterior2nd_VinylSd	Exterior2nd_Wd Shng	Exterior1st_MetalSd	Exterior1st_VinylSd	Exterior1st_Wd Sdng	RoofMatl_CompShg	RoofStyle_Gable	HouseStyle_1Story	HouseStyle_2Story	BldgType_1Fam	Condition2_Norm	Condition1_Feedr	Condition1_Norm	Neighborhood_CollgCr	Neighborhood_Crawfor	Neighborhood_NoRidge	Neighborhood_Veenker	LandSlope_Gtl	LotConfig_Corner	LotConfig_FR2	LotConfig_Inside	Utilities_AllPub	LandContour_Lvl	LotShape_IR1	LotShape_Reg	Street_Pave	MSZoning_RL	MSSubClass_20	MSSubClass_60	MSSubClass_70	Id	LotFrontage	LotArea	OverallQual	OverallCond	YearBuilt	YearRemodAdd	MasVnrArea	BsmtFinSF1	BsmtUnfSF	TotalBsmtSF	CentralAir	1stFlrSF	2ndFlrSF	GrLivArea	BsmtFullBath	BsmtHalfBath	FullBath	HalfBath	BedroomAbvGr	KitchenAbvGr	TotRmsAbvGrd	Fireplaces	GarageYrBlt	GarageCars	GarageArea	WoodDeckSF	OpenPorchSF	EnclosedPorch	MoSold	SalePrice
0	0	1	1	0	0	1	1	1	1	1	0	1	0	0	0	1	1	0	1	1	0	1	1	0	1	0	0	0	1	0	1	1	0	0	0	1	1	1	0	1	0	0	1	0	0	1	0	1	1	0	1	1	1	0	1	1	0	0	0	1	0	0	1	1	1	0	1	1	1	0	1	0	1	65.0	8450	7	5	2003	2003	196.0	706.0	150.0	856.0	1	856	854	1710	1.0	0.0	2	1	3	1	8	0	2003.0	2.0	548.0	0	61	0	2	208500.0
1	0	1	1	0	1	0	1	1	1	1	0	1	0	0	1	1	0	1	1	1	0	1	1	1	0	0	1	0	0	0	1	1	0	0	1	0	1	0	1	0	1	1	0	0	1	0	0	1	1	1	0	1	1	1	0	0	0	0	1	1	0	1	0	1	1	0	1	1	1	1	0	0	2	80.0	9600	6	8	1976	1976	0.0	978.0	284.0	1262.0	1	1262	0	1262	0.0	1.0	2	0	3	1	6	1	1976.0	2.0	460.0	298	0	0	5	181500.0
2	0	1	1	0	0	1	1	1	1	1	0	1	0	0	1	1	1	0	1	1	0	1	1	0	1	0	0	1	0	0	1	1	0	0	0	1	1	1	0	1	0	0	1	0	0	1	0	1	1	0	1	1	1	0	1	1	0	0	0	1	0	0	1	1	1	1	0	1	1	0	1	0	3	68.0	11250	7	5	2001	2002	162.0	486.0	434.0	920.0	1	920	866	1786	1.0	0.0	2	1	3	1	6	1	2001.0	2.0	608.0	0	42	0	9	223500.0
3	1	0	1	1	0	0	1	1	1	0	1	0	1	1	0	1	1	0	1	0	1	1	1	1	0	0	0	0	1	1	0	0	1	1	0	0	1	0	1	0	1	0	0	1	0	0	1	1	1	0	1	1	1	0	1	0	1	0	0	1	1	0	0	1	1	1	0	1	1	0	0	1	4	60.0	9550	7	5	1915	1970	0.0	216.0	540.0	756.0	1	961	756	1717	1.0	0.0	1	0	3	1	7	1	1998.0	3.0	642.0	0	35	272	2	140000.0
4	0	1	1	0	0	1	1	1	1	1	0	1	0	0	1	1	1	0	1	1	0	1	1	0	1	1	0	0	0	0	1	1	0	0	0	1	1	1	0	1	0	0	1	0	0	1	0	1	1	0	1	1	1	0	1	0	0	1	0	1	0	1	0	1	1	1	0	1	1	0	1	0	5	84.0	14260	8	5	2000	2000	350.0	655.0	490.0	1145.0	1	1145	1053	2198	1.0	0.0	2	1	4	1	9	1	2000.0	3.0	836.0	192	84	0	12	250000.0

train = data[data['SalePrice'].notnull()]
test = data[data['SalePrice'].isnull()].drop(['SalePrice'],axis=1)

print(train.shape)
print(test.shape)

(1460, 295)
(1459, 294)


score,_ = xgb_eval(train) 

last_score = score
print(last_score)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.7s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13210981084409507
run time is: 12 秒
0.13210981084409507

离群值查看

箱型图法

import numpy as np

def boxplot(data):
    # 下四分位数值、中位数，上四分位数值
    Q1, median, Q3 = np.percentile(data, (25, 50, 75), interpolation='midpoint')
    # 四分位距
    IQR = Q3 - Q1
    
    # 内限
    inner = [Q1-1.5*IQR, Q3+1.5*IQR]
    # 外限
    outer = [Q1-3.0*IQR, Q3+3.0*IQR]
#     print('>>>内限：', inner)
#     print('>>>外限：', outer)
    
    # 过滤掉极端异常值
#     print(len(data))
    goodData = []
    for value in data:
        if (value < outer[1]) and (value > outer[0]):
            goodData.append(value)
    print(f"异常值个数：{len(data)-len(goodData)}")
    
#     return f"异常值个数：{len(data)-len(goodData)}"

columns = [
    'LotFrontage','LotArea','MasVnrArea','BsmtFinSF1',
    'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF',
    '2ndFlrSF','LowQualFinSF','GrLivArea' ,'GarageArea',
    'WoodDeckSF','OpenPorchSF','EnclosedPorch',
    '3SsnPorch','ScreenPorch','PoolArea','MiscVal'
]
for col in columns:
    print(col)
    boxplot(train[col])

LotFrontage
异常值个数：16
LotArea
异常值个数：34
MasVnrArea
异常值个数：28
BsmtFinSF1
异常值个数：1
BsmtFinSF2
异常值个数：1460
BsmtUnfSF
异常值个数：0
TotalBsmtSF
异常值个数：5
1stFlrSF
异常值个数：3
2ndFlrSF
异常值个数：0
LowQualFinSF
异常值个数：1460
GrLivArea
异常值个数：4
GarageArea
异常值个数：3
WoodDeckSF
异常值个数：3
OpenPorchSF
异常值个数：18
EnclosedPorch
异常值个数：1460
3SsnPorch
异常值个数：1460
ScreenPorch
异常值个数：1460
PoolArea
异常值个数：1460
MiscVal
异常值个数：1460

先未作处理

无量纲化（xgboost不需要）

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

def nondimensionalized(cls, data, columns):
    def sigmoid(df):
        x_sigmoid = []
        for item in columns:
            S = 1/(1+np.exp(-df[item]))
            x_sigmoid.append(S)
        return np.array(np.matrix(x_sigmoid).T)
    def feature_importance(df):
        x_sum_scaler = []
        for item in columns:
            S = np.sum(df[item])
            FI = df[item]/S
            x_sum_scaler.append(FI)
        return np.array(np.matrix(x_sum_scaler).T)
    
        
    if cls == 'minmax': # 区间缩放法-极差标准化
        mm = MinMaxScaler()
        data[columns] = mm.fit_transform(data.loc[:,columns])
        
    if cls == 'maxabs': # 极大值标准化
        ma = MaxAbsScaler()
        data[columns] = ma.fit_transform(data.loc[:,columns])
        
    if cls == 'zscore':
        ss = StandardScaler()
        data[columns] = ss.fit_transform(data.loc[:,columns])
    
    if cls == 'feature_importance':
        feature_importance(data)
        
    if cls == 'sigmoid':
        data[columns] = sigmoid(data)
    return data

无监督离散化之分箱法

# 日期
# 19.YearBuilt， 112
# 20.YearRemodAdd  61 
# 59.GarageYrBlt 98

# 等宽分箱
# cut将根据值本身来选择箱子均匀间隔，即每个箱子的间距都是相同的

columns = ['YearBuilt','YearRemodAdd', 'GarageYrBlt']
best_k = [0, 0, 0]
for col in columns:
    for k in range(2, 30):
        data_tmp = data.copy()
        data_tmp[col] = pd.cut(data_tmp[col],k,labels=False)

        train = data_tmp[data_tmp['SalePrice'].notnull()]
        test = data_tmp[data_tmp['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
        score,_ = xgb_eval(train) 
        if score < last_score:
            last_score = score
            data = data_tmp 
            best_k[columns.index(col)] = k
print('********') 
print(best_k)
print(last_score)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1325314918976797
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.2s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.1s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13261864972243745
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13344455220740342
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13307043441098165
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1314590441980213
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.133694047769719
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13192230270881222
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13292568537231955
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13398849705479246
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1318105711452387
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13217414742944494
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1319704816242039
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13324929266598454
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092695074120808
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092632356109551
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13093923391375495
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13094010918297302
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.0s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13094073625896213
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092632356109551
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13093923391375495
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092695074120808
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1316914528478866
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1317691503724471
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13155167828308173
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13133331798520795
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13186260953000736
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13142461820170612
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13145574635885626
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1318834661627718
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1311596674312157
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13096455806523824
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13119017558334428
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13129433089210113
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13121513728294185
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13208848359797648
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13136700646533622
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1316569403135553
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13120202438894077
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1318657365901918
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.1s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13182170074446148
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13133005698117176
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1314027454132637
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13164172403740695
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13155482094502888
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13269396984932624
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13210424560599787
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13141174441143938
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13141970000036757
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1319578199368307
run time is: 8 秒
********
[3, 14, 0]
0.1309254483960009

# [1, 7, 0]
# 0.13132731648769486

# [3, 14, 0]
# 0.1309254483960009

特征构造

def xgb_feature_importance_topk(data,k):
    data_tmp = data.copy()
    train = data_tmp[data_tmp['SalePrice'].notnull()]
    X=train.drop(['SalePrice'],axis=1)
    Y=train.loc[:,'SalePrice'].values

    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        model, 
        param_grid=param_grid, 
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)

    model = gridsearch.best_estimator_

    '''
    feature_importances_方法以特征馈送给算法的顺序返回相对重要性数字。因此，为了获得前20名的功能，你会想从最到最不重要的功能，例如像这样进行排序：
    '''
    importances = model.feature_importances_ 
    indices = np.argsort(-importances)[:k] 
    columns = train.iloc[:,indices].columns.tolist()
    res = []
    for col in columns:
        for feat, importance in zip(train.columns, model.feature_importances_): 
            if col == feat:
                res.append((col,importance))
        
    return res

len(data.columns)

# '''
# 以获得每个功能名称的重要性，只是通过列名迭代和feature_importances在一起（它们相互映射）：
# '''
# for feat, importance in zip(train.columns, model.feature_importances_): 
#     print( 'feature: {f}, importance: {i}'.format(f=feat, i=importance) )

xgb_feature_importance_topk(data,295)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.3s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.0s finished





[('OverallQual', 0.4092757),
 ('GrLivArea', 0.042585827),
 ('BsmtQual_Ex', 0.03662632),
 ('RoofMatl_ClyTile', 0.035097126),
 ('GarageCars', 0.025553642),
 ('CentralAir', 0.022178357),
 ('KitchenQual_TA', 0.020976413),
 ('Exterior1st_Stucco', 0.02032021),
 ('TotalBsmtSF', 0.01820988),
 ('MSSubClass_60', 0.016657786),
 ('1stFlrSF', 0.015945055),
 ('KitchenAbvGr', 0.0152769955),
 ('BsmtFinSF1', 0.014164196),
 ('Exterior2nd_Stucco', 0.0121664),
 ('2ndFlrSF', 0.011771718),
 ('TotRmsAbvGrd', 0.00974979),
 ('KitchenQual_Ex', 0.0096869),
 ('BsmtFinType1_GLQ', 0.008715735),
 ('MSZoning_RM', 0.0077215075),
 ('BsmtQual_Gd', 0.0075940914),
 ('ExterQual_Ex', 0.007450104),
 ('GarageQual_TA', 0.0070142536),
 ('Exterior1st_AsbShng', 0.006995251),
 ('Exterior2nd_Brk Cmn', 0.0065990016),
 ('LotShape_Reg', 0.0064700344),
 ('Fireplaces', 0.0063608997),
 ('KitchenQual_Gd', 0.0062793503),
 ('MSSubClass_30', 0.0060889646),
 ('GarageArea', 0.005154548),
 ('YearRemodAdd', 0.005096418),
 ('SaleType_New', 0.0050593144),
 ('GarageType_Attchd', 0.0045187445),
 ('FireplaceQu_Fa', 0.004383841),
 ('ExterQual_Fa', 0.0037600468),
 ('BsmtFinType2_LwQ', 0.003100799),
 ('LotArea', 0.0029817864),
 ('Electrical_FuseF', 0.0029267112),
 ('BldgType_Duplex', 0.0029150224),
 ('Neighborhood_SWISU', 0.0027692046),
 ('RoofMatl_WdShngl', 0.0025965958),
 ('Heating_Grav', 0.0025687595),
 ('LandSlope_Gtl', 0.0025014824),
 ('LandSlope_Mod', 0.0024780459),
 ('FullBath', 0.0024371848),
 ('BldgType_1Fam', 0.0023931647),
 ('Foundation_Stone', 0.0022776753),
 ('MSSubClass_75', 0.0022572486),
 ('MasVnrArea', 0.0022140164),
 ('SaleCondition_Family', 0.002167297),
 ('YrSold_2008', 0.0021614458),
 ('GarageType_Detchd', 0.0021610886),
 ('LandContour_HLS', 0.002159908),
 ('SaleType_WD', 0.002155278),
 ('Neighborhood_Crawfor', 0.002129462),
 ('Neighborhood_Edwards', 0.0020690016),
 ('HalfBath', 0.0019449288),
 ('LandContour_Lvl', 0.0018609057),
 ('Functional_Mod', 0.0018403352),
 ('ExterCond_Fa', 0.001830228),
 ('MasVnrType_BrkFace', 0.001814074),
 ('LotFrontage', 0.0017950683),
 ('ExterQual_Gd', 0.0017648138),
 ('Condition1_Feedr', 0.0017304313),
 ('SaleCondition_Partial', 0.0017095393),
 ('Condition1_RRAe', 0.0017053399),
 ('MSSubClass_20', 0.0016536457),
 ('HeatingQC_Fa', 0.0016335138),
 ('ExterQual_TA', 0.0016250247),
 ('OverallCond', 0.0016106549),
 ('Neighborhood_NAmes', 0.0016082175),
 ('BsmtQual_Fa', 0.0015694612),
 ('Exterior1st_BrkFace', 0.0015455327),
 ('Neighborhood_StoneBr', 0.0015435361),
 ('BsmtFullBath', 0.0014969024),
 ('BsmtExposure_No', 0.0014912919),
 ('Functional_Maj2', 0.0014770095),
 ('Heating_OthW', 0.0014321045),
 ('Neighborhood_Veenker', 0.00142436),
 ('Functional_Typ', 0.0013758234),
 ('Neighborhood_OldTown', 0.001347905),
 ('PavedDrive_Y', 0.001347674),
 ('EnclosedPorch', 0.00134469),
 ('Neighborhood_Sawyer', 0.0012952455),
 ('GarageYrBlt', 0.0012869029),
 ('RoofMatl_CompShg', 0.0011996817),
 ('FireplaceQu_TA', 0.001159448),
 ('MSZoning_RL', 0.0011549005),
 ('PavedDrive_N', 0.001129404),
 ('RoofStyle_Shed', 0.0011210005),
 ('BsmtExposure_Gd', 0.0010897304),
 ('BedroomAbvGr', 0.0010711068),
 ('BsmtCond_Fa', 0.0010670887),
 ('Neighborhood_IDOTRR', 0.0010394471),
 ('MasVnrType_BrkCmn', 0.0010386847),
 ('KitchenQual_Fa', 0.0010143467),
 ('OpenPorchSF', 0.0010134919),
 ('SaleCondition_Abnorml', 0.000997769),
 ('WoodDeckSF', 0.0009796731),
 ('LandSlope_Sev', 0.00096981646),
 ('MSZoning_C (all)', 0.00096615497),
 ('HouseStyle_SLvl', 0.00096104806),
 ('LandContour_Low', 0.00095283776),
 ('Exterior1st_Wd Sdng', 0.00093908736),
 ('BsmtCond_Gd', 0.00092369085),
 ('BsmtUnfSF', 0.0009159962),
 ('BsmtFinSF2', 0.000914709),
 ('Exterior2nd_Plywood', 0.0009035254),
 ('Neighborhood_CollgCr', 0.0008786999),
 ('BsmtQual_TA', 0.00087709253),
 ('PavedDrive_P', 0.0008450014),
 ('BsmtFinType1_Rec', 0.00083076884),
 ('LotConfig_CulDSac', 0.00082237995),
 ('Condition1_Artery', 0.00079686724),
 ('Id', 0.0007814562),
 ('Neighborhood_Timber', 0.0007689896),
 ('MoSold', 0.0007688444),
 ('HeatingQC_TA', 0.0007654603),
 ('Exterior2nd_MetalSd', 0.0007561838),
 ('GarageFinish_Fin', 0.0007506564),
 ('FireplaceQu_Gd', 0.0007478747),
 ('Neighborhood_Somerst', 0.00073150184),
 ('ScreenPorch', 0.00071018364),
 ('Exterior2nd_AsbShng', 0.00069117936),
 ('BldgType_2fmCon', 0.00068497774),
 ('Exterior2nd_Wd Sdng', 0.0006789549),
 ('GarageQual_Fa', 0.0006756631),
 ('Exterior2nd_ImStucc', 0.0006481715),
 ('BsmtFinType1_ALQ', 0.0006466766),
 ('HouseStyle_2Story', 0.00063225214),
 ('Exterior1st_WdShing', 0.00061583845),
 ('Exterior2nd_HdBoard', 0.00061482575),
 ('Exterior1st_HdBoard', 0.0006114065),
 ('HeatingQC_Ex', 0.0006030498),
 ('Condition1_PosN', 0.0005937698),
 ('Exterior2nd_VinylSd', 0.00057505973),
 ('FireplaceQu_Ex', 0.0005464077),
 ('BsmtFinType2_Unf', 0.0005114567),
 ('LowQualFinSF', 0.0005015399),
 ('BsmtCond_TA', 0.0004988435),
 ('LotConfig_FR3', 0.00048852694),
 ('Condition1_Norm', 0.00047821572),
 ('MSSubClass_120', 0.00046705888),
 ('SaleType_ConLD', 0.00045726588),
 ('BsmtExposure_Mn', 0.0004539239),
 ('BsmtFinType1_LwQ', 0.00044687753),
 ('GarageFinish_Unf', 0.00043614442),
 ('BsmtFinType2_ALQ', 0.0004010186),
 ('FireplaceQu_Po', 0.00038877616),
 ('LotConfig_FR2', 0.0003666034),
 ('HeatingQC_Gd', 0.0003664517),
 ('YearBuilt', 0.00036322788),
 ('Exterior1st_VinylSd', 0.0003612518),
 ('BsmtExposure_Av', 0.00035657448),
 ('HouseStyle_1Story', 0.00035240524),
 ('Exterior2nd_CmentBd', 0.0003379002),
 ('Neighborhood_BrkSide', 0.00033439198),
 ('SaleCondition_Normal', 0.00033135305),
 ('PoolArea', 0.00032846778),
 ('LotShape_IR1', 0.00031836148),
 ('LotShape_IR2', 0.0003158487),
 ('BsmtFinType1_BLQ', 0.00029770628),
 ('HouseStyle_1.5Fin', 0.0002949982),
 ('ExterCond_Gd', 0.00029466968),
 ('3SsnPorch', 0.00028137083),
 ('BsmtFinType1_Unf', 0.00025151562),
 ('MasVnrType_None', 0.0002474294),
 ('BsmtHalfBath', 0.00024731937),
 ('SaleType_COD', 0.0002446929),
 ('Functional_Min1', 0.00023831776),
 ('BsmtFinType2_GLQ', 0.0002348463),
 ('YrSold_2010', 0.0002297446),
 ('GarageCond_Fa', 0.00022557852),
 ('RoofMatl_Tar&Grv', 0.00020732886),
 ('GarageQual_Gd', 0.00020601533),
 ('LandContour_Bnk', 0.00020408761),
 ('Foundation_PConc', 0.00020276985),
 ('Condition1_RRAn', 0.00020187993),
 ('BsmtFinType2_Rec', 0.00019516733),
 ('GarageType_BuiltIn', 0.00018795398),
 ('LotConfig_Corner', 0.00017717268),
 ('LotConfig_Inside', 0.00017693448),
 ('Neighborhood_ClearCr', 0.00014557355),
 ('YrSold_2007', 0.0001376982),
 ('BldgType_Twnhs', 0.00013715278),
 ('BsmtFinType2_BLQ', 0.00013597694),
 ('Electrical_SBrkr', 0.00013098777),
 ('GarageType_Basment', 0.00013070596),
 ('YrSold_2009', 0.00012943595),
 ('SaleType_ConLI', 0.00012930483),
 ('Foundation_Slab', 0.0001262763),
 ('Electrical_FuseA', 0.00012433954),
 ('Neighborhood_NridgHt', 0.00011382785),
 ('GarageQual_Ex', 0.00010800753),
 ('Foundation_CBlock', 0.00010498048),
 ('Foundation_BrkTil', 8.812534e-05),
 ('GarageFinish_RFn', 7.479326e-05),
 ('Functional_Min2', 7.3838106e-05),
 ('Functional_Maj1', 6.6195884e-05),
 ('MSSubClass_50', 5.7079727e-05),
 ('YrSold_2006', 5.544721e-05),
 ('Exterior2nd_BrkFace', 4.9046714e-05),
 ('GarageCond_Gd', 4.6189987e-05),
 ('GarageCond_TA', 3.649019e-05),
 ('ExterCond_TA', 3.0958196e-05),
 ('RoofStyle_Gable', 1.6125034e-05),
 ('SaleCondition_Alloca', 1.4236363e-05),
 ('SaleType_ConLw', 1.3619546e-05),
 ('RoofStyle_Hip', 9.960351e-06),
 ('MSZoning_RH', 0.0),
 ('GarageCond_Po', 0.0),
 ('GarageCond_Ex', 0.0),
 ('SaleCondition_AdjLand', 0.0),
 ('GarageQual_Po', 0.0),
 ('LotShape_IR3', 0.0),
 ('MSSubClass_190', 0.0),
 ('SaleType_CWD', 0.0),
 ('SaleType_Con', 0.0),
 ('MSSubClass_160', 0.0),
 ('MSSubClass_150', 0.0),
 ('MSSubClass_90', 0.0),
 ('MSSubClass_85', 0.0),
 ('MSSubClass_80', 0.0),
 ('Street_Grvl', 0.0),
 ('MSSubClass_70', 0.0),
 ('Street_Pave', 0.0),
 ('MSSubClass_45', 0.0),
 ('MSSubClass_40', 0.0),
 ('GarageType_2Types', 0.0),
 ('GarageType_CarPort', 0.0),
 ('SaleType_Oth', 0.0),
 ('MSZoning_FV', 0.0),
 ('MSSubClass_180', 0.0),
 ('Foundation_Wood', 0.0),
 ('Utilities_AllPub', 0.0),
 ('HouseStyle_2.5Fin', 0.0),
 ('HouseStyle_1.5Unf', 0.0),
 ('RoofStyle_Mansard', 0.0),
 ('RoofStyle_Gambrel', 0.0),
 ('RoofStyle_Flat', 0.0),
 ('RoofMatl_WdShake', 0.0),
 ('RoofMatl_Roll', 0.0),
 ('RoofMatl_Metal', 0.0),
 ('RoofMatl_Membran', 0.0),
 ('MiscVal', 0.0),
 ('Exterior1st_Stone', 0.0),
 ('Exterior1st_Plywood', 0.0),
 ('Exterior1st_MetalSd', 0.0),
 ('Exterior1st_ImStucc', 0.0),
 ('Exterior1st_CemntBd', 0.0),
 ('Exterior1st_CBlock', 0.0),
 ('Exterior1st_BrkComm', 0.0),
 ('Exterior1st_AsphShn', 0.0),
 ('Exterior2nd_Wd Shng', 0.0),
 ('Exterior2nd_Stone', 0.0),
 ('Exterior2nd_Other', 0.0),
 ('Exterior2nd_CBlock', 0.0),
 ('Exterior2nd_AsphShn', 0.0),
 ('MasVnrType_Stone', 0.0),
 ('BsmtCond_Po', 0.0),
 ('ExterCond_Po', 0.0),
 ('ExterCond_Ex', 0.0),
 ('HouseStyle_2.5Unf', 0.0),
 ('Utilities_NoSeWa', 0.0),
 ('HouseStyle_SFoyer', 0.0),
 ('Condition2_Artery', 0.0),
 ('Functional_Sev', 0.0),
 ('Electrical_FuseP', 0.0),
 ('Electrical_Mix', 0.0),
 ('Neighborhood_SawyerW', 0.0),
 ('Neighborhood_NoRidge', 0.0),
 ('Neighborhood_NWAmes', 0.0),
 ('Neighborhood_NPkVill', 0.0),
 ('HeatingQC_Po', 0.0),
 ('Neighborhood_Mitchel', 0.0),
 ('Neighborhood_MeadowV', 0.0),
 ('Neighborhood_Gilbert', 0.0),
 ('Heating_Floor', 0.0),
 ('Neighborhood_BrDale', 0.0),
 ('Neighborhood_Blueste', 0.0),
 ('Neighborhood_Blmngtn', 0.0),
 ('Condition1_RRNn', 0.0),
 ('Condition1_RRNe', 0.0),
 ('Heating_GasW', 0.0),
 ('Condition1_PosA', 0.0),
 ('Heating_Wall', 0.0),
 ('Condition2_RRNn', 0.0),
 ('Condition2_RRAn', 0.0),
 ('Condition2_RRAe', 0.0),
 ('Condition2_PosN', 0.0),
 ('Condition2_PosA', 0.0),
 ('Condition2_Norm', 0.0),
 ('Condition2_Feedr', 0.0),
 ('BldgType_TwnhsE', 0.0),
 ('Heating_GasA', 0.0)]

OverallQual

OverallQual feature_importance最高

17.OverallQual:
Rates the overall material and finish of the house 总体质量:评估房屋的整体材料和装饰

10 Very Excellent
9 Excellent
8 Very Good
7 Good
6 Above Average
5 Average
4 Below Average
3 Fair
2 Poor
1 Very Poor

单变量：
如果某个特征与目标高度相关，那么可以根据具体的情况取这个特征的统计值作为新的特征。

# 计数特征
# 统计单个变量数值次数作为新的特征

data_tmp = data.copy()
new_data = data_tmp.groupby(['OverallQual'])['OverallQual'].count().to_frame().rename(columns={'OverallQual':'OverallQual_count'}).reset_index()
data_tmp = pd.merge(data_tmp, new_data, on=['OverallQual'], how='inner')
print(f"OverallQual_count的唯一数据： {data_tmp['OverallQual_count'].unique()}")
train = data_tmp[data_tmp['SalePrice'].notnull()]
test = data_tmp[data_tmp['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
score,_ = xgb_eval(train) 
if score < last_score:
    print('score:',score)
    last_score = score
    data = data_tmp 

print('********') 
print(last_score)

OverallQual_count的唯一数据： [600 731 342 825 107 226  31  40   4  13]
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值：{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.15916946763469966
run time is: 9 秒
********
0.1309254483960009

特征选择

xgboost特征重要性

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_selection import SelectFromModel

def xgb_select_features(data):
    start = datetime.datetime.now()
    
    train_df = copy.deepcopy(data)    
    X=train_df.drop(['SalePrice'],axis=1)
    Y=train_df.loc[:,'SalePrice'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = np.sqrt(mean_squared_log_error(y_test, y_pred))
    print('score is: ',score)
    
    thresholds = np.sort(model.feature_importances_)
    print(thresholds)
    
    best_score = last_score
    best_thresh = 0
    best_n = 0
    best_selection = 0
    
    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        selection_model = XGBRegressor()
        selection_model.fit(select_X_train, y_train)
        
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        score = np.sqrt(mean_squared_log_error(y_test, y_pred))
        print(f'thresh = {thresh}, n = {select_X_train.shape[1]}, score = {score}')
        if score < best_score:
            best_score = score
            best_thresh = thresh
            best_n = select_X_train.shape[1]
            best_selection = selection
    print('**********')
    print('best_score',best_score)
    print('best_thresh',best_thresh)
    print('best_n',best_n)
    print('best_selection',best_selection)
    
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return best_score, best_selection

data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
test = data_tmp[data_tmp['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
score, selection = xgb_select_features(train)

score is:  0.1634269756512917
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.21226992e-06 2.89932450e-06 4.01458919e-06
 4.20125571e-06 4.55453346e-06 9.04142507e-06 9.96066228e-06
 1.00254647e-05 1.19890656e-05 1.24969820e-05 1.33192452e-05
 1.50671058e-05 1.57984032e-05 1.64488301e-05 1.65456859e-05
 1.66506961e-05 1.67086491e-05 1.68370625e-05 1.68880233e-05
 2.24718078e-05 2.45410683e-05 2.50417343e-05 2.65526050e-05
 2.66677962e-05 2.81739158e-05 3.15517100e-05 3.20705658e-05
 3.27122434e-05 3.29928953e-05 3.29968752e-05 3.39528415e-05
 3.73345647e-05 4.07940606e-05 4.34545436e-05 4.40089098e-05
 4.59742078e-05 5.13398518e-05 5.29541248e-05 5.51951016e-05
 5.89233132e-05 5.93937548e-05 6.13122611e-05 6.18340200e-05
 6.73107279e-05 6.82628670e-05 6.88516811e-05 7.01651952e-05
 7.58385213e-05 7.66053636e-05 7.91334314e-05 8.01987990e-05
 8.45546747e-05 8.55193794e-05 8.72424353e-05 9.19281811e-05
 9.60359830e-05 1.03984610e-04 1.07962944e-04 1.08941800e-04
 1.13577124e-04 1.22257770e-04 1.34212489e-04 1.49489177e-04
 1.49857922e-04 1.51855944e-04 1.52076711e-04 1.53393194e-04
 1.59260744e-04 1.67861217e-04 1.76806599e-04 1.80713236e-04
 1.82674266e-04 1.85362616e-04 1.86537334e-04 1.89300932e-04
 1.93503409e-04 2.06448414e-04 2.08053942e-04 2.08536789e-04
 2.09548874e-04 2.10550992e-04 2.15660853e-04 2.23734547e-04
 2.26629345e-04 2.27951896e-04 2.30085352e-04 2.36041815e-04
 2.38105218e-04 2.43984279e-04 2.44529539e-04 2.45253090e-04
 2.50702025e-04 2.56657251e-04 2.64317176e-04 2.67900672e-04
 2.71077937e-04 2.87270523e-04 2.95323494e-04 2.96458253e-04
 3.04090703e-04 3.09348950e-04 3.11128359e-04 3.11715499e-04
 3.27558402e-04 3.29893635e-04 3.30773328e-04 3.39805527e-04
 3.46207613e-04 3.50556540e-04 3.55951575e-04 3.66762193e-04
 3.68454348e-04 3.69566202e-04 3.73052113e-04 4.08736407e-04
 4.34989692e-04 4.35515103e-04 4.49951098e-04 4.62014752e-04
 4.70861385e-04 4.75046923e-04 4.77300986e-04 4.92862833e-04
 4.94032749e-04 5.19220601e-04 5.50425262e-04 5.56649757e-04
 5.63154230e-04 5.79251908e-04 5.94773970e-04 6.21029816e-04
 6.29337505e-04 6.31644973e-04 6.36423938e-04 6.56360935e-04
 6.75858755e-04 6.79097837e-04 7.00469827e-04 7.05525803e-04
 7.35779875e-04 7.47704995e-04 7.81524810e-04 7.98267254e-04
 8.21693102e-04 9.33263858e-04 9.45827691e-04 9.72469221e-04
 9.89662833e-04 9.92853427e-04 1.01081666e-03 1.04294647e-03
 1.04454008e-03 1.04972383e-03 1.05285691e-03 1.05973973e-03
 1.07484148e-03 1.08457590e-03 1.10268581e-03 1.12434081e-03
 1.14494795e-03 1.15139328e-03 1.17061171e-03 1.18661940e-03
 1.24276371e-03 1.29680778e-03 1.31867221e-03 1.44795922e-03
 1.56493811e-03 1.61495444e-03 1.65214005e-03 1.69188995e-03
 1.80765823e-03 1.86003116e-03 1.90357771e-03 1.94085762e-03
 1.96319749e-03 2.10982189e-03 2.21060892e-03 2.53220042e-03
 2.60955282e-03 2.63344147e-03 2.63724546e-03 2.65168841e-03
 2.68658251e-03 2.76857032e-03 2.82136258e-03 2.87449546e-03
 2.99509475e-03 3.02256388e-03 3.12633999e-03 3.13722529e-03
 3.41721484e-03 3.44115961e-03 3.52469017e-03 3.73639143e-03
 3.83520126e-03 3.90244904e-03 4.27996507e-03 4.38119797e-03
 4.49576136e-03 4.64812294e-03 5.13938162e-03 5.47109591e-03
 5.66002959e-03 6.16141642e-03 6.84734341e-03 9.20873415e-03
 1.04834912e-02 1.18552931e-02 1.25425709e-02 1.31127713e-02
 1.38130784e-02 2.41148882e-02 2.83758435e-02 4.22262028e-02
 4.42106090e-02 5.10912351e-02 5.58977053e-02 5.80743290e-02
 6.67559952e-02 3.73314440e-01]
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 2.21226991925505e-06, n = 221, score = 0.1634269756512917
thresh = 2.8993244995945133e-06, n = 220, score = 0.1634269756512917
thresh = 4.01458919441211e-06, n = 219, score = 0.1635835783361062
thresh = 4.20125570599339e-06, n = 218, score = 0.1635789199023971
thresh = 4.5545334614871535e-06, n = 217, score = 0.1635813744412464
thresh = 9.041425073519349e-06, n = 216, score = 0.16361114562337073
thresh = 9.960662282537669e-06, n = 215, score = 0.16359462119264123
thresh = 1.0025464689533692e-05, n = 214, score = 0.1635937181987366
thresh = 1.1989065569650847e-05, n = 213, score = 0.16353696589123407
thresh = 1.2496981980802957e-05, n = 212, score = 0.16375139615480133
thresh = 1.331924522673944e-05, n = 211, score = 0.1637449208155299
thresh = 1.5067105778143741e-05, n = 210, score = 0.1637449208155299
thresh = 1.57984031829983e-05, n = 209, score = 0.1642321668826825
thresh = 1.6448830137960613e-05, n = 208, score = 0.16405079653996815
thresh = 1.654568586673122e-05, n = 207, score = 0.1632326678272285
thresh = 1.665069612499792e-05, n = 206, score = 0.16252760604953023
thresh = 1.6708649127394892e-05, n = 205, score = 0.16240455414022192
thresh = 1.6837062503327616e-05, n = 204, score = 0.1633090720960328
thresh = 1.6888023310457356e-05, n = 203, score = 0.16214144060679214
thresh = 2.2471807824331336e-05, n = 202, score = 0.16326741926876243
thresh = 2.454106834193226e-05, n = 201, score = 0.16326741926876243
thresh = 2.5041734261321835e-05, n = 200, score = 0.16309584143571731
thresh = 2.655260504980106e-05, n = 199, score = 0.16310408282125827
thresh = 2.6667796191759408e-05, n = 198, score = 0.16310408282125827
thresh = 2.817391577991657e-05, n = 197, score = 0.16310408282125827
thresh = 3.155170998070389e-05, n = 196, score = 0.16339547132461527
thresh = 3.2070565794128925e-05, n = 195, score = 0.16339956137815564
thresh = 3.271224341006018e-05, n = 194, score = 0.16339956137815564
thresh = 3.299289528513327e-05, n = 193, score = 0.16352218599509166
thresh = 3.299687523394823e-05, n = 192, score = 0.16353020221215883
thresh = 3.395284147700295e-05, n = 191, score = 0.16401581124232292
thresh = 3.733456469490193e-05, n = 190, score = 0.16401581124232292
thresh = 4.0794060623738915e-05, n = 189, score = 0.1633548655544686
thresh = 4.345454362919554e-05, n = 188, score = 0.1625988150476915
thresh = 4.40089097537566e-05, n = 187, score = 0.16251178915083403
thresh = 4.597420775098726e-05, n = 186, score = 0.16273098610625014
thresh = 5.133985177963041e-05, n = 185, score = 0.1623748258496059
thresh = 5.29541248397436e-05, n = 184, score = 0.16306568065478022
thresh = 5.519510159501806e-05, n = 183, score = 0.16306568065478022
thresh = 5.892331319046207e-05, n = 182, score = 0.16306545807539408
thresh = 5.9393754781922325e-05, n = 181, score = 0.1627641191475676
thresh = 6.131226109573618e-05, n = 180, score = 0.16232541907143588
thresh = 6.183402001624927e-05, n = 179, score = 0.16165375976178925
thresh = 6.731072789989412e-05, n = 178, score = 0.1616147201897199
thresh = 6.826286698924378e-05, n = 177, score = 0.16225508227129698
thresh = 6.885168113512918e-05, n = 176, score = 0.16186094233915702
thresh = 7.016519521130249e-05, n = 175, score = 0.16255273603058806
thresh = 7.583852129755542e-05, n = 174, score = 0.16255273603058806
thresh = 7.660536357434466e-05, n = 173, score = 0.16240791832631746
thresh = 7.913343142718077e-05, n = 172, score = 0.16408492230212596
thresh = 8.019879896892235e-05, n = 171, score = 0.16554639191158738
thresh = 8.45546746859327e-05, n = 170, score = 0.16412678725386642
thresh = 8.551937935408205e-05, n = 169, score = 0.16392377325903976
thresh = 8.72424352564849e-05, n = 168, score = 0.16501163023644486
thresh = 9.192818106384948e-05, n = 167, score = 0.1612071772276545
thresh = 9.603598300600424e-05, n = 166, score = 0.16168144378177213
thresh = 0.00010398461017757654, n = 165, score = 0.1608330671877611
thresh = 0.00010796294372994453, n = 164, score = 0.160868133700044
thresh = 0.00010894180013565347, n = 163, score = 0.16192426505369828
thresh = 0.00011357712355675176, n = 162, score = 0.16155774994131725
thresh = 0.00012225777027197182, n = 161, score = 0.162509940499741
thresh = 0.00013421248877421021, n = 160, score = 0.16242071573314554
thresh = 0.0001494891766924411, n = 159, score = 0.1643892302937634
thresh = 0.00014985792222432792, n = 158, score = 0.1643892302937634
thresh = 0.00015185594384092838, n = 157, score = 0.1638682109182789
thresh = 0.00015207671094685793, n = 156, score = 0.16469652920000286
thresh = 0.00015339319361373782, n = 155, score = 0.16305642519498012
thresh = 0.00015926074411254376, n = 154, score = 0.16283903028472813
thresh = 0.00016786121705081314, n = 153, score = 0.16314592703845424
thresh = 0.00017680659948382527, n = 152, score = 0.1643053028990939
thresh = 0.0001807132357498631, n = 151, score = 0.16356331612515632
thresh = 0.00018267426639795303, n = 150, score = 0.16434283180710632
thresh = 0.00018536261632107198, n = 149, score = 0.16434283180710632
thresh = 0.00018653733422979712, n = 148, score = 0.1653295125571202
thresh = 0.00018930093210656196, n = 147, score = 0.1625449962063003
thresh = 0.00019350340880919248, n = 146, score = 0.16299693193547804
thresh = 0.0002064484142465517, n = 145, score = 0.16361440676890054
thresh = 0.00020805394160561264, n = 144, score = 0.16368977688955758
thresh = 0.00020853678870480508, n = 143, score = 0.16255519616685787
thresh = 0.000209548874408938, n = 142, score = 0.1641198542763368
thresh = 0.00021055099205113947, n = 141, score = 0.16325382141903572
thresh = 0.0002156608534278348, n = 140, score = 0.16325382141903572
thresh = 0.0002237345470348373, n = 139, score = 0.16173847547448797
thresh = 0.0002266293449793011, n = 138, score = 0.1623182921433018
thresh = 0.00022795189579483122, n = 137, score = 0.16233337429418535
thresh = 0.00023008535208646208, n = 136, score = 0.16159519657109647
thresh = 0.00023604181478731334, n = 135, score = 0.163579003818478
thresh = 0.00023810521815903485, n = 134, score = 0.16061957410222374
thresh = 0.00024398427922278643, n = 133, score = 0.16596828868389774
thresh = 0.00024452953948639333, n = 132, score = 0.1657755657445966
thresh = 0.0002452530898153782, n = 131, score = 0.16395945945497026
thresh = 0.0002507020253688097, n = 130, score = 0.16450701322995906
thresh = 0.00025665725115686655, n = 129, score = 0.16367981639020554
thresh = 0.00026431717560626566, n = 128, score = 0.16473021810564228
thresh = 0.00026790067204274237, n = 127, score = 0.16396632144606818
thresh = 0.000271077937213704, n = 126, score = 0.16396632144606818
thresh = 0.0002872705226764083, n = 125, score = 0.16354736923901833
thresh = 0.0002953234943561256, n = 124, score = 0.1652204162014962
thresh = 0.00029645825270563364, n = 123, score = 0.16373418031526485
thresh = 0.0003040907031390816, n = 122, score = 0.16374166529757242
thresh = 0.0003093489503953606, n = 121, score = 0.16578630155126245
thresh = 0.0003111283585894853, n = 120, score = 0.16599253432773428
thresh = 0.00031171549926511943, n = 119, score = 0.1652697314394485
thresh = 0.0003275584022048861, n = 118, score = 0.1649785715129932
thresh = 0.0003298936353530735, n = 117, score = 0.16558838159126518
thresh = 0.0003307733277324587, n = 116, score = 0.16380566984753903
thresh = 0.0003398055268917233, n = 115, score = 0.16559102459099373
thresh = 0.0003462076128926128, n = 114, score = 0.16293950390118359
thresh = 0.0003505565400701016, n = 113, score = 0.16566089599854528
thresh = 0.00035595157532952726, n = 112, score = 0.1632106660019096
thresh = 0.000366762193152681, n = 111, score = 0.16305305978066092
thresh = 0.0003684543480630964, n = 110, score = 0.16447802792574603
thresh = 0.000369566201698035, n = 109, score = 0.16509355390365787
thresh = 0.0003730521129909903, n = 108, score = 0.16411459468982148
thresh = 0.00040873640682548285, n = 107, score = 0.16374926836482503
thresh = 0.0004349896917119622, n = 106, score = 0.16367785169581325
thresh = 0.00043551510316319764, n = 105, score = 0.1636866731052735
thresh = 0.0004499510978348553, n = 104, score = 0.1616459879131297
thresh = 0.00046201475197449327, n = 103, score = 0.16396574130177524
thresh = 0.0004708613851107657, n = 102, score = 0.1635921707647243
thresh = 0.0004750469233840704, n = 101, score = 0.1635921707647243
thresh = 0.00047730098594911397, n = 100, score = 0.16399328347802072
thresh = 0.0004928628331981599, n = 99, score = 0.16564605568682592
thresh = 0.0004940327489748597, n = 98, score = 0.16325210406804963
thresh = 0.000519220600835979, n = 97, score = 0.16261771100132597
thresh = 0.0005504252621904016, n = 96, score = 0.1634582229497553
thresh = 0.0005566497566178441, n = 95, score = 0.16179959210227152
thresh = 0.0005631542298942804, n = 94, score = 0.16451741441940848
thresh = 0.0005792519077658653, n = 93, score = 0.16380661635734156
thresh = 0.0005947739700786769, n = 92, score = 0.16174063443377779
thresh = 0.0006210298161022365, n = 91, score = 0.16405510686507763
thresh = 0.0006293375045061111, n = 90, score = 0.16201802505482488
thresh = 0.0006316449726000428, n = 89, score = 0.16227768314190222
thresh = 0.0006364239379763603, n = 88, score = 0.1613357753838816
thresh = 0.0006563609349541366, n = 87, score = 0.15489379676447132
thresh = 0.0006758587551303208, n = 86, score = 0.15996636687018856
thresh = 0.0006790978368371725, n = 85, score = 0.16409704605134612
thresh = 0.0007004698272794485, n = 84, score = 0.1611161735435722
thresh = 0.0007055258029140532, n = 83, score = 0.15680771378798877
thresh = 0.0007357798749580979, n = 82, score = 0.15924861727829498
thresh = 0.0007477049948647618, n = 81, score = 0.15728230052812234
thresh = 0.0007815248100087047, n = 80, score = 0.15719261836399
thresh = 0.000798267254140228, n = 79, score = 0.16172815389738718
thresh = 0.0008216931018978357, n = 78, score = 0.15949682699106826
thresh = 0.0009332638583146036, n = 77, score = 0.16160607772821436
thresh = 0.0009458276908844709, n = 76, score = 0.16001279746476574
thresh = 0.0009724692208692431, n = 75, score = 0.16216081247759648
thresh = 0.0009896628325805068, n = 74, score = 0.16189270681592866
thresh = 0.0009928534273058176, n = 73, score = 0.16246173173728296
thresh = 0.0010108166607096791, n = 72, score = 0.1592728379608147
thresh = 0.0010429464746266603, n = 71, score = 0.15917095371048154
thresh = 0.0010445400839671493, n = 70, score = 0.15930647656578448
thresh = 0.001049723825417459, n = 69, score = 0.15966212326617918
thresh = 0.0010528569109737873, n = 68, score = 0.16189855076065846
thresh = 0.0010597397340461612, n = 67, score = 0.16183779727966732
thresh = 0.0010748414788395166, n = 66, score = 0.16183779727966732
thresh = 0.0010845758952200413, n = 65, score = 0.16008020907150441
thresh = 0.0011026858119294047, n = 64, score = 0.16718195491600213
thresh = 0.001124340808019042, n = 63, score = 0.16039879801638024
thresh = 0.001144947949796915, n = 62, score = 0.1611256611827503
thresh = 0.0011513932840898633, n = 61, score = 0.1611256611827503
thresh = 0.0011706117074936628, n = 60, score = 0.16379329884078375
thresh = 0.0011866193963214755, n = 59, score = 0.16408328930169308
thresh = 0.0012427637120708823, n = 58, score = 0.16537973221694566
thresh = 0.0012968077789992094, n = 57, score = 0.16442913579291546
thresh = 0.0013186722062528133, n = 56, score = 0.16507222814151795
thresh = 0.0014479592209681869, n = 55, score = 0.16703247619403705
thresh = 0.0015649381093680859, n = 54, score = 0.16808817122288616
thresh = 0.0016149544389918447, n = 53, score = 0.16466868303634474
thresh = 0.0016521400539204478, n = 52, score = 0.16458397126306654
thresh = 0.001691889949142933, n = 51, score = 0.1666959750184447
thresh = 0.0018076582346111536, n = 50, score = 0.16842319933852176
thresh = 0.001860031159594655, n = 49, score = 0.1694556190927747
thresh = 0.001903577707707882, n = 48, score = 0.16781088785246723
thresh = 0.001940857619047165, n = 47, score = 0.17114455865700695
thresh = 0.00196319748647511, n = 46, score = 0.16559975270137217
thresh = 0.0021098218858242035, n = 45, score = 0.16636048839832915
thresh = 0.0022106089163571596, n = 44, score = 0.16228946266195293
thresh = 0.0025322004221379757, n = 43, score = 0.16706401646520208
thresh = 0.002609552815556526, n = 42, score = 0.16519764397156145
thresh = 0.002633441472426057, n = 41, score = 0.16357918623510773
thresh = 0.002637245459482074, n = 40, score = 0.16505931778783206
thresh = 0.0026516884099692106, n = 39, score = 0.16802911621134767
thresh = 0.0026865825057029724, n = 38, score = 0.16695704165547676
thresh = 0.002768570324406028, n = 37, score = 0.1676591145615499
thresh = 0.00282136257737875, n = 36, score = 0.16975460824029226
thresh = 0.0028744954615831375, n = 35, score = 0.16330457796774298
thresh = 0.0029950947500765324, n = 34, score = 0.17694671133982054
thresh = 0.0030225638765841722, n = 33, score = 0.17648102502023405
thresh = 0.0031263399869203568, n = 32, score = 0.18014432945641684
thresh = 0.0031372252851724625, n = 31, score = 0.17931339793706613
thresh = 0.0034172148443758488, n = 30, score = 0.17955741533055633
thresh = 0.0034411596134305, n = 29, score = 0.17655572295901728
thresh = 0.003524690167978406, n = 28, score = 0.17824446649492398
thresh = 0.003736391430720687, n = 27, score = 0.17675593365353548
thresh = 0.0038352012634277344, n = 26, score = 0.17828571170297894
thresh = 0.0039024490397423506, n = 25, score = 0.1710506549793859
thresh = 0.004279965069144964, n = 24, score = 0.1747581895578411
thresh = 0.0043811979703605175, n = 23, score = 0.17549665567733588
thresh = 0.0044957613572478294, n = 22, score = 0.17530776404500276
thresh = 0.004648122936487198, n = 21, score = 0.17766274634743445
thresh = 0.0051393816247582436, n = 20, score = 0.17815268262237638
thresh = 0.005471095908433199, n = 19, score = 0.17804171380249728
thresh = 0.005660029593855143, n = 18, score = 0.17956257680284474
thresh = 0.006161416415125132, n = 17, score = 0.18127994476276035
thresh = 0.006847343407571316, n = 16, score = 0.18390724420240004
thresh = 0.009208734147250652, n = 15, score = 0.18680562071335757
thresh = 0.010483491234481335, n = 14, score = 0.18684297119149312
thresh = 0.011855293065309525, n = 13, score = 0.18591620833137226
thresh = 0.012542570941150188, n = 12, score = 0.18655437837361516
thresh = 0.013112771324813366, n = 11, score = 0.187595422816317
thresh = 0.0138130784034729, n = 10, score = 0.18590297656149943
thresh = 0.024114888161420822, n = 9, score = 0.20589942195000205
thresh = 0.028375843539834023, n = 8, score = 0.202828827635469
thresh = 0.04222620278596878, n = 7, score = 0.21003656282444372
thresh = 0.044210609048604965, n = 6, score = 0.2185621315162222
thresh = 0.051091235131025314, n = 5, score = 0.2191857107199869
thresh = 0.055897705256938934, n = 4, score = 0.21340924911246614
thresh = 0.05807432904839516, n = 3, score = 0.213585922331518
thresh = 0.0667559951543808, n = 2, score = 0.22994426940573962
thresh = 0.37331444025039673, n = 1, score = 0.23095892562782216
**********
best_score 0.1309254483960009
best_thresh 0
best_n 0
best_selection 0
run time is: 262 秒

计算特征与目标的相关系数以及P值

# 相关系数——特征与目标变量
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
def xgb_eval2(X, Y):
    start = datetime.datetime.now()
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        model, 
        param_grid=param_grid, 
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)
     
    print('best param is: ', gridsearch.best_params_)
    model = gridsearch.best_estimator_
    y_pred = model.predict(X_test)
    score = np.sqrt(mean_squared_log_error(y_test, y_pred))
    print('score is: ',score)
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return score, model


data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
X=train.drop(['SalePrice'],axis=1)
Y=train.loc[:,'SalePrice'].values

fun = lambda X, Y: tuple(map(tuple, np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T))
for ki in range(1, len(train.columns.tolist())):
    sb = SelectKBest(fun, k=ki)
    x_fit = sb.fit(X, Y)
    x_sb = x_fit.transform(X)
    X_newcolumnsname = train.iloc[:, x_fit.get_support(indices=True)].columns.tolist()
    #     print('>>>检验统计值(相关系数)：\n', sb.scores_)
    #     print('\n>>>P值：\n', sb.pvalues_)
    score,_ = xgb_eval2(x_sb, Y) 
    if score < last_score:
        print('score: ',score)
        print('X_newcolumnsname: ',X_newcolumnsname)
        print(f'特征个数{len(X_newcolumnsname)}')
        last_score = score
        X_newcolumnsname.append('SalePrice')
        data = data_tmp[X_newcolumnsname]       

print('********') 
print(last_score)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best param is:  {'learning_rate': 0.1, 'n_estimators': 100}
score is:  0.23095657084432536
run time is: 0 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


best param is:  {'learning_rate': 0.1, 'n_estimators': 100}
score is:  0.22204959452117293
run time is: 0 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits

… …

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.4s remaining:    5.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished


best param is:  {'learning_rate': 0.1, 'n_estimators': 100}
score is:  0.1509751277940482
run time is: 6 秒
********
0.1309254483960009

len(data.columns)

train = data[data['SalePrice'].notnull()]
test = data[data['SalePrice'].isnull()]
# data.to_csv('data__after_feature_engineering.csv',index=False)
# train.to_csv('train_after_feature_engineering.csv',index=False)
# test.to_csv('test_after_feature_engineering.csv',index=False)

模型调优

确定一个固定的学习率(学习率越小，时间成本越高）:

learning_rate的取值范围一般在[0.01,0.3]之间

如果时间充裕，可以把learning_rate设置的更小；

如果时间紧张，可以把learning_rate设置的更大；

1）调试n_estimators；

2）调试min_child_weight以及max_depth；

3）调试gamma；

4）调试subsample、colsample_bytree；

5）调试正则化参数：reg_alpha、reg_lambda；

# 自定义scoring
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
from sklearn.metrics import make_scorer
def xgb_eval(data, cv_params, other_params):
    def my_error_func(y_ture, y_pred):
        error = np.sqrt(mean_squared_log_error(y_ture, y_pred))
        return error
    my_score = make_scorer(my_error_func, greater_is_better=False)
    
    start = datetime.datetime.now()
    
    train_df = copy.deepcopy(data)    
    X_train=train_df.drop(['SalePrice'],axis=1)
    y_train=train_df.loc[:,'SalePrice'].values
    
#     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    xgb_reg = xgb.XGBRegressor(**other_params)
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        estimator=xgb_reg, 
        param_grid=cv_params, 
        scoring=my_score,
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)
    print('参数的最佳取值：{0}'.format(gridsearch.best_params_))
    print('最佳模型得分:{0}'.format(-gridsearch.best_score_))
    
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return -gridsearch.best_score_, gridsearch.cv_results_

调n_estimators

## 粗调
#粗调
cv_params = {
    'n_estimators': np.arange(10, 1500, 50),
}
other_params = {
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'min_child_weight': 1, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
plt.plot(np.arange(10, 1500, 50), cv_results['mean_test_score'])
plt.show()

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 16.2min finished


参数的最佳取值：{'n_estimators': 460}
最佳模型得分:0.1288088360602727
run time is: 977 秒

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-NUHSubOP-1615804774093)(output_273_3.png)]

# 参数的最佳取值：{'n_estimators': 460}
# 最佳模型得分:0.1288088360602727

# 细调
cv_params = {
    'n_estimators': np.arange(460-50, 460+50,1),
}
other_params = {
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'min_child_weight': 1, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}
data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
plt.plot(np.arange(460-50, 460+50,1), cv_results['mean_test_score'])
plt.show()

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 31.1min finished


参数的最佳取值：{'n_estimators': 452}
最佳模型得分:0.12878427962473019
run time is: 1870 秒

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IBbJsmb6-1615804774096)(output_275_3.png)]

调max_depth, min_child_weight

cv_params = {
    'max_depth':range(1,10,1),
    'min_child_weight':range(1,10,1)
}
other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 5, 
    'min_child_weight': 1, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}
train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 24.3min finished


参数的最佳取值：{'max_depth': 4, 'min_child_weight': 4}
最佳模型得分:0.12662196605686601
run time is: 1460 秒

调gamma

cv_params = {
    'gamma':[i/10.0 for i in range(0,5)]
}

other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.4min finished


参数的最佳取值：{'gamma': 0.0}
最佳模型得分:0.12662196605686601
run time is: 89 秒

调优subsample 和 colsample_bytree

cv_params = {
    'subsample':[i/10.0 for i in range(4,10)],
    'colsample_bytree':[i/10.0 for i in range(4,10)]
}
other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  7.3min finished


参数的最佳取值：{'colsample_bytree': 0.6, 'subsample': 0.9}
最佳模型得分:0.1248652563014746
run time is: 439 秒

调优正则化参数

cv_params = {
    'reg_alpha': [0.05, 0.1, 1, 2, 3, 10, 50, 100], 
    'reg_lambda': [0.05, 0.1, 1, 2, 3]
}
other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.9, 
    'colsample_bytree': 0.6, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  7.8min finished


参数的最佳取值：{'reg_alpha': 1, 'reg_lambda': 1}
最佳模型得分:0.1246989760441339
run time is: 471 秒

##至此，XGBoost模型调参结束，确定了最优参数和模型：
best_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.9, 
    'colsample_bytree': 0.6, 
    'gamma': 0, 
    'reg_alpha': 1, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}
best_xgb = xgb.XGBRegressor(**best_params)

X_train = data[data['SalePrice'].notnull()].drop(['SalePrice'],axis=1)
Y_train = data[data['SalePrice'].notnull()].loc[:,'SalePrice'].values
X_test = data[data['SalePrice'].isnull()].drop(['SalePrice'], axis=1)

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)

(1460, 294)
(1460,)
(1459, 294)

模型集成（融合）

基模型xgboost，使用bagging进行融合

from sklearn.ensemble import BaggingRegressor
def my_error_func(y_ture, y_pred):
    error = np.sqrt(mean_squared_log_error(y_ture, y_pred))
    return error
my_score = make_scorer(my_error_func, greater_is_better=False)
regr = BaggingRegressor(
    base_estimator = best_xgb,
    n_jobs = -1,
    random_state = 66
)
model = GridSearchCV(
        estimator=regr, 
        param_grid={'n_estimators':np.arange(1,20,1)}, 
        scoring=my_score, 
        cv=5, 
        verbose=3, 
        n_jobs=-1
)
model.fit(X_train, Y_train)
print('参数的最佳取值：{0}'.format(model.best_params_))
print('最佳模型得分:{0}'.format(-model.best_score_))

Fitting 5 folds for each of 19 candidates, totalling 95 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapsed: 38.4min finished


参数的最佳取值：{'n_estimators': 19}
最佳模型得分:0.12425136280271949

model.best_estimator_

BaggingRegressor(base_estimator=XGBRegressor(base_score=None, booster=None,
                                             colsample_bylevel=None,
                                             colsample_bynode=None,
                                             colsample_bytree=0.6, gamma=0,
                                             gpu_id=None,
                                             importance_type='gain',
                                             interaction_constraints=None,
                                             learning_rate=0.1,
                                             max_delta_step=None, max_depth=4,
                                             min_child_weight=4, missing=nan,
                                             monotone_constraints=None,
                                             n_estimators=452, n_jobs=-1,
                                             num_parallel_tree=None,
                                             random_state=666, reg_alpha=1,
                                             reg_lambda=1,
                                             scale_pos_weight=None,
                                             subsample=0.9, tree_method=None,
                                             validate_parameters=None,
                                             verbosity=None),
                 n_estimators=19, n_jobs=-1, random_state=66)

y_pred = model.best_estimator_.predict(X_test)

y_pred

array([125902.945, 163661.55 , 183499.3  , ..., 162691.47 , 113986.79 ,
       220476.8  ], dtype=float32)

y_df = pd.DataFrame(data=y_pred,columns=['SalePrice'])

y_df.head()

	SalePrice
0	125902.945312
1	163661.546875
2	183499.296875
3	191795.031250
4	186873.843750

submission = pd.concat([X_test['Id'],y_df],axis=1)

submission.head()

	Id	SalePrice
0	1461	125902.945312
1	1462	163661.546875
2	1463	183499.296875
3	1464	191795.031250
4	1465	186873.843750

submission.to_csv('submission.csv',index=False)

# from sklearn.metrics import mean_squared_log_error

# RMSLE = np.sqrt( mean_squared_log_error(y_true, y_pred) )

# print("The score is %.5f" % RMSLE )

# train = data[data['SalePrice'].notnull()]
# test = data[data['SalePrice'].isnull()]
train.to_csv('./xgb1/train_after_fe.csv',index=False)
test.to_csv('./xgb1/test_after_fe.csv',index=False)

BernadetteDi

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
kaggle住房预测项目——第2部分(bagging)

kaggle住房预测项目——第2部分基线模型import xgboost as xgbimport copyimport datetime,timefrom sklearn.model_selection import train_test_splitfrom sklearn.metrics import mean_squared_log_error4from sklearn.metrics import make_scorerdef xgb_eval(data): def my
复制链接

扫一扫

专栏目录