kaggle住房预测项目——第2部分(bagging)

kaggle住房预测项目——第2部分(bagging)

基线模型

import xgboost as xgb
import copy
import datetime,time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
4

from sklearn.metrics import make_scorer
def xgb_eval(data):
    def my_error_func(y_ture, y_pred):
        error = np.sqrt(mean_squared_log_error(y_ture, y_pred))
        return error
    my_score = make_scorer(my_error_func, greater_is_better=False)
    
    start = datetime.datetime.now()
    
    train_df = copy.deepcopy(data)    
    X_train=train_df.drop(['SalePrice'],axis=1)
    y_train=train_df.loc[:,'SalePrice'].values
    
    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        model, 
        param_grid=param_grid, 
        scoring=my_score,
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)
    
    print('参数的最佳取值:{0}'.format(gridsearch.best_params_))
    print('最佳模型得分:{0}'.format(-gridsearch.best_score_))
    
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return -gridsearch.best_score_, gridsearch.cv_results_

数据预处理

data = pd.concat([data_train, data_test], axis=0)
data.shape
(2919, 81)

缺失值处理

missing_data(data)
TotalPercent
PoolQC290999.657417
MiscFeature281496.402878
Alley272193.216855
Fence234880.438506
SalePrice145949.982871
FireplaceQu142048.646797
LotFrontage48616.649538
GarageQual1595.447071
GarageYrBlt1595.447071
GarageFinish1595.447071
GarageCond1595.447071
GarageType1575.378554
BsmtExposure822.809181
BsmtCond822.809181
BsmtQual812.774923
BsmtFinType2802.740665
BsmtFinType1792.706406
MasVnrType240.822199
MasVnrArea230.787941
MSZoning40.137033
Utilities20.068517
Functional20.068517
BsmtFullBath20.068517
BsmtHalfBath20.068517
GarageArea10.034258
BsmtFinSF210.034258
Exterior1st10.034258
TotalBsmtSF10.034258
GarageCars10.034258
BsmtUnfSF10.034258
Electrical10.034258
BsmtFinSF110.034258
KitchenQual10.034258
SaleType10.034258
Exterior2nd10.034258
Street00.000000
RoofMatl00.000000
MSSubClass00.000000
LotArea00.000000
OverallCond00.000000
RoofStyle00.000000
YearRemodAdd00.000000
YearBuilt00.000000
OverallQual00.000000
HouseStyle00.000000
BldgType00.000000
Condition200.000000
Condition100.000000
LandSlope00.000000
LotShape00.000000
LandContour00.000000
LotConfig00.000000
Neighborhood00.000000
HeatingQC00.000000
ExterQual00.000000
TotRmsAbvGrd00.000000
YrSold00.000000
MoSold00.000000
MiscVal00.000000
PoolArea00.000000
ScreenPorch00.000000
3SsnPorch00.000000
EnclosedPorch00.000000
OpenPorchSF00.000000
WoodDeckSF00.000000
PavedDrive00.000000
Fireplaces00.000000
KitchenAbvGr00.000000
ExterCond00.000000
BedroomAbvGr00.000000
HalfBath00.000000
FullBath00.000000
GrLivArea00.000000
LowQualFinSF00.000000
2ndFlrSF00.000000
1stFlrSF00.000000
CentralAir00.000000
SaleCondition00.000000
Heating00.000000
Foundation00.000000
Id00.000000
直接删除处理
# 删除属性
def delete_feature(df):
    N = df.shape[0]  # 样本数
    no_nan_count = df.count().to_frame().T  # 每一维特征非缺失值的数量
    del_feature, save_feature = [], []
    for col in no_nan_count.columns.tolist():
        loss_rate = (N - no_nan_count[col].values[0])/N  # 缺失率
        # print(loss_rate)
        if loss_rate >= 0.8:  # 缺失率大于 80% 时,将这一维特征删除
            del_feature.append(col)
        else:
            save_feature.append(col)
    return del_feature, df[save_feature]

del_feature, data = delete_feature(data)
print(del_feature)
data.head()
['Alley', 'PoolQC', 'Fence', 'MiscFeature']
IdMSSubClassMSZoningLotFrontageLotAreaStreetLotShapeLandContourUtilitiesLotConfigLandSlopeNeighborhoodCondition1Condition2BldgTypeHouseStyleOverallQualOverallCondYearBuiltYearRemodAddRoofStyleRoofMatlExterior1stExterior2ndMasVnrTypeMasVnrAreaExterQualExterCondFoundationBsmtQualBsmtCondBsmtExposureBsmtFinType1BsmtFinSF1BsmtFinType2BsmtFinSF2BsmtUnfSFTotalBsmtSFHeatingHeatingQCCentralAirElectrical1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrKitchenQualTotRmsAbvGrdFunctionalFireplacesFireplaceQuGarageTypeGarageYrBltGarageFinishGarageCarsGarageAreaGarageQualGarageCondPavedDriveWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldYrSoldSaleTypeSaleConditionSalePrice
0160RL65.08450PaveRegLvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520032003GableCompShgVinylSdVinylSdBrkFace196.0GdTAPConcGdTANoGLQ706.0Unf0.0150.0856.0GasAExYSBrkr856854017101.00.02131Gd8Typ0NaNAttchd2003.0RFn2.0548.0TATAY0610000022008WDNormal208500.0
1220RL80.09600PaveRegLvlAllPubFR2GtlVeenkerFeedrNorm1Fam1Story6819761976GableCompShgMetalSdMetalSdNone0.0TATACBlockGdTAGdALQ978.0Unf0.0284.01262.0GasAExYSBrkr12620012620.01.02031TA6Typ1TAAttchd1976.0RFn2.0460.0TATAY29800000052007WDNormal181500.0
2360RL68.011250PaveIR1LvlAllPubInsideGtlCollgCrNormNorm1Fam2Story7520012002GableCompShgVinylSdVinylSdBrkFace162.0GdTAPConcGdTAMnGLQ486.0Unf0.0434.0920.0GasAExYSBrkr920866017861.00.02131Gd6Typ1TAAttchd2001.0RFn2.0608.0TATAY0420000092008WDNormal223500.0
3470RL60.09550PaveIR1LvlAllPubCornerGtlCrawforNormNorm1Fam2Story7519151970GableCompShgWd SdngWd ShngNone0.0TATABrkTilTAGdNoALQ216.0Unf0.0540.0756.0GasAGdYSBrkr961756017171.00.01031Gd7Typ1GdDetchd1998.0Unf3.0642.0TATAY035272000022006WDAbnorml140000.0
4560RL84.014260PaveIR1LvlAllPubFR2GtlNoRidgeNormNorm1Fam2Story8520002000GableCompShgVinylSdVinylSdBrkFace350.0GdTAPConcGdTAAvGLQ655.0Unf0.0490.01145.0GasAExYSBrkr11451053021981.00.02141Gd9Typ1TAAttchd2000.0RFn3.0836.0TATAY1928400000122008WDNormal250000.0

类别数据处理


序号编码

通常用来处理类别间具有大小关系的数据,比如成绩(高中低)

独热编码

通常用于处理类别间不具有大小关系的特征,比如血型(A型血、B型血、AB型血、O型血)

提示

  • (1)在独热编码下,特征向量只有某一维取值为1,其余值均为0,因此可以利用向量的稀疏来节省空间
  • (2)如果类别型的唯一类别元素较多,可能会造成维度灾难,因此需要利用特征选择来降低维度。
import copy
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
def data_class_processing(cls, data, columns):
    for column in columns:
        if cls == 'ohe':
            ohe_data = pd.get_dummies(data[column], prefix=column)
            data.drop(column, axis=1, inplace=True)
            data = pd.concat([ohe_data, data], axis=1)
        if cls == 'label':
            le = LabelEncoder()
            data[column] = le.fit_transform(data[column])
    return data

columns = [
    'MSSubClass', 'MSZoning', 'Street', 'LotShape', 'LandContour',
    'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
    'Condition2', 'BldgType', 'HouseStyle', 
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC',
    'Electrical', 
    'KitchenQual', 
    'Functional', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'PavedDrive', 
    'YrSold', 'SaleType', 'SaleCondition'        
]
data = data_class_processing('ohe',data, columns)
# CentralAir	
data['CentralAir'] = data['CentralAir'].map(lambda x: 1 if x == 'Y' else 0)
data.shape
(2919, 295)
data.head()
SaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_PartialSaleType_CODSaleType_CWDSaleType_ConSaleType_ConLDSaleType_ConLISaleType_ConLwSaleType_NewSaleType_OthSaleType_WDYrSold_2006YrSold_2007YrSold_2008YrSold_2009YrSold_2010PavedDrive_NPavedDrive_PPavedDrive_YGarageCond_ExGarageCond_FaGarageCond_GdGarageCond_PoGarageCond_TAGarageQual_ExGarageQual_FaGarageQual_GdGarageQual_PoGarageQual_TAGarageFinish_FinGarageFinish_RFnGarageFinish_UnfGarageType_2TypesGarageType_AttchdGarageType_BasmentGarageType_BuiltInGarageType_CarPortGarageType_DetchdFireplaceQu_ExFireplaceQu_FaFireplaceQu_GdFireplaceQu_PoFireplaceQu_TAFunctional_Maj1Functional_Maj2Functional_Min1Functional_Min2Functional_ModFunctional_SevFunctional_TypKitchenQual_ExKitchenQual_FaKitchenQual_GdKitchenQual_TAElectrical_FuseAElectrical_FuseFElectrical_FusePElectrical_MixElectrical_SBrkrHeatingQC_ExHeatingQC_FaHeatingQC_GdHeatingQC_PoHeatingQC_TAHeating_FloorHeating_GasAHeating_GasWHeating_GravHeating_OthWHeating_WallBsmtFinType2_ALQBsmtFinType2_BLQBsmtFinType2_GLQBsmtFinType2_LwQBsmtFinType2_RecBsmtFinType2_UnfBsmtFinType1_ALQBsmtFinType1_BLQBsmtFinType1_GLQBsmtFinType1_LwQBsmtFinType1_RecBsmtFinType1_UnfBsmtExposure_AvBsmtExposure_GdBsmtExposure_MnBsmtExposure_NoBsmtCond_FaBsmtCond_GdBsmtCond_PoBsmtCond_TABsmtQual_ExBsmtQual_FaBsmtQual_GdBsmtQual_TAFoundation_BrkTilFoundation_CBlockFoundation_PConcFoundation_SlabFoundation_StoneFoundation_WoodExterCond_ExExterCond_FaExterCond_GdExterCond_PoExterCond_TAExterQual_ExExterQual_FaExterQual_GdExterQual_TAMasVnrType_BrkCmnMasVnrType_BrkFaceMasVnrType_NoneMasVnrType_StoneExterior2nd_AsbShngExterior2nd_AsphShnExterior2nd_Brk CmnExterior2nd_BrkFaceExterior2nd_CBlockExterior2nd_CmentBdExterior2nd_HdBoardExterior2nd_ImStuccExterior2nd_MetalSdExterior2nd_OtherExterior2nd_PlywoodExterior2nd_StoneExterior2nd_StuccoExterior2nd_VinylSdExterior2nd_Wd SdngExterior2nd_Wd ShngExterior1st_AsbShngExterior1st_AsphShnExterior1st_BrkCommExterior1st_BrkFaceExterior1st_CBlockExterior1st_CemntBdExterior1st_HdBoardExterior1st_ImStuccExterior1st_MetalSdExterior1st_PlywoodExterior1st_StoneExterior1st_StuccoExterior1st_VinylSdExterior1st_Wd SdngExterior1st_WdShingRoofMatl_ClyTileRoofMatl_CompShgRoofMatl_MembranRoofMatl_MetalRoofMatl_RollRoofMatl_Tar&GrvRoofMatl_WdShakeRoofMatl_WdShnglRoofStyle_FlatRoofStyle_GableRoofStyle_GambrelRoofStyle_HipRoofStyle_MansardRoofStyle_ShedHouseStyle_1.5FinHouseStyle_1.5UnfHouseStyle_1StoryHouseStyle_2.5FinHouseStyle_2.5UnfHouseStyle_2StoryHouseStyle_SFoyerHouseStyle_SLvlBldgType_1FamBldgType_2fmConBldgType_DuplexBldgType_TwnhsBldgType_TwnhsECondition2_ArteryCondition2_FeedrCondition2_NormCondition2_PosACondition2_PosNCondition2_RRAeCondition2_RRAnCondition2_RRNnCondition1_ArteryCondition1_FeedrCondition1_NormCondition1_PosACondition1_PosNCondition1_RRAeCondition1_RRAnCondition1_RRNeCondition1_RRNnNeighborhood_BlmngtnNeighborhood_BluesteNeighborhood_BrDaleNeighborhood_BrkSideNeighborhood_ClearCrNeighborhood_CollgCrNeighborhood_CrawforNeighborhood_EdwardsNeighborhood_GilbertNeighborhood_IDOTRRNeighborhood_MeadowVNeighborhood_MitchelNeighborhood_NAmesNeighborhood_NPkVillNeighborhood_NWAmesNeighborhood_NoRidgeNeighborhood_NridgHtNeighborhood_OldTownNeighborhood_SWISUNeighborhood_SawyerNeighborhood_SawyerWNeighborhood_SomerstNeighborhood_StoneBrNeighborhood_TimberNeighborhood_VeenkerLandSlope_GtlLandSlope_ModLandSlope_SevLotConfig_CornerLotConfig_CulDSacLotConfig_FR2LotConfig_FR3LotConfig_InsideUtilities_AllPubUtilities_NoSeWaLandContour_BnkLandContour_HLSLandContour_LowLandContour_LvlLotShape_IR1LotShape_IR2LotShape_IR3LotShape_RegStreet_GrvlStreet_PaveMSZoning_C (all)MSZoning_FVMSZoning_RHMSZoning_RLMSZoning_RMMSSubClass_20MSSubClass_30MSSubClass_40MSSubClass_45MSSubClass_50MSSubClass_60MSSubClass_70MSSubClass_75MSSubClass_80MSSubClass_85MSSubClass_90MSSubClass_120MSSubClass_150MSSubClass_160MSSubClass_180MSSubClass_190IdLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1BsmtFinSF2BsmtUnfSFTotalBsmtSFCentralAir1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrTotRmsAbvGrdFireplacesGarageYrBltGarageCarsGarageAreaWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldSalePrice
0000010000000001001000010000100001010010000000000000001001000001100000100000000010010000001000100100010000000100100100000000000000010000000000000010001000000010000000001001000000100000001000000000001000000000000000000010000001100001000101000100000010000000000165.084507520032003196.0706.00.0150.0856.01856854017101.00.02131802003.02.0548.0061000002208500.0
1000010000000001010000010000100001010010000000010000001000100001100000100000000011000000100000100100100000000100010010000000001000000000000000100000001000000010000001000001000000100000010000000000000000000000000000000110000100100001000101000101000000000000000280.0960068197619760.0978.00.0284.01262.0112620012620.01.02031611976.02.0460.02980000005181500.0
2000010000000001001000010000100001010010000000010000001001000001100000100000000010010000010000100100010000000100100100000000000000010000000000000010001000000010000000001001000000100000001000000000001000000000000000000010000001100001100001000100000010000000000368.0112507520012002162.0486.00.0434.0920.01920866017861.00.02131612001.02.0608.0042000009223500.0
3100000000000001100000010000100001001000001001000000001001000001001000100000000011000000001010000011000000000100010010000000000000000100000000000001001000000010000000001001000000100000001000000000000100000000000000000010010000100001100001000100000001000000000460.0955075191519700.0216.00.0540.0756.01961756017171.00.01031711998.03.0642.003527200002140000.0
4000010000000001001000010000100001010010000000010000001001000001100000100000000010010001000000100100010000000100100100000000000000010000000000000010001000000010000000001001000000100000001000000000000000000000100000000010000100100001100001000100000010000000000584.0142608520002000350.0655.00.0490.01145.0111451053021981.00.02141912000.03.0836.0192840000012250000.0

缺失值处理

missing_data(data)
TotalPercent
SalePrice145949.982871
LotFrontage48616.649538
GarageYrBlt1595.447071
MasVnrArea230.787941
BsmtFullBath20.068517
BsmtHalfBath20.068517
GarageCars10.034258
BsmtFinSF210.034258
BsmtFinSF110.034258
BsmtUnfSF10.034258
TotalBsmtSF10.034258
GarageArea10.034258
BsmtQual_Gd00.000000
BsmtQual_TA00.000000
Foundation_BrkTil00.000000
BsmtQual_Ex00.000000
BsmtCond_TA00.000000
BsmtCond_Po00.000000
Foundation_CBlock00.000000
Foundation_PConc00.000000
BsmtCond_Gd00.000000
BsmtCond_Fa00.000000
Foundation_Slab00.000000
Foundation_Stone00.000000
Foundation_Wood00.000000
BsmtQual_Fa00.000000
BsmtExposure_Mn00.000000
BsmtExposure_No00.000000
BsmtFinType2_Unf00.000000
Heating_OthW00.000000
Heating_Wall00.000000
BsmtFinType2_ALQ00.000000
BsmtFinType2_BLQ00.000000
BsmtFinType2_GLQ00.000000
BsmtFinType2_LwQ00.000000
BsmtFinType2_Rec00.000000
BsmtFinType1_ALQ00.000000
ExterCond_Fa00.000000
BsmtFinType1_BLQ00.000000
BsmtFinType1_GLQ00.000000
BsmtFinType1_LwQ00.000000
BsmtFinType1_Rec00.000000
BsmtFinType1_Unf00.000000
BsmtExposure_Av00.000000
BsmtExposure_Gd00.000000
ExterCond_Ex00.000000
ExterQual_Gd00.000000
ExterCond_Gd00.000000
Exterior2nd_Wd Shng00.000000
Exterior2nd_Other00.000000
Exterior2nd_Plywood00.000000
Exterior2nd_Stone00.000000
Exterior2nd_Stucco00.000000
Exterior2nd_VinylSd00.000000
Exterior2nd_Wd Sdng00.000000
Exterior1st_AsbShng00.000000
Exterior2nd_ImStucc00.000000
Exterior1st_AsphShn00.000000
Exterior1st_BrkComm00.000000
Exterior1st_BrkFace00.000000
Exterior1st_CBlock00.000000
Exterior1st_CemntBd00.000000
Exterior1st_HdBoard00.000000
Exterior2nd_MetalSd00.000000
Exterior2nd_HdBoard00.000000
ExterCond_Po00.000000
MasVnrType_BrkFace00.000000
ExterCond_TA00.000000
ExterQual_Ex00.000000
ExterQual_Fa00.000000
Heating_GasW00.000000
ExterQual_TA00.000000
MasVnrType_BrkCmn00.000000
MasVnrType_None00.000000
Exterior2nd_CmentBd00.000000
MasVnrType_Stone00.000000
Exterior2nd_AsbShng00.000000
Exterior2nd_AsphShn00.000000
Exterior2nd_Brk Cmn00.000000
Exterior2nd_BrkFace00.000000
Exterior2nd_CBlock00.000000
Heating_Grav00.000000
HeatingQC_Fa00.000000
Heating_GasA00.000000
GarageCond_Gd00.000000
YrSold_201000.000000
PavedDrive_N00.000000
PavedDrive_P00.000000
PavedDrive_Y00.000000
GarageCond_Ex00.000000
GarageCond_Fa00.000000
GarageCond_Po00.000000
YrSold_200800.000000
GarageCond_TA00.000000
GarageQual_Ex00.000000
GarageQual_Fa00.000000
GarageQual_Gd00.000000
GarageQual_Po00.000000
GarageQual_TA00.000000
YrSold_200900.000000
YrSold_200700.000000
Heating_Floor00.000000
SaleType_CWD00.000000
SaleCondition_AdjLand00.000000
SaleCondition_Alloca00.000000
SaleCondition_Family00.000000
SaleCondition_Normal00.000000
SaleCondition_Partial00.000000
SaleType_COD00.000000
SaleType_Con00.000000
YrSold_200600.000000
SaleType_ConLD00.000000
SaleType_ConLI00.000000
SaleType_ConLw00.000000
SaleType_New00.000000
SaleType_Oth00.000000
SaleType_WD00.000000
GarageFinish_Fin00.000000
GarageFinish_RFn00.000000
GarageFinish_Unf00.000000
Electrical_FuseP00.000000
KitchenQual_Ex00.000000
KitchenQual_Fa00.000000
KitchenQual_Gd00.000000
KitchenQual_TA00.000000
Electrical_FuseA00.000000
Electrical_FuseF00.000000
Electrical_Mix00.000000
GarageType_2Types00.000000
Electrical_SBrkr00.000000
HeatingQC_Ex00.000000
Exterior1st_MetalSd00.000000
HeatingQC_Gd00.000000
HeatingQC_Po00.000000
HeatingQC_TA00.000000
Functional_Typ00.000000
Functional_Sev00.000000
Functional_Mod00.000000
Functional_Min200.000000
Functional_Min100.000000
Functional_Maj200.000000
Functional_Maj100.000000
FireplaceQu_TA00.000000
FireplaceQu_Po00.000000
FireplaceQu_Gd00.000000
FireplaceQu_Fa00.000000
FireplaceQu_Ex00.000000
GarageType_Detchd00.000000
GarageType_CarPort00.000000
GarageType_BuiltIn00.000000
GarageType_Basment00.000000
GarageType_Attchd00.000000
Exterior1st_ImStucc00.000000
Exterior1st_WdShing00.000000
Exterior1st_Plywood00.000000
MSZoning_RH00.000000
LotShape_IR300.000000
LotShape_Reg00.000000
Street_Grvl00.000000
Street_Pave00.000000
MSZoning_C (all)00.000000
MSZoning_FV00.000000
MSZoning_RL00.000000
LotShape_IR100.000000
MSZoning_RM00.000000
MSSubClass_2000.000000
MSSubClass_3000.000000
MSSubClass_4000.000000
MSSubClass_4500.000000
MSSubClass_5000.000000
LotShape_IR200.000000
LandContour_Lvl00.000000
Neighborhood_Somerst00.000000
LotConfig_CulDSac00.000000
Neighborhood_Timber00.000000
Neighborhood_Veenker00.000000
LandSlope_Gtl00.000000
LandSlope_Mod00.000000
LandSlope_Sev00.000000
LotConfig_Corner00.000000
LotConfig_FR200.000000
LandContour_Low00.000000
LotConfig_FR300.000000
LotConfig_Inside00.000000
Utilities_AllPub00.000000
Utilities_NoSeWa00.000000
LandContour_Bnk00.000000
LandContour_HLS00.000000
MSSubClass_6000.000000
MSSubClass_7000.000000
MSSubClass_7500.000000
Fireplaces00.000000
GrLivArea00.000000
FullBath00.000000
HalfBath00.000000
BedroomAbvGr00.000000
KitchenAbvGr00.000000
TotRmsAbvGrd00.000000
WoodDeckSF00.000000
MSSubClass_8000.000000
OpenPorchSF00.000000
EnclosedPorch00.000000
3SsnPorch00.000000
ScreenPorch00.000000
PoolArea00.000000
MiscVal00.000000
LowQualFinSF00.000000
2ndFlrSF00.000000
1stFlrSF00.000000
CentralAir00.000000
YearRemodAdd00.000000
YearBuilt00.000000
OverallCond00.000000
OverallQual00.000000
LotArea00.000000
Id00.000000
MSSubClass_19000.000000
MSSubClass_18000.000000
MSSubClass_16000.000000
MSSubClass_15000.000000
MSSubClass_12000.000000
MSSubClass_9000.000000
MSSubClass_8500.000000
Neighborhood_StoneBr00.000000
Neighborhood_SawyerW00.000000
Exterior1st_Stone00.000000
HouseStyle_SFoyer00.000000
HouseStyle_1.5Fin00.000000
HouseStyle_1.5Unf00.000000
HouseStyle_1Story00.000000
HouseStyle_2.5Fin00.000000
HouseStyle_2.5Unf00.000000
HouseStyle_2Story00.000000
HouseStyle_SLvl00.000000
RoofStyle_Mansard00.000000
BldgType_1Fam00.000000
BldgType_2fmCon00.000000
BldgType_Duplex00.000000
BldgType_Twnhs00.000000
BldgType_TwnhsE00.000000
Condition2_Artery00.000000
RoofStyle_Shed00.000000
RoofStyle_Hip00.000000
Neighborhood_Sawyer00.000000
RoofMatl_Membran00.000000
Exterior1st_Stucco00.000000
Exterior1st_VinylSd00.000000
Exterior1st_Wd Sdng00.000000
MoSold00.000000
RoofMatl_ClyTile00.000000
RoofMatl_CompShg00.000000
RoofMatl_Metal00.000000
RoofStyle_Gambrel00.000000
RoofMatl_Roll00.000000
RoofMatl_Tar&Grv00.000000
RoofMatl_WdShake00.000000
RoofMatl_WdShngl00.000000
RoofStyle_Flat00.000000
RoofStyle_Gable00.000000
Condition2_Feedr00.000000
Condition2_Norm00.000000
Condition2_PosA00.000000
Neighborhood_Mitchel00.000000
Neighborhood_CollgCr00.000000
Neighborhood_Crawfor00.000000
Neighborhood_Edwards00.000000
Neighborhood_Gilbert00.000000
Neighborhood_IDOTRR00.000000
Neighborhood_MeadowV00.000000
Neighborhood_NAmes00.000000
Condition2_PosN00.000000
Neighborhood_NPkVill00.000000
Neighborhood_NWAmes00.000000
Neighborhood_NoRidge00.000000
Neighborhood_NridgHt00.000000
Neighborhood_OldTown00.000000
Neighborhood_SWISU00.000000
Neighborhood_ClearCr00.000000
Neighborhood_BrkSide00.000000
Neighborhood_BrDale00.000000
Neighborhood_Blueste00.000000
Neighborhood_Blmngtn00.000000
Condition1_RRNn00.000000
Condition1_RRNe00.000000
Condition1_RRAn00.000000
Condition1_RRAe00.000000
Condition1_PosN00.000000
Condition1_PosA00.000000
Condition1_Norm00.000000
Condition1_Feedr00.000000
Condition1_Artery00.000000
Condition2_RRNn00.000000
Condition2_RRAn00.000000
Condition2_RRAe00.000000
SaleCondition_Abnorml00.000000
# 众数填充
def mode_fill(df,columns):
    for col in columns:
        if df[col].isnull().sum() > 0:  # 有缺失值就进行众数填充
            print(df[col].mode()[0])
            df[col].fillna(df[col].mode()[0], inplace=True)
    return df

columns = ['LotFrontage', 'GarageYrBlt', 'MasVnrArea','BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'BsmtFinSF2',
        'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea']
data = mode_fill(data,columns)
60.0
2005.0
0.0
0.0
0.0
2.0
0.0
0.0
0.0
0.0
0.0
data.shape
(2919, 295)
data.head()
SaleCondition_AbnormlSaleCondition_AdjLandSaleCondition_AllocaSaleCondition_FamilySaleCondition_NormalSaleCondition_PartialSaleType_CODSaleType_CWDSaleType_ConSaleType_ConLDSaleType_ConLISaleType_ConLwSaleType_NewSaleType_OthSaleType_WDYrSold_2006YrSold_2007YrSold_2008YrSold_2009YrSold_2010PavedDrive_NPavedDrive_PPavedDrive_YGarageCond_ExGarageCond_FaGarageCond_GdGarageCond_PoGarageCond_TAGarageQual_ExGarageQual_FaGarageQual_GdGarageQual_PoGarageQual_TAGarageFinish_FinGarageFinish_RFnGarageFinish_UnfGarageType_2TypesGarageType_AttchdGarageType_BasmentGarageType_BuiltInGarageType_CarPortGarageType_DetchdFireplaceQu_ExFireplaceQu_FaFireplaceQu_GdFireplaceQu_PoFireplaceQu_TAFunctional_Maj1Functional_Maj2Functional_Min1Functional_Min2Functional_ModFunctional_SevFunctional_TypKitchenQual_ExKitchenQual_FaKitchenQual_GdKitchenQual_TAElectrical_FuseAElectrical_FuseFElectrical_FusePElectrical_MixElectrical_SBrkrHeatingQC_ExHeatingQC_FaHeatingQC_GdHeatingQC_PoHeatingQC_TAHeating_FloorHeating_GasAHeating_GasWHeating_GravHeating_OthWHeating_WallBsmtFinType2_ALQBsmtFinType2_BLQBsmtFinType2_GLQBsmtFinType2_LwQBsmtFinType2_RecBsmtFinType2_UnfBsmtFinType1_ALQBsmtFinType1_BLQBsmtFinType1_GLQBsmtFinType1_LwQBsmtFinType1_RecBsmtFinType1_UnfBsmtExposure_AvBsmtExposure_GdBsmtExposure_MnBsmtExposure_NoBsmtCond_FaBsmtCond_GdBsmtCond_PoBsmtCond_TABsmtQual_ExBsmtQual_FaBsmtQual_GdBsmtQual_TAFoundation_BrkTilFoundation_CBlockFoundation_PConcFoundation_SlabFoundation_StoneFoundation_WoodExterCond_ExExterCond_FaExterCond_GdExterCond_PoExterCond_TAExterQual_ExExterQual_FaExterQual_GdExterQual_TAMasVnrType_BrkCmnMasVnrType_BrkFaceMasVnrType_NoneMasVnrType_StoneExterior2nd_AsbShngExterior2nd_AsphShnExterior2nd_Brk CmnExterior2nd_BrkFaceExterior2nd_CBlockExterior2nd_CmentBdExterior2nd_HdBoardExterior2nd_ImStuccExterior2nd_MetalSdExterior2nd_OtherExterior2nd_PlywoodExterior2nd_StoneExterior2nd_StuccoExterior2nd_VinylSdExterior2nd_Wd SdngExterior2nd_Wd ShngExterior1st_AsbShngExterior1st_AsphShnExterior1st_BrkCommExterior1st_BrkFaceExterior1st_CBlockExterior1st_CemntBdExterior1st_HdBoardExterior1st_ImStuccExterior1st_MetalSdExterior1st_PlywoodExterior1st_StoneExterior1st_StuccoExterior1st_VinylSdExterior1st_Wd SdngExterior1st_WdShingRoofMatl_ClyTileRoofMatl_CompShgRoofMatl_MembranRoofMatl_MetalRoofMatl_RollRoofMatl_Tar&GrvRoofMatl_WdShakeRoofMatl_WdShnglRoofStyle_FlatRoofStyle_GableRoofStyle_GambrelRoofStyle_HipRoofStyle_MansardRoofStyle_ShedHouseStyle_1.5FinHouseStyle_1.5UnfHouseStyle_1StoryHouseStyle_2.5FinHouseStyle_2.5UnfHouseStyle_2StoryHouseStyle_SFoyerHouseStyle_SLvlBldgType_1FamBldgType_2fmConBldgType_DuplexBldgType_TwnhsBldgType_TwnhsECondition2_ArteryCondition2_FeedrCondition2_NormCondition2_PosACondition2_PosNCondition2_RRAeCondition2_RRAnCondition2_RRNnCondition1_ArteryCondition1_FeedrCondition1_NormCondition1_PosACondition1_PosNCondition1_RRAeCondition1_RRAnCondition1_RRNeCondition1_RRNnNeighborhood_BlmngtnNeighborhood_BluesteNeighborhood_BrDaleNeighborhood_BrkSideNeighborhood_ClearCrNeighborhood_CollgCrNeighborhood_CrawforNeighborhood_EdwardsNeighborhood_GilbertNeighborhood_IDOTRRNeighborhood_MeadowVNeighborhood_MitchelNeighborhood_NAmesNeighborhood_NPkVillNeighborhood_NWAmesNeighborhood_NoRidgeNeighborhood_NridgHtNeighborhood_OldTownNeighborhood_SWISUNeighborhood_SawyerNeighborhood_SawyerWNeighborhood_SomerstNeighborhood_StoneBrNeighborhood_TimberNeighborhood_VeenkerLandSlope_GtlLandSlope_ModLandSlope_SevLotConfig_CornerLotConfig_CulDSacLotConfig_FR2LotConfig_FR3LotConfig_InsideUtilities_AllPubUtilities_NoSeWaLandContour_BnkLandContour_HLSLandContour_LowLandContour_LvlLotShape_IR1LotShape_IR2LotShape_IR3LotShape_RegStreet_GrvlStreet_PaveMSZoning_C (all)MSZoning_FVMSZoning_RHMSZoning_RLMSZoning_RMMSSubClass_20MSSubClass_30MSSubClass_40MSSubClass_45MSSubClass_50MSSubClass_60MSSubClass_70MSSubClass_75MSSubClass_80MSSubClass_85MSSubClass_90MSSubClass_120MSSubClass_150MSSubClass_160MSSubClass_180MSSubClass_190IdLotFrontageLotAreaOverallQualOverallCondYearBuiltYearRemodAddMasVnrAreaBsmtFinSF1BsmtFinSF2BsmtUnfSFTotalBsmtSFCentralAir1stFlrSF2ndFlrSFLowQualFinSFGrLivAreaBsmtFullBathBsmtHalfBathFullBathHalfBathBedroomAbvGrKitchenAbvGrTotRmsAbvGrdFireplacesGarageYrBltGarageCarsGarageAreaWoodDeckSFOpenPorchSFEnclosedPorch3SsnPorchScreenPorchPoolAreaMiscValMoSoldSalePrice
0000010000000001001000010000100001010010000000000000001001000001100000100000000010010000001000100100010000000100100100000000000000010000000000000010001000000010000000001001000000100000001000000000001000000000000000000010000001100001000101000100000010000000000165.084507520032003196.0706.00.0150.0856.01856854017101.00.02131802003.02.0548.0061000002208500.0
1000010000000001010000010000100001010010000000010000001000100001100000100000000011000000100000100100100000000100010010000000001000000000000000100000001000000010000001000001000000100000010000000000000000000000000000000110000100100001000101000101000000000000000280.0960068197619760.0978.00.0284.01262.0112620012620.01.02031611976.02.0460.02980000005181500.0
2000010000000001001000010000100001010010000000010000001001000001100000100000000010010000010000100100010000000100100100000000000000010000000000000010001000000010000000001001000000100000001000000000001000000000000000000010000001100001100001000100000010000000000368.0112507520012002162.0486.00.0434.0920.01920866017861.00.02131612001.02.0608.0042000009223500.0
3100000000000001100000010000100001001000001001000000001001000001001000100000000011000000001010000011000000000100010010000000000000000100000000000001001000000010000000001001000000100000001000000000000100000000000000000010010000100001100001000100000001000000000460.0955075191519700.0216.00.0540.0756.01961756017171.00.01031711998.03.0642.003527200002140000.0
4000010000000001001000010000100001010010000000010000001001000001100000100000000010010001000000100100010000000100100100000000000000010000000000000010001000000010000000001001000000100000001000000000000000000000100000000010000100100001100001000100000010000000000584.0142608520002000350.0655.00.0490.01145.0111451053021981.00.02141912000.03.0836.0192840000012250000.0
train = data[data['SalePrice'].notnull()]
test = data[data['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
print(train.shape)
print(test.shape)
(1460, 295)
(1459, 294)

score,_ = xgb_eval(train) 

last_score = score
print(last_score)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.7s remaining:   11.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13210981084409507
run time is: 12 秒
0.13210981084409507

离群值查看

箱型图法
import numpy as np

def boxplot(data):
    # 下四分位数值、中位数,上四分位数值
    Q1, median, Q3 = np.percentile(data, (25, 50, 75), interpolation='midpoint')
    # 四分位距
    IQR = Q3 - Q1
    
    # 内限
    inner = [Q1-1.5*IQR, Q3+1.5*IQR]
    # 外限
    outer = [Q1-3.0*IQR, Q3+3.0*IQR]
#     print('>>>内限:', inner)
#     print('>>>外限:', outer)
    
    # 过滤掉极端异常值
#     print(len(data))
    goodData = []
    for value in data:
        if (value < outer[1]) and (value > outer[0]):
            goodData.append(value)
    print(f"异常值个数:{len(data)-len(goodData)}")
    
#     return f"异常值个数:{len(data)-len(goodData)}"


columns = [
    'LotFrontage','LotArea','MasVnrArea','BsmtFinSF1',
    'BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF',
    '2ndFlrSF','LowQualFinSF','GrLivArea' ,'GarageArea',
    'WoodDeckSF','OpenPorchSF','EnclosedPorch',
    '3SsnPorch','ScreenPorch','PoolArea','MiscVal'
]
for col in columns:
    print(col)
    boxplot(train[col])
LotFrontage
异常值个数:16
LotArea
异常值个数:34
MasVnrArea
异常值个数:28
BsmtFinSF1
异常值个数:1
BsmtFinSF2
异常值个数:1460
BsmtUnfSF
异常值个数:0
TotalBsmtSF
异常值个数:5
1stFlrSF
异常值个数:3
2ndFlrSF
异常值个数:0
LowQualFinSF
异常值个数:1460
GrLivArea
异常值个数:4
GarageArea
异常值个数:3
WoodDeckSF
异常值个数:3
OpenPorchSF
异常值个数:18
EnclosedPorch
异常值个数:1460
3SsnPorch
异常值个数:1460
ScreenPorch
异常值个数:1460
PoolArea
异常值个数:1460
MiscVal
异常值个数:1460

先未作处理



无量纲化(xgboost不需要)

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler

def nondimensionalized(cls, data, columns):
    def sigmoid(df):
        x_sigmoid = []
        for item in columns:
            S = 1/(1+np.exp(-df[item]))
            x_sigmoid.append(S)
        return np.array(np.matrix(x_sigmoid).T)
    def feature_importance(df):
        x_sum_scaler = []
        for item in columns:
            S = np.sum(df[item])
            FI = df[item]/S
            x_sum_scaler.append(FI)
        return np.array(np.matrix(x_sum_scaler).T)
    
        
    if cls == 'minmax': # 区间缩放法-极差标准化
        mm = MinMaxScaler()
        data[columns] = mm.fit_transform(data.loc[:,columns])
        
    if cls == 'maxabs': # 极大值标准化
        ma = MaxAbsScaler()
        data[columns] = ma.fit_transform(data.loc[:,columns])
        
    if cls == 'zscore':
        ss = StandardScaler()
        data[columns] = ss.fit_transform(data.loc[:,columns])
    
    if cls == 'feature_importance':
        feature_importance(data)
        
    if cls == 'sigmoid':
        data[columns] = sigmoid(data)
    return data

无监督离散化之分箱法

# 日期
# 19.YearBuilt, 112
# 20.YearRemodAdd  61 
# 59.GarageYrBlt 98
# 等宽分箱
# cut将根据值本身来选择箱子均匀间隔,即每个箱子的间距都是相同的

columns = ['YearBuilt','YearRemodAdd', 'GarageYrBlt']
best_k = [0, 0, 0]
for col in columns:
    for k in range(2, 30):
        data_tmp = data.copy()
        data_tmp[col] = pd.cut(data_tmp[col],k,labels=False)

        train = data_tmp[data_tmp['SalePrice'].notnull()]
        test = data_tmp[data_tmp['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
        score,_ = xgb_eval(train) 
        if score < last_score:
            last_score = score
            data = data_tmp 
            best_k[columns.index(col)] = k
print('********') 
print(best_k)
print(last_score)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1325314918976797
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.2s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.1s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.8s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13128651826155793
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13261864972243745
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13344455220740342
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13307043441098165
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1314590441980213
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.133694047769719
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13192230270881222
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13292568537231955
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13398849705479246
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1318105711452387
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13217414742944494
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1319704816242039
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13324929266598454
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092695074120808
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092632356109551
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13093923391375495
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13094010918297302
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.0s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13094073625896213
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092632356109551
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13093923391375495
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1309254483960009
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13092695074120808
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1316914528478866
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1317691503724471
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13155167828308173
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13133331798520795
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13186260953000736
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13142461820170612
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13145574635885626
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1318834661627718
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1311596674312157
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13096455806523824
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13119017558334428
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13129433089210113
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13121513728294185
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13208848359797648
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13136700646533622
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1316569403135553
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13120202438894077
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1318657365901918
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.1s remaining:    7.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.6s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13182170074446148
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13133005698117176
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.5s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1314027454132637
run time is: 9 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13164172403740695
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13155482094502888
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13269396984932624
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13210424560599787
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13141174441143938
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.13141970000036757
run time is: 8 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.7s remaining:    7.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.2s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.1319578199368307
run time is: 8 秒
********
[3, 14, 0]
0.1309254483960009
# [1, 7, 0]
# 0.13132731648769486

# [3, 14, 0]
# 0.1309254483960009

特征构造

def xgb_feature_importance_topk(data,k):
    data_tmp = data.copy()
    train = data_tmp[data_tmp['SalePrice'].notnull()]
    X=train.drop(['SalePrice'],axis=1)
    Y=train.loc[:,'SalePrice'].values

    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        model, 
        param_grid=param_grid, 
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)

    model = gridsearch.best_estimator_

    '''
    feature_importances_方法以特征馈送给算法的顺序返回相对重要性数字。因此,为了获得前20名的功能,你会想从最到最不重要的功能,例如像这样进行排序:
    '''
    importances = model.feature_importances_ 
    indices = np.argsort(-importances)[:k] 
    columns = train.iloc[:,indices].columns.tolist()
    res = []
    for col in columns:
        for feat, importance in zip(train.columns, model.feature_importances_): 
            if col == feat:
                res.append((col,importance))
        
    return res
len(data.columns)
295
# '''
# 以获得每个功能名称的重要性,只是通过列名迭代和feature_importances在一起(它们相互映射):
# '''
# for feat, importance in zip(train.columns, model.feature_importances_): 
#     print( 'feature: {f}, importance: {i}'.format(f=feat, i=importance) )
xgb_feature_importance_topk(data,295)
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.3s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.0s finished





[('OverallQual', 0.4092757),
 ('GrLivArea', 0.042585827),
 ('BsmtQual_Ex', 0.03662632),
 ('RoofMatl_ClyTile', 0.035097126),
 ('GarageCars', 0.025553642),
 ('CentralAir', 0.022178357),
 ('KitchenQual_TA', 0.020976413),
 ('Exterior1st_Stucco', 0.02032021),
 ('TotalBsmtSF', 0.01820988),
 ('MSSubClass_60', 0.016657786),
 ('1stFlrSF', 0.015945055),
 ('KitchenAbvGr', 0.0152769955),
 ('BsmtFinSF1', 0.014164196),
 ('Exterior2nd_Stucco', 0.0121664),
 ('2ndFlrSF', 0.011771718),
 ('TotRmsAbvGrd', 0.00974979),
 ('KitchenQual_Ex', 0.0096869),
 ('BsmtFinType1_GLQ', 0.008715735),
 ('MSZoning_RM', 0.0077215075),
 ('BsmtQual_Gd', 0.0075940914),
 ('ExterQual_Ex', 0.007450104),
 ('GarageQual_TA', 0.0070142536),
 ('Exterior1st_AsbShng', 0.006995251),
 ('Exterior2nd_Brk Cmn', 0.0065990016),
 ('LotShape_Reg', 0.0064700344),
 ('Fireplaces', 0.0063608997),
 ('KitchenQual_Gd', 0.0062793503),
 ('MSSubClass_30', 0.0060889646),
 ('GarageArea', 0.005154548),
 ('YearRemodAdd', 0.005096418),
 ('SaleType_New', 0.0050593144),
 ('GarageType_Attchd', 0.0045187445),
 ('FireplaceQu_Fa', 0.004383841),
 ('ExterQual_Fa', 0.0037600468),
 ('BsmtFinType2_LwQ', 0.003100799),
 ('LotArea', 0.0029817864),
 ('Electrical_FuseF', 0.0029267112),
 ('BldgType_Duplex', 0.0029150224),
 ('Neighborhood_SWISU', 0.0027692046),
 ('RoofMatl_WdShngl', 0.0025965958),
 ('Heating_Grav', 0.0025687595),
 ('LandSlope_Gtl', 0.0025014824),
 ('LandSlope_Mod', 0.0024780459),
 ('FullBath', 0.0024371848),
 ('BldgType_1Fam', 0.0023931647),
 ('Foundation_Stone', 0.0022776753),
 ('MSSubClass_75', 0.0022572486),
 ('MasVnrArea', 0.0022140164),
 ('SaleCondition_Family', 0.002167297),
 ('YrSold_2008', 0.0021614458),
 ('GarageType_Detchd', 0.0021610886),
 ('LandContour_HLS', 0.002159908),
 ('SaleType_WD', 0.002155278),
 ('Neighborhood_Crawfor', 0.002129462),
 ('Neighborhood_Edwards', 0.0020690016),
 ('HalfBath', 0.0019449288),
 ('LandContour_Lvl', 0.0018609057),
 ('Functional_Mod', 0.0018403352),
 ('ExterCond_Fa', 0.001830228),
 ('MasVnrType_BrkFace', 0.001814074),
 ('LotFrontage', 0.0017950683),
 ('ExterQual_Gd', 0.0017648138),
 ('Condition1_Feedr', 0.0017304313),
 ('SaleCondition_Partial', 0.0017095393),
 ('Condition1_RRAe', 0.0017053399),
 ('MSSubClass_20', 0.0016536457),
 ('HeatingQC_Fa', 0.0016335138),
 ('ExterQual_TA', 0.0016250247),
 ('OverallCond', 0.0016106549),
 ('Neighborhood_NAmes', 0.0016082175),
 ('BsmtQual_Fa', 0.0015694612),
 ('Exterior1st_BrkFace', 0.0015455327),
 ('Neighborhood_StoneBr', 0.0015435361),
 ('BsmtFullBath', 0.0014969024),
 ('BsmtExposure_No', 0.0014912919),
 ('Functional_Maj2', 0.0014770095),
 ('Heating_OthW', 0.0014321045),
 ('Neighborhood_Veenker', 0.00142436),
 ('Functional_Typ', 0.0013758234),
 ('Neighborhood_OldTown', 0.001347905),
 ('PavedDrive_Y', 0.001347674),
 ('EnclosedPorch', 0.00134469),
 ('Neighborhood_Sawyer', 0.0012952455),
 ('GarageYrBlt', 0.0012869029),
 ('RoofMatl_CompShg', 0.0011996817),
 ('FireplaceQu_TA', 0.001159448),
 ('MSZoning_RL', 0.0011549005),
 ('PavedDrive_N', 0.001129404),
 ('RoofStyle_Shed', 0.0011210005),
 ('BsmtExposure_Gd', 0.0010897304),
 ('BedroomAbvGr', 0.0010711068),
 ('BsmtCond_Fa', 0.0010670887),
 ('Neighborhood_IDOTRR', 0.0010394471),
 ('MasVnrType_BrkCmn', 0.0010386847),
 ('KitchenQual_Fa', 0.0010143467),
 ('OpenPorchSF', 0.0010134919),
 ('SaleCondition_Abnorml', 0.000997769),
 ('WoodDeckSF', 0.0009796731),
 ('LandSlope_Sev', 0.00096981646),
 ('MSZoning_C (all)', 0.00096615497),
 ('HouseStyle_SLvl', 0.00096104806),
 ('LandContour_Low', 0.00095283776),
 ('Exterior1st_Wd Sdng', 0.00093908736),
 ('BsmtCond_Gd', 0.00092369085),
 ('BsmtUnfSF', 0.0009159962),
 ('BsmtFinSF2', 0.000914709),
 ('Exterior2nd_Plywood', 0.0009035254),
 ('Neighborhood_CollgCr', 0.0008786999),
 ('BsmtQual_TA', 0.00087709253),
 ('PavedDrive_P', 0.0008450014),
 ('BsmtFinType1_Rec', 0.00083076884),
 ('LotConfig_CulDSac', 0.00082237995),
 ('Condition1_Artery', 0.00079686724),
 ('Id', 0.0007814562),
 ('Neighborhood_Timber', 0.0007689896),
 ('MoSold', 0.0007688444),
 ('HeatingQC_TA', 0.0007654603),
 ('Exterior2nd_MetalSd', 0.0007561838),
 ('GarageFinish_Fin', 0.0007506564),
 ('FireplaceQu_Gd', 0.0007478747),
 ('Neighborhood_Somerst', 0.00073150184),
 ('ScreenPorch', 0.00071018364),
 ('Exterior2nd_AsbShng', 0.00069117936),
 ('BldgType_2fmCon', 0.00068497774),
 ('Exterior2nd_Wd Sdng', 0.0006789549),
 ('GarageQual_Fa', 0.0006756631),
 ('Exterior2nd_ImStucc', 0.0006481715),
 ('BsmtFinType1_ALQ', 0.0006466766),
 ('HouseStyle_2Story', 0.00063225214),
 ('Exterior1st_WdShing', 0.00061583845),
 ('Exterior2nd_HdBoard', 0.00061482575),
 ('Exterior1st_HdBoard', 0.0006114065),
 ('HeatingQC_Ex', 0.0006030498),
 ('Condition1_PosN', 0.0005937698),
 ('Exterior2nd_VinylSd', 0.00057505973),
 ('FireplaceQu_Ex', 0.0005464077),
 ('BsmtFinType2_Unf', 0.0005114567),
 ('LowQualFinSF', 0.0005015399),
 ('BsmtCond_TA', 0.0004988435),
 ('LotConfig_FR3', 0.00048852694),
 ('Condition1_Norm', 0.00047821572),
 ('MSSubClass_120', 0.00046705888),
 ('SaleType_ConLD', 0.00045726588),
 ('BsmtExposure_Mn', 0.0004539239),
 ('BsmtFinType1_LwQ', 0.00044687753),
 ('GarageFinish_Unf', 0.00043614442),
 ('BsmtFinType2_ALQ', 0.0004010186),
 ('FireplaceQu_Po', 0.00038877616),
 ('LotConfig_FR2', 0.0003666034),
 ('HeatingQC_Gd', 0.0003664517),
 ('YearBuilt', 0.00036322788),
 ('Exterior1st_VinylSd', 0.0003612518),
 ('BsmtExposure_Av', 0.00035657448),
 ('HouseStyle_1Story', 0.00035240524),
 ('Exterior2nd_CmentBd', 0.0003379002),
 ('Neighborhood_BrkSide', 0.00033439198),
 ('SaleCondition_Normal', 0.00033135305),
 ('PoolArea', 0.00032846778),
 ('LotShape_IR1', 0.00031836148),
 ('LotShape_IR2', 0.0003158487),
 ('BsmtFinType1_BLQ', 0.00029770628),
 ('HouseStyle_1.5Fin', 0.0002949982),
 ('ExterCond_Gd', 0.00029466968),
 ('3SsnPorch', 0.00028137083),
 ('BsmtFinType1_Unf', 0.00025151562),
 ('MasVnrType_None', 0.0002474294),
 ('BsmtHalfBath', 0.00024731937),
 ('SaleType_COD', 0.0002446929),
 ('Functional_Min1', 0.00023831776),
 ('BsmtFinType2_GLQ', 0.0002348463),
 ('YrSold_2010', 0.0002297446),
 ('GarageCond_Fa', 0.00022557852),
 ('RoofMatl_Tar&Grv', 0.00020732886),
 ('GarageQual_Gd', 0.00020601533),
 ('LandContour_Bnk', 0.00020408761),
 ('Foundation_PConc', 0.00020276985),
 ('Condition1_RRAn', 0.00020187993),
 ('BsmtFinType2_Rec', 0.00019516733),
 ('GarageType_BuiltIn', 0.00018795398),
 ('LotConfig_Corner', 0.00017717268),
 ('LotConfig_Inside', 0.00017693448),
 ('Neighborhood_ClearCr', 0.00014557355),
 ('YrSold_2007', 0.0001376982),
 ('BldgType_Twnhs', 0.00013715278),
 ('BsmtFinType2_BLQ', 0.00013597694),
 ('Electrical_SBrkr', 0.00013098777),
 ('GarageType_Basment', 0.00013070596),
 ('YrSold_2009', 0.00012943595),
 ('SaleType_ConLI', 0.00012930483),
 ('Foundation_Slab', 0.0001262763),
 ('Electrical_FuseA', 0.00012433954),
 ('Neighborhood_NridgHt', 0.00011382785),
 ('GarageQual_Ex', 0.00010800753),
 ('Foundation_CBlock', 0.00010498048),
 ('Foundation_BrkTil', 8.812534e-05),
 ('GarageFinish_RFn', 7.479326e-05),
 ('Functional_Min2', 7.3838106e-05),
 ('Functional_Maj1', 6.6195884e-05),
 ('MSSubClass_50', 5.7079727e-05),
 ('YrSold_2006', 5.544721e-05),
 ('Exterior2nd_BrkFace', 4.9046714e-05),
 ('GarageCond_Gd', 4.6189987e-05),
 ('GarageCond_TA', 3.649019e-05),
 ('ExterCond_TA', 3.0958196e-05),
 ('RoofStyle_Gable', 1.6125034e-05),
 ('SaleCondition_Alloca', 1.4236363e-05),
 ('SaleType_ConLw', 1.3619546e-05),
 ('RoofStyle_Hip', 9.960351e-06),
 ('MSZoning_RH', 0.0),
 ('GarageCond_Po', 0.0),
 ('GarageCond_Ex', 0.0),
 ('SaleCondition_AdjLand', 0.0),
 ('GarageQual_Po', 0.0),
 ('LotShape_IR3', 0.0),
 ('MSSubClass_190', 0.0),
 ('SaleType_CWD', 0.0),
 ('SaleType_Con', 0.0),
 ('MSSubClass_160', 0.0),
 ('MSSubClass_150', 0.0),
 ('MSSubClass_90', 0.0),
 ('MSSubClass_85', 0.0),
 ('MSSubClass_80', 0.0),
 ('Street_Grvl', 0.0),
 ('MSSubClass_70', 0.0),
 ('Street_Pave', 0.0),
 ('MSSubClass_45', 0.0),
 ('MSSubClass_40', 0.0),
 ('GarageType_2Types', 0.0),
 ('GarageType_CarPort', 0.0),
 ('SaleType_Oth', 0.0),
 ('MSZoning_FV', 0.0),
 ('MSSubClass_180', 0.0),
 ('Foundation_Wood', 0.0),
 ('Utilities_AllPub', 0.0),
 ('HouseStyle_2.5Fin', 0.0),
 ('HouseStyle_1.5Unf', 0.0),
 ('RoofStyle_Mansard', 0.0),
 ('RoofStyle_Gambrel', 0.0),
 ('RoofStyle_Flat', 0.0),
 ('RoofMatl_WdShake', 0.0),
 ('RoofMatl_Roll', 0.0),
 ('RoofMatl_Metal', 0.0),
 ('RoofMatl_Membran', 0.0),
 ('MiscVal', 0.0),
 ('Exterior1st_Stone', 0.0),
 ('Exterior1st_Plywood', 0.0),
 ('Exterior1st_MetalSd', 0.0),
 ('Exterior1st_ImStucc', 0.0),
 ('Exterior1st_CemntBd', 0.0),
 ('Exterior1st_CBlock', 0.0),
 ('Exterior1st_BrkComm', 0.0),
 ('Exterior1st_AsphShn', 0.0),
 ('Exterior2nd_Wd Shng', 0.0),
 ('Exterior2nd_Stone', 0.0),
 ('Exterior2nd_Other', 0.0),
 ('Exterior2nd_CBlock', 0.0),
 ('Exterior2nd_AsphShn', 0.0),
 ('MasVnrType_Stone', 0.0),
 ('BsmtCond_Po', 0.0),
 ('ExterCond_Po', 0.0),
 ('ExterCond_Ex', 0.0),
 ('HouseStyle_2.5Unf', 0.0),
 ('Utilities_NoSeWa', 0.0),
 ('HouseStyle_SFoyer', 0.0),
 ('Condition2_Artery', 0.0),
 ('Functional_Sev', 0.0),
 ('Electrical_FuseP', 0.0),
 ('Electrical_Mix', 0.0),
 ('Neighborhood_SawyerW', 0.0),
 ('Neighborhood_NoRidge', 0.0),
 ('Neighborhood_NWAmes', 0.0),
 ('Neighborhood_NPkVill', 0.0),
 ('HeatingQC_Po', 0.0),
 ('Neighborhood_Mitchel', 0.0),
 ('Neighborhood_MeadowV', 0.0),
 ('Neighborhood_Gilbert', 0.0),
 ('Heating_Floor', 0.0),
 ('Neighborhood_BrDale', 0.0),
 ('Neighborhood_Blueste', 0.0),
 ('Neighborhood_Blmngtn', 0.0),
 ('Condition1_RRNn', 0.0),
 ('Condition1_RRNe', 0.0),
 ('Heating_GasW', 0.0),
 ('Condition1_PosA', 0.0),
 ('Heating_Wall', 0.0),
 ('Condition2_RRNn', 0.0),
 ('Condition2_RRAn', 0.0),
 ('Condition2_RRAe', 0.0),
 ('Condition2_PosN', 0.0),
 ('Condition2_PosA', 0.0),
 ('Condition2_Norm', 0.0),
 ('Condition2_Feedr', 0.0),
 ('BldgType_TwnhsE', 0.0),
 ('Heating_GasA', 0.0)]
OverallQual
  • OverallQual feature_importance最高

17.OverallQual:
Rates the overall material and finish of the house 总体质量:评估房屋的整体材料和装饰

10 Very Excellent
9 Excellent
8 Very Good
7 Good
6 Above Average
5 Average
4 Below Average
3 Fair
2 Poor
1 Very Poor

单变量:
如果某个特征与目标高度相关,那么可以根据具体的情况取这个特征的统计值作为新的特征。

# 计数特征
# 统计单个变量数值次数作为新的特征

data_tmp = data.copy()
new_data = data_tmp.groupby(['OverallQual'])['OverallQual'].count().to_frame().rename(columns={'OverallQual':'OverallQual_count'}).reset_index()
data_tmp = pd.merge(data_tmp, new_data, on=['OverallQual'], how='inner')
print(f"OverallQual_count的唯一数据: {data_tmp['OverallQual_count'].unique()}")
train = data_tmp[data_tmp['SalePrice'].notnull()]
test = data_tmp[data_tmp['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
score,_ = xgb_eval(train) 
if score < last_score:
    print('score:',score)
    last_score = score
    data = data_tmp 

print('********') 
print(last_score)
OverallQual_count的唯一数据: [600 731 342 825 107 226  31  40   4  13]
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.8s remaining:    7.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.3s finished


参数的最佳取值:{'learning_rate': 0.1, 'n_estimators': 100}
最佳模型得分:0.15916946763469966
run time is: 9 秒
********
0.1309254483960009



特征选择

xgboost特征重要性
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.feature_selection import SelectFromModel
def xgb_select_features(data):
    start = datetime.datetime.now()
    
    train_df = copy.deepcopy(data)    
    X=train_df.drop(['SalePrice'],axis=1)
    Y=train_df.loc[:,'SalePrice'].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = np.sqrt(mean_squared_log_error(y_test, y_pred))
    print('score is: ',score)
    
    thresholds = np.sort(model.feature_importances_)
    print(thresholds)
    
    best_score = last_score
    best_thresh = 0
    best_n = 0
    best_selection = 0
    
    for thresh in thresholds:
        # select features using threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        selection_model = XGBRegressor()
        selection_model.fit(select_X_train, y_train)
        
        select_X_test = selection.transform(X_test)
        y_pred = selection_model.predict(select_X_test)
        score = np.sqrt(mean_squared_log_error(y_test, y_pred))
        print(f'thresh = {thresh}, n = {select_X_train.shape[1]}, score = {score}')
        if score < best_score:
            best_score = score
            best_thresh = thresh
            best_n = select_X_train.shape[1]
            best_selection = selection
    print('**********')
    print('best_score',best_score)
    print('best_thresh',best_thresh)
    print('best_n',best_n)
    print('best_selection',best_selection)
    
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return best_score, best_selection
data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
test = data_tmp[data_tmp['SalePrice'].isnull()].drop(['SalePrice'],axis=1)
score, selection = xgb_select_features(train) 
score is:  0.1634269756512917
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.21226992e-06 2.89932450e-06 4.01458919e-06
 4.20125571e-06 4.55453346e-06 9.04142507e-06 9.96066228e-06
 1.00254647e-05 1.19890656e-05 1.24969820e-05 1.33192452e-05
 1.50671058e-05 1.57984032e-05 1.64488301e-05 1.65456859e-05
 1.66506961e-05 1.67086491e-05 1.68370625e-05 1.68880233e-05
 2.24718078e-05 2.45410683e-05 2.50417343e-05 2.65526050e-05
 2.66677962e-05 2.81739158e-05 3.15517100e-05 3.20705658e-05
 3.27122434e-05 3.29928953e-05 3.29968752e-05 3.39528415e-05
 3.73345647e-05 4.07940606e-05 4.34545436e-05 4.40089098e-05
 4.59742078e-05 5.13398518e-05 5.29541248e-05 5.51951016e-05
 5.89233132e-05 5.93937548e-05 6.13122611e-05 6.18340200e-05
 6.73107279e-05 6.82628670e-05 6.88516811e-05 7.01651952e-05
 7.58385213e-05 7.66053636e-05 7.91334314e-05 8.01987990e-05
 8.45546747e-05 8.55193794e-05 8.72424353e-05 9.19281811e-05
 9.60359830e-05 1.03984610e-04 1.07962944e-04 1.08941800e-04
 1.13577124e-04 1.22257770e-04 1.34212489e-04 1.49489177e-04
 1.49857922e-04 1.51855944e-04 1.52076711e-04 1.53393194e-04
 1.59260744e-04 1.67861217e-04 1.76806599e-04 1.80713236e-04
 1.82674266e-04 1.85362616e-04 1.86537334e-04 1.89300932e-04
 1.93503409e-04 2.06448414e-04 2.08053942e-04 2.08536789e-04
 2.09548874e-04 2.10550992e-04 2.15660853e-04 2.23734547e-04
 2.26629345e-04 2.27951896e-04 2.30085352e-04 2.36041815e-04
 2.38105218e-04 2.43984279e-04 2.44529539e-04 2.45253090e-04
 2.50702025e-04 2.56657251e-04 2.64317176e-04 2.67900672e-04
 2.71077937e-04 2.87270523e-04 2.95323494e-04 2.96458253e-04
 3.04090703e-04 3.09348950e-04 3.11128359e-04 3.11715499e-04
 3.27558402e-04 3.29893635e-04 3.30773328e-04 3.39805527e-04
 3.46207613e-04 3.50556540e-04 3.55951575e-04 3.66762193e-04
 3.68454348e-04 3.69566202e-04 3.73052113e-04 4.08736407e-04
 4.34989692e-04 4.35515103e-04 4.49951098e-04 4.62014752e-04
 4.70861385e-04 4.75046923e-04 4.77300986e-04 4.92862833e-04
 4.94032749e-04 5.19220601e-04 5.50425262e-04 5.56649757e-04
 5.63154230e-04 5.79251908e-04 5.94773970e-04 6.21029816e-04
 6.29337505e-04 6.31644973e-04 6.36423938e-04 6.56360935e-04
 6.75858755e-04 6.79097837e-04 7.00469827e-04 7.05525803e-04
 7.35779875e-04 7.47704995e-04 7.81524810e-04 7.98267254e-04
 8.21693102e-04 9.33263858e-04 9.45827691e-04 9.72469221e-04
 9.89662833e-04 9.92853427e-04 1.01081666e-03 1.04294647e-03
 1.04454008e-03 1.04972383e-03 1.05285691e-03 1.05973973e-03
 1.07484148e-03 1.08457590e-03 1.10268581e-03 1.12434081e-03
 1.14494795e-03 1.15139328e-03 1.17061171e-03 1.18661940e-03
 1.24276371e-03 1.29680778e-03 1.31867221e-03 1.44795922e-03
 1.56493811e-03 1.61495444e-03 1.65214005e-03 1.69188995e-03
 1.80765823e-03 1.86003116e-03 1.90357771e-03 1.94085762e-03
 1.96319749e-03 2.10982189e-03 2.21060892e-03 2.53220042e-03
 2.60955282e-03 2.63344147e-03 2.63724546e-03 2.65168841e-03
 2.68658251e-03 2.76857032e-03 2.82136258e-03 2.87449546e-03
 2.99509475e-03 3.02256388e-03 3.12633999e-03 3.13722529e-03
 3.41721484e-03 3.44115961e-03 3.52469017e-03 3.73639143e-03
 3.83520126e-03 3.90244904e-03 4.27996507e-03 4.38119797e-03
 4.49576136e-03 4.64812294e-03 5.13938162e-03 5.47109591e-03
 5.66002959e-03 6.16141642e-03 6.84734341e-03 9.20873415e-03
 1.04834912e-02 1.18552931e-02 1.25425709e-02 1.31127713e-02
 1.38130784e-02 2.41148882e-02 2.83758435e-02 4.22262028e-02
 4.42106090e-02 5.10912351e-02 5.58977053e-02 5.80743290e-02
 6.67559952e-02 3.73314440e-01]
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 0.0, n = 294, score = 0.1634269756512917
thresh = 2.21226991925505e-06, n = 221, score = 0.1634269756512917
thresh = 2.8993244995945133e-06, n = 220, score = 0.1634269756512917
thresh = 4.01458919441211e-06, n = 219, score = 0.1635835783361062
thresh = 4.20125570599339e-06, n = 218, score = 0.1635789199023971
thresh = 4.5545334614871535e-06, n = 217, score = 0.1635813744412464
thresh = 9.041425073519349e-06, n = 216, score = 0.16361114562337073
thresh = 9.960662282537669e-06, n = 215, score = 0.16359462119264123
thresh = 1.0025464689533692e-05, n = 214, score = 0.1635937181987366
thresh = 1.1989065569650847e-05, n = 213, score = 0.16353696589123407
thresh = 1.2496981980802957e-05, n = 212, score = 0.16375139615480133
thresh = 1.331924522673944e-05, n = 211, score = 0.1637449208155299
thresh = 1.5067105778143741e-05, n = 210, score = 0.1637449208155299
thresh = 1.57984031829983e-05, n = 209, score = 0.1642321668826825
thresh = 1.6448830137960613e-05, n = 208, score = 0.16405079653996815
thresh = 1.654568586673122e-05, n = 207, score = 0.1632326678272285
thresh = 1.665069612499792e-05, n = 206, score = 0.16252760604953023
thresh = 1.6708649127394892e-05, n = 205, score = 0.16240455414022192
thresh = 1.6837062503327616e-05, n = 204, score = 0.1633090720960328
thresh = 1.6888023310457356e-05, n = 203, score = 0.16214144060679214
thresh = 2.2471807824331336e-05, n = 202, score = 0.16326741926876243
thresh = 2.454106834193226e-05, n = 201, score = 0.16326741926876243
thresh = 2.5041734261321835e-05, n = 200, score = 0.16309584143571731
thresh = 2.655260504980106e-05, n = 199, score = 0.16310408282125827
thresh = 2.6667796191759408e-05, n = 198, score = 0.16310408282125827
thresh = 2.817391577991657e-05, n = 197, score = 0.16310408282125827
thresh = 3.155170998070389e-05, n = 196, score = 0.16339547132461527
thresh = 3.2070565794128925e-05, n = 195, score = 0.16339956137815564
thresh = 3.271224341006018e-05, n = 194, score = 0.16339956137815564
thresh = 3.299289528513327e-05, n = 193, score = 0.16352218599509166
thresh = 3.299687523394823e-05, n = 192, score = 0.16353020221215883
thresh = 3.395284147700295e-05, n = 191, score = 0.16401581124232292
thresh = 3.733456469490193e-05, n = 190, score = 0.16401581124232292
thresh = 4.0794060623738915e-05, n = 189, score = 0.1633548655544686
thresh = 4.345454362919554e-05, n = 188, score = 0.1625988150476915
thresh = 4.40089097537566e-05, n = 187, score = 0.16251178915083403
thresh = 4.597420775098726e-05, n = 186, score = 0.16273098610625014
thresh = 5.133985177963041e-05, n = 185, score = 0.1623748258496059
thresh = 5.29541248397436e-05, n = 184, score = 0.16306568065478022
thresh = 5.519510159501806e-05, n = 183, score = 0.16306568065478022
thresh = 5.892331319046207e-05, n = 182, score = 0.16306545807539408
thresh = 5.9393754781922325e-05, n = 181, score = 0.1627641191475676
thresh = 6.131226109573618e-05, n = 180, score = 0.16232541907143588
thresh = 6.183402001624927e-05, n = 179, score = 0.16165375976178925
thresh = 6.731072789989412e-05, n = 178, score = 0.1616147201897199
thresh = 6.826286698924378e-05, n = 177, score = 0.16225508227129698
thresh = 6.885168113512918e-05, n = 176, score = 0.16186094233915702
thresh = 7.016519521130249e-05, n = 175, score = 0.16255273603058806
thresh = 7.583852129755542e-05, n = 174, score = 0.16255273603058806
thresh = 7.660536357434466e-05, n = 173, score = 0.16240791832631746
thresh = 7.913343142718077e-05, n = 172, score = 0.16408492230212596
thresh = 8.019879896892235e-05, n = 171, score = 0.16554639191158738
thresh = 8.45546746859327e-05, n = 170, score = 0.16412678725386642
thresh = 8.551937935408205e-05, n = 169, score = 0.16392377325903976
thresh = 8.72424352564849e-05, n = 168, score = 0.16501163023644486
thresh = 9.192818106384948e-05, n = 167, score = 0.1612071772276545
thresh = 9.603598300600424e-05, n = 166, score = 0.16168144378177213
thresh = 0.00010398461017757654, n = 165, score = 0.1608330671877611
thresh = 0.00010796294372994453, n = 164, score = 0.160868133700044
thresh = 0.00010894180013565347, n = 163, score = 0.16192426505369828
thresh = 0.00011357712355675176, n = 162, score = 0.16155774994131725
thresh = 0.00012225777027197182, n = 161, score = 0.162509940499741
thresh = 0.00013421248877421021, n = 160, score = 0.16242071573314554
thresh = 0.0001494891766924411, n = 159, score = 0.1643892302937634
thresh = 0.00014985792222432792, n = 158, score = 0.1643892302937634
thresh = 0.00015185594384092838, n = 157, score = 0.1638682109182789
thresh = 0.00015207671094685793, n = 156, score = 0.16469652920000286
thresh = 0.00015339319361373782, n = 155, score = 0.16305642519498012
thresh = 0.00015926074411254376, n = 154, score = 0.16283903028472813
thresh = 0.00016786121705081314, n = 153, score = 0.16314592703845424
thresh = 0.00017680659948382527, n = 152, score = 0.1643053028990939
thresh = 0.0001807132357498631, n = 151, score = 0.16356331612515632
thresh = 0.00018267426639795303, n = 150, score = 0.16434283180710632
thresh = 0.00018536261632107198, n = 149, score = 0.16434283180710632
thresh = 0.00018653733422979712, n = 148, score = 0.1653295125571202
thresh = 0.00018930093210656196, n = 147, score = 0.1625449962063003
thresh = 0.00019350340880919248, n = 146, score = 0.16299693193547804
thresh = 0.0002064484142465517, n = 145, score = 0.16361440676890054
thresh = 0.00020805394160561264, n = 144, score = 0.16368977688955758
thresh = 0.00020853678870480508, n = 143, score = 0.16255519616685787
thresh = 0.000209548874408938, n = 142, score = 0.1641198542763368
thresh = 0.00021055099205113947, n = 141, score = 0.16325382141903572
thresh = 0.0002156608534278348, n = 140, score = 0.16325382141903572
thresh = 0.0002237345470348373, n = 139, score = 0.16173847547448797
thresh = 0.0002266293449793011, n = 138, score = 0.1623182921433018
thresh = 0.00022795189579483122, n = 137, score = 0.16233337429418535
thresh = 0.00023008535208646208, n = 136, score = 0.16159519657109647
thresh = 0.00023604181478731334, n = 135, score = 0.163579003818478
thresh = 0.00023810521815903485, n = 134, score = 0.16061957410222374
thresh = 0.00024398427922278643, n = 133, score = 0.16596828868389774
thresh = 0.00024452953948639333, n = 132, score = 0.1657755657445966
thresh = 0.0002452530898153782, n = 131, score = 0.16395945945497026
thresh = 0.0002507020253688097, n = 130, score = 0.16450701322995906
thresh = 0.00025665725115686655, n = 129, score = 0.16367981639020554
thresh = 0.00026431717560626566, n = 128, score = 0.16473021810564228
thresh = 0.00026790067204274237, n = 127, score = 0.16396632144606818
thresh = 0.000271077937213704, n = 126, score = 0.16396632144606818
thresh = 0.0002872705226764083, n = 125, score = 0.16354736923901833
thresh = 0.0002953234943561256, n = 124, score = 0.1652204162014962
thresh = 0.00029645825270563364, n = 123, score = 0.16373418031526485
thresh = 0.0003040907031390816, n = 122, score = 0.16374166529757242
thresh = 0.0003093489503953606, n = 121, score = 0.16578630155126245
thresh = 0.0003111283585894853, n = 120, score = 0.16599253432773428
thresh = 0.00031171549926511943, n = 119, score = 0.1652697314394485
thresh = 0.0003275584022048861, n = 118, score = 0.1649785715129932
thresh = 0.0003298936353530735, n = 117, score = 0.16558838159126518
thresh = 0.0003307733277324587, n = 116, score = 0.16380566984753903
thresh = 0.0003398055268917233, n = 115, score = 0.16559102459099373
thresh = 0.0003462076128926128, n = 114, score = 0.16293950390118359
thresh = 0.0003505565400701016, n = 113, score = 0.16566089599854528
thresh = 0.00035595157532952726, n = 112, score = 0.1632106660019096
thresh = 0.000366762193152681, n = 111, score = 0.16305305978066092
thresh = 0.0003684543480630964, n = 110, score = 0.16447802792574603
thresh = 0.000369566201698035, n = 109, score = 0.16509355390365787
thresh = 0.0003730521129909903, n = 108, score = 0.16411459468982148
thresh = 0.00040873640682548285, n = 107, score = 0.16374926836482503
thresh = 0.0004349896917119622, n = 106, score = 0.16367785169581325
thresh = 0.00043551510316319764, n = 105, score = 0.1636866731052735
thresh = 0.0004499510978348553, n = 104, score = 0.1616459879131297
thresh = 0.00046201475197449327, n = 103, score = 0.16396574130177524
thresh = 0.0004708613851107657, n = 102, score = 0.1635921707647243
thresh = 0.0004750469233840704, n = 101, score = 0.1635921707647243
thresh = 0.00047730098594911397, n = 100, score = 0.16399328347802072
thresh = 0.0004928628331981599, n = 99, score = 0.16564605568682592
thresh = 0.0004940327489748597, n = 98, score = 0.16325210406804963
thresh = 0.000519220600835979, n = 97, score = 0.16261771100132597
thresh = 0.0005504252621904016, n = 96, score = 0.1634582229497553
thresh = 0.0005566497566178441, n = 95, score = 0.16179959210227152
thresh = 0.0005631542298942804, n = 94, score = 0.16451741441940848
thresh = 0.0005792519077658653, n = 93, score = 0.16380661635734156
thresh = 0.0005947739700786769, n = 92, score = 0.16174063443377779
thresh = 0.0006210298161022365, n = 91, score = 0.16405510686507763
thresh = 0.0006293375045061111, n = 90, score = 0.16201802505482488
thresh = 0.0006316449726000428, n = 89, score = 0.16227768314190222
thresh = 0.0006364239379763603, n = 88, score = 0.1613357753838816
thresh = 0.0006563609349541366, n = 87, score = 0.15489379676447132
thresh = 0.0006758587551303208, n = 86, score = 0.15996636687018856
thresh = 0.0006790978368371725, n = 85, score = 0.16409704605134612
thresh = 0.0007004698272794485, n = 84, score = 0.1611161735435722
thresh = 0.0007055258029140532, n = 83, score = 0.15680771378798877
thresh = 0.0007357798749580979, n = 82, score = 0.15924861727829498
thresh = 0.0007477049948647618, n = 81, score = 0.15728230052812234
thresh = 0.0007815248100087047, n = 80, score = 0.15719261836399
thresh = 0.000798267254140228, n = 79, score = 0.16172815389738718
thresh = 0.0008216931018978357, n = 78, score = 0.15949682699106826
thresh = 0.0009332638583146036, n = 77, score = 0.16160607772821436
thresh = 0.0009458276908844709, n = 76, score = 0.16001279746476574
thresh = 0.0009724692208692431, n = 75, score = 0.16216081247759648
thresh = 0.0009896628325805068, n = 74, score = 0.16189270681592866
thresh = 0.0009928534273058176, n = 73, score = 0.16246173173728296
thresh = 0.0010108166607096791, n = 72, score = 0.1592728379608147
thresh = 0.0010429464746266603, n = 71, score = 0.15917095371048154
thresh = 0.0010445400839671493, n = 70, score = 0.15930647656578448
thresh = 0.001049723825417459, n = 69, score = 0.15966212326617918
thresh = 0.0010528569109737873, n = 68, score = 0.16189855076065846
thresh = 0.0010597397340461612, n = 67, score = 0.16183779727966732
thresh = 0.0010748414788395166, n = 66, score = 0.16183779727966732
thresh = 0.0010845758952200413, n = 65, score = 0.16008020907150441
thresh = 0.0011026858119294047, n = 64, score = 0.16718195491600213
thresh = 0.001124340808019042, n = 63, score = 0.16039879801638024
thresh = 0.001144947949796915, n = 62, score = 0.1611256611827503
thresh = 0.0011513932840898633, n = 61, score = 0.1611256611827503
thresh = 0.0011706117074936628, n = 60, score = 0.16379329884078375
thresh = 0.0011866193963214755, n = 59, score = 0.16408328930169308
thresh = 0.0012427637120708823, n = 58, score = 0.16537973221694566
thresh = 0.0012968077789992094, n = 57, score = 0.16442913579291546
thresh = 0.0013186722062528133, n = 56, score = 0.16507222814151795
thresh = 0.0014479592209681869, n = 55, score = 0.16703247619403705
thresh = 0.0015649381093680859, n = 54, score = 0.16808817122288616
thresh = 0.0016149544389918447, n = 53, score = 0.16466868303634474
thresh = 0.0016521400539204478, n = 52, score = 0.16458397126306654
thresh = 0.001691889949142933, n = 51, score = 0.1666959750184447
thresh = 0.0018076582346111536, n = 50, score = 0.16842319933852176
thresh = 0.001860031159594655, n = 49, score = 0.1694556190927747
thresh = 0.001903577707707882, n = 48, score = 0.16781088785246723
thresh = 0.001940857619047165, n = 47, score = 0.17114455865700695
thresh = 0.00196319748647511, n = 46, score = 0.16559975270137217
thresh = 0.0021098218858242035, n = 45, score = 0.16636048839832915
thresh = 0.0022106089163571596, n = 44, score = 0.16228946266195293
thresh = 0.0025322004221379757, n = 43, score = 0.16706401646520208
thresh = 0.002609552815556526, n = 42, score = 0.16519764397156145
thresh = 0.002633441472426057, n = 41, score = 0.16357918623510773
thresh = 0.002637245459482074, n = 40, score = 0.16505931778783206
thresh = 0.0026516884099692106, n = 39, score = 0.16802911621134767
thresh = 0.0026865825057029724, n = 38, score = 0.16695704165547676
thresh = 0.002768570324406028, n = 37, score = 0.1676591145615499
thresh = 0.00282136257737875, n = 36, score = 0.16975460824029226
thresh = 0.0028744954615831375, n = 35, score = 0.16330457796774298
thresh = 0.0029950947500765324, n = 34, score = 0.17694671133982054
thresh = 0.0030225638765841722, n = 33, score = 0.17648102502023405
thresh = 0.0031263399869203568, n = 32, score = 0.18014432945641684
thresh = 0.0031372252851724625, n = 31, score = 0.17931339793706613
thresh = 0.0034172148443758488, n = 30, score = 0.17955741533055633
thresh = 0.0034411596134305, n = 29, score = 0.17655572295901728
thresh = 0.003524690167978406, n = 28, score = 0.17824446649492398
thresh = 0.003736391430720687, n = 27, score = 0.17675593365353548
thresh = 0.0038352012634277344, n = 26, score = 0.17828571170297894
thresh = 0.0039024490397423506, n = 25, score = 0.1710506549793859
thresh = 0.004279965069144964, n = 24, score = 0.1747581895578411
thresh = 0.0043811979703605175, n = 23, score = 0.17549665567733588
thresh = 0.0044957613572478294, n = 22, score = 0.17530776404500276
thresh = 0.004648122936487198, n = 21, score = 0.17766274634743445
thresh = 0.0051393816247582436, n = 20, score = 0.17815268262237638
thresh = 0.005471095908433199, n = 19, score = 0.17804171380249728
thresh = 0.005660029593855143, n = 18, score = 0.17956257680284474
thresh = 0.006161416415125132, n = 17, score = 0.18127994476276035
thresh = 0.006847343407571316, n = 16, score = 0.18390724420240004
thresh = 0.009208734147250652, n = 15, score = 0.18680562071335757
thresh = 0.010483491234481335, n = 14, score = 0.18684297119149312
thresh = 0.011855293065309525, n = 13, score = 0.18591620833137226
thresh = 0.012542570941150188, n = 12, score = 0.18655437837361516
thresh = 0.013112771324813366, n = 11, score = 0.187595422816317
thresh = 0.0138130784034729, n = 10, score = 0.18590297656149943
thresh = 0.024114888161420822, n = 9, score = 0.20589942195000205
thresh = 0.028375843539834023, n = 8, score = 0.202828827635469
thresh = 0.04222620278596878, n = 7, score = 0.21003656282444372
thresh = 0.044210609048604965, n = 6, score = 0.2185621315162222
thresh = 0.051091235131025314, n = 5, score = 0.2191857107199869
thresh = 0.055897705256938934, n = 4, score = 0.21340924911246614
thresh = 0.05807432904839516, n = 3, score = 0.213585922331518
thresh = 0.0667559951543808, n = 2, score = 0.22994426940573962
thresh = 0.37331444025039673, n = 1, score = 0.23095892562782216
**********
best_score 0.1309254483960009
best_thresh 0
best_n 0
best_selection 0
run time is: 262 秒
计算特征与目标的相关系数以及P值
# 相关系数——特征与目标变量
from sklearn.feature_selection import SelectKBest
from scipy.stats import pearsonr
def xgb_eval2(X, Y):
    start = datetime.datetime.now()
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    model = xgb.XGBRegressor(
        n_jobs=-1,
        random_state=666
    )
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        model, 
        param_grid=param_grid, 
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)
     
    print('best param is: ', gridsearch.best_params_)
    model = gridsearch.best_estimator_
    y_pred = model.predict(X_test)
    score = np.sqrt(mean_squared_log_error(y_test, y_pred))
    print('score is: ',score)
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return score, model


data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
X=train.drop(['SalePrice'],axis=1)
Y=train.loc[:,'SalePrice'].values

fun = lambda X, Y: tuple(map(tuple, np.array(list(map(lambda x: pearsonr(x, Y), X.T))).T))
for ki in range(1, len(train.columns.tolist())):
    sb = SelectKBest(fun, k=ki)
    x_fit = sb.fit(X, Y)
    x_sb = x_fit.transform(X)
    X_newcolumnsname = train.iloc[:, x_fit.get_support(indices=True)].columns.tolist()
    #     print('>>>检验统计值(相关系数):\n', sb.scores_)
    #     print('\n>>>P值:\n', sb.pvalues_)
    score,_ = xgb_eval2(x_sb, Y) 
    if score < last_score:
        print('score: ',score)
        print('X_newcolumnsname: ',X_newcolumnsname)
        print(f'特征个数{len(X_newcolumnsname)}')
        last_score = score
        X_newcolumnsname.append('SalePrice')
        data = data_tmp[X_newcolumnsname]       

print('********') 
print(last_score)
    
    
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


best param is:  {'learning_rate': 0.1, 'n_estimators': 100}
score is:  0.23095657084432536
run time is: 0 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.1s finished


best param is:  {'learning_rate': 0.1, 'n_estimators': 100}
score is:  0.22204959452117293
run time is: 0 秒
Fitting 5 folds for each of 1 candidates, totalling 5 fits

… …

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    3.4s remaining:    5.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.3s finished


best param is:  {'learning_rate': 0.1, 'n_estimators': 100}
score is:  0.1509751277940482
run time is: 6 秒
********
0.1309254483960009
len(data.columns)
295

train = data[data['SalePrice'].notnull()]
test = data[data['SalePrice'].isnull()]
# data.to_csv('data__after_feature_engineering.csv',index=False)
# train.to_csv('train_after_feature_engineering.csv',index=False)
# test.to_csv('test_after_feature_engineering.csv',index=False)

模型调优

确定一个固定的学习率(学习率越小,时间成本越高):

learning_rate的取值范围一般在[0.01,0.3]之间

如果时间充裕,可以把learning_rate设置的更小;

如果时间紧张,可以把learning_rate设置的更大;

1)调试n_estimators;

2)调试min_child_weight以及max_depth;

3)调试gamma;

4)调试subsample、colsample_bytree;

5)调试正则化参数:reg_alpha、reg_lambda;

# 自定义scoring
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
from sklearn.metrics import make_scorer
def xgb_eval(data, cv_params, other_params):
    def my_error_func(y_ture, y_pred):
        error = np.sqrt(mean_squared_log_error(y_ture, y_pred))
        return error
    my_score = make_scorer(my_error_func, greater_is_better=False)
    
    start = datetime.datetime.now()
    
    train_df = copy.deepcopy(data)    
    X_train=train_df.drop(['SalePrice'],axis=1)
    y_train=train_df.loc[:,'SalePrice'].values
    
#     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=666)

    xgb_reg = xgb.XGBRegressor(**other_params)
    
    param_grid = {
        'learning_rate': [0.1],
        'n_estimators': [100]
    }
    gridsearch = GridSearchCV(
        estimator=xgb_reg, 
        param_grid=cv_params, 
        scoring=my_score,
        cv=5, 
        verbose=3, 
        n_jobs=-1
    )
    gridsearch.fit(X_train, y_train)
    print('参数的最佳取值:{0}'.format(gridsearch.best_params_))
    print('最佳模型得分:{0}'.format(-gridsearch.best_score_))
    
    end = datetime.datetime.now()
    print('run time is:',(end-start).seconds,'秒')
    
    return -gridsearch.best_score_, gridsearch.cv_results_

调n_estimators

## 粗调
#粗调
cv_params = {
    'n_estimators': np.arange(10, 1500, 50),
}
other_params = {
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'min_child_weight': 1, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
plt.plot(np.arange(10, 1500, 50), cv_results['mean_test_score'])
plt.show()
Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 16.2min finished


参数的最佳取值:{'n_estimators': 460}
最佳模型得分:0.1288088360602727
run time is: 977 秒

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-NUHSubOP-1615804774093)(output_273_3.png)]

# 参数的最佳取值:{'n_estimators': 460}
# 最佳模型得分:0.1288088360602727
# 细调
cv_params = {
    'n_estimators': np.arange(460-50, 460+50,1),
}
other_params = {
    'learning_rate': 0.1, 
    'max_depth': 5, 
    'min_child_weight': 1, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}
data_tmp = data.copy()
train = data_tmp[data_tmp['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
plt.plot(np.arange(460-50, 460+50,1), cv_results['mean_test_score'])
plt.show()
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 31.1min finished


参数的最佳取值:{'n_estimators': 452}
最佳模型得分:0.12878427962473019
run time is: 1870 秒

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IBbJsmb6-1615804774096)(output_275_3.png)]

调max_depth, min_child_weight

cv_params = {
    'max_depth':range(1,10,1),
    'min_child_weight':range(1,10,1)
}
other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 5, 
    'min_child_weight': 1, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}
train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 24.3min finished


参数的最佳取值:{'max_depth': 4, 'min_child_weight': 4}
最佳模型得分:0.12662196605686601
run time is: 1460 秒

调gamma

cv_params = {
    'gamma':[i/10.0 for i in range(0,5)]
}

other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.4min finished


参数的最佳取值:{'gamma': 0.0}
最佳模型得分:0.12662196605686601
run time is: 89 秒

调优subsample 和 colsample_bytree

cv_params = {
    'subsample':[i/10.0 for i in range(4,10)],
    'colsample_bytree':[i/10.0 for i in range(4,10)]
}
other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.8, 
    'colsample_bytree': 0.8, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   38.8s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  7.3min finished


参数的最佳取值:{'colsample_bytree': 0.6, 'subsample': 0.9}
最佳模型得分:0.1248652563014746
run time is: 439 秒

调优正则化参数

cv_params = {
    'reg_alpha': [0.05, 0.1, 1, 2, 3, 10, 50, 100], 
    'reg_lambda': [0.05, 0.1, 1, 2, 3]
}
other_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.9, 
    'colsample_bytree': 0.6, 
    'gamma': 0, 
    'reg_alpha': 0, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}

train = data[data['SalePrice'].notnull()]
score, cv_results = xgb_eval(train, cv_params, other_params)
Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   52.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  7.8min finished


参数的最佳取值:{'reg_alpha': 1, 'reg_lambda': 1}
最佳模型得分:0.1246989760441339
run time is: 471 秒
##至此,XGBoost模型调参结束,确定了最优参数和模型:
best_params = {
    'learning_rate': 0.1, 
    'n_estimators':452,
    'max_depth': 4, 
    'min_child_weight': 4, 
    'subsample': 0.9, 
    'colsample_bytree': 0.6, 
    'gamma': 0, 
    'reg_alpha': 1, 
    'reg_lambda': 1,
    'n_jobs':-1,
    'random_state':666
}
best_xgb = xgb.XGBRegressor(**best_params)

X_train = data[data['SalePrice'].notnull()].drop(['SalePrice'],axis=1)
Y_train = data[data['SalePrice'].notnull()].loc[:,'SalePrice'].values
X_test = data[data['SalePrice'].isnull()].drop(['SalePrice'], axis=1)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
(1460, 294)
(1460,)
(1459, 294)

模型集成(融合)

基模型xgboost,使用bagging进行融合

from sklearn.ensemble import BaggingRegressor
def my_error_func(y_ture, y_pred):
    error = np.sqrt(mean_squared_log_error(y_ture, y_pred))
    return error
my_score = make_scorer(my_error_func, greater_is_better=False)
regr = BaggingRegressor(
    base_estimator = best_xgb,
    n_jobs = -1,
    random_state = 66
)
model = GridSearchCV(
        estimator=regr, 
        param_grid={'n_estimators':np.arange(1,20,1)}, 
        scoring=my_score, 
        cv=5, 
        verbose=3, 
        n_jobs=-1
)
model.fit(X_train, Y_train)
print('参数的最佳取值:{0}'.format(model.best_params_))
print('最佳模型得分:{0}'.format(-model.best_score_))
Fitting 5 folds for each of 19 candidates, totalling 95 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  95 out of  95 | elapsed: 38.4min finished


参数的最佳取值:{'n_estimators': 19}
最佳模型得分:0.12425136280271949
model.best_estimator_
BaggingRegressor(base_estimator=XGBRegressor(base_score=None, booster=None,
                                             colsample_bylevel=None,
                                             colsample_bynode=None,
                                             colsample_bytree=0.6, gamma=0,
                                             gpu_id=None,
                                             importance_type='gain',
                                             interaction_constraints=None,
                                             learning_rate=0.1,
                                             max_delta_step=None, max_depth=4,
                                             min_child_weight=4, missing=nan,
                                             monotone_constraints=None,
                                             n_estimators=452, n_jobs=-1,
                                             num_parallel_tree=None,
                                             random_state=666, reg_alpha=1,
                                             reg_lambda=1,
                                             scale_pos_weight=None,
                                             subsample=0.9, tree_method=None,
                                             validate_parameters=None,
                                             verbosity=None),
                 n_estimators=19, n_jobs=-1, random_state=66)
y_pred = model.best_estimator_.predict(X_test)
y_pred
array([125902.945, 163661.55 , 183499.3  , ..., 162691.47 , 113986.79 ,
       220476.8  ], dtype=float32)
y_df = pd.DataFrame(data=y_pred,columns=['SalePrice']) 
y_df.head()
SalePrice
0125902.945312
1163661.546875
2183499.296875
3191795.031250
4186873.843750
submission = pd.concat([X_test['Id'],y_df],axis=1)
submission.head()
IdSalePrice
01461125902.945312
11462163661.546875
21463183499.296875
31464191795.031250
41465186873.843750
submission.to_csv('submission.csv',index=False)
# from sklearn.metrics import mean_squared_log_error

# RMSLE = np.sqrt( mean_squared_log_error(y_true, y_pred) )

# print("The score is %.5f" % RMSLE )
# train = data[data['SalePrice'].notnull()]
# test = data[data['SalePrice'].isnull()]
train.to_csv('./xgb1/train_after_fe.csv',index=False)
test.to_csv('./xgb1/test_after_fe.csv',index=False)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值