import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import warnings
from sklearn. ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn. model_selection import cross_val_score, GridSearchCV
# Show every column when a DataFrame is displayed (no column truncation).
pd. options. display. max_columns = None
# Silence all warnings for the rest of the session.
warnings. filterwarnings( 'ignore' )
# IPython magic: render matplotlib figures inline in the notebook.
% matplotlib inline
# Read the raw, space-separated competition files. The untouched frames are
# kept under df_train / df_test; train / test are the working copies.
df_train = pd.read_csv('datalab/231784/used_car_train_20200313.csv', sep=' ')
df_test = pd.read_csv('datalab/231784/used_car_testA_20200313.csv', sep=' ')
# SaleID is removed from the working copies only.
train, test = (raw.drop(columns=['SaleID']) for raw in (df_train, df_test))
1. 数据初瞥
# Preview the first rows of the training set.
train. head( )
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage regionCode seller offerType creatDate price v_0 v_1 v_2 v_3 v_4 v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 0.0 1046 0 0 20160404 1850 43.357796 3.966344 0.050257 2.159744 1.143786 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 - 4366 0 0 20160309 3600 45.305273 5.236112 0.137925 1.380657 -1.422165 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 0.0 2806 0 0 20160402 6222 45.978359 4.823792 1.319524 -0.998467 -0.996911 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 0.0 434 0 0 20160312 2400 45.687478 4.492574 -0.050616 0.883600 -2.228079 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 0.0 6977 0 0 20160313 5200 44.383511 2.031433 0.572169 -1.571239 2.246088 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482
# Preview the first rows of the test set (same columns, without price).
test. head( )
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage regionCode seller offerType creatDate v_0 v_1 v_2 v_3 v_4 v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14 0 66932 20111212 222.0 4 5.0 1.0 1.0 313 15.0 0.0 1440 0 0 20160329 49.593127 5.246568 1.001130 -4.122264 0.737532 0.264405 0.121800 0.070899 0.106558 0.078867 -7.050969 -0.854626 4.800151 0.620011 -3.664654 1 174960 19990211 19.0 21 0.0 0.0 0.0 75 12.5 1.0 5419 0 0 20160404 42.395926 -3.253950 -1.753754 3.646605 -0.725597 0.261745 0.000000 0.096733 0.013705 0.052383 3.679418 -0.729039 -3.796107 -1.541230 -0.757055 2 5356 20090304 82.0 21 0.0 0.0 0.0 109 7.0 0.0 5045 0 0 20160308 45.841370 4.704178 0.155391 -1.118443 -0.229160 0.260216 0.112081 0.078082 0.062078 0.050540 -4.926690 1.001106 0.826562 0.138226 0.754033 3 50688 20100405 0.0 0 0.0 0.0 1.0 160 7.0 0.0 4023 0 0 20160325 46.440649 4.319155 0.428897 -2.037916 -0.234757 0.260466 0.106727 0.081146 0.075971 0.048268 -4.864637 0.505493 1.870379 0.366038 1.312775 4 161428 19970703 26.0 14 2.0 0.0 0.0 75 15.0 0.0 3103 0 0 20160309 42.184604 -3.166234 -1.572058 2.604143 0.387498 0.250999 0.000000 0.077806 0.028600 0.081709 3.616475 -0.673236 -3.197685 -0.025678 -0.101290
# Column dtypes and non-null counts of the training set.
train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 30 columns):
name 150000 non-null int64
regDate 150000 non-null int64
model 149999 non-null float64
brand 150000 non-null int64
bodyType 145494 non-null float64
fuelType 141320 non-null float64
gearbox 144019 non-null float64
power 150000 non-null int64
kilometer 150000 non-null float64
notRepairedDamage 150000 non-null object
regionCode 150000 non-null int64
seller 150000 non-null int64
offerType 150000 non-null int64
creatDate 150000 non-null int64
price 150000 non-null int64
v_0 150000 non-null float64
v_1 150000 non-null float64
v_2 150000 non-null float64
v_3 150000 non-null float64
v_4 150000 non-null float64
v_5 150000 non-null float64
v_6 150000 non-null float64
v_7 150000 non-null float64
v_8 150000 non-null float64
v_9 150000 non-null float64
v_10 150000 non-null float64
v_11 150000 non-null float64
v_12 150000 non-null float64
v_13 150000 non-null float64
v_14 150000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 34.3+ MB
# Column dtypes and non-null counts of the test set.
test. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 29 columns):
name 50000 non-null int64
regDate 50000 non-null int64
model 50000 non-null float64
brand 50000 non-null int64
bodyType 48587 non-null float64
fuelType 47107 non-null float64
gearbox 48090 non-null float64
power 50000 non-null int64
kilometer 50000 non-null float64
notRepairedDamage 50000 non-null object
regionCode 50000 non-null int64
seller 50000 non-null int64
offerType 50000 non-null int64
creatDate 50000 non-null int64
v_0 50000 non-null float64
v_1 50000 non-null float64
v_2 50000 non-null float64
v_3 50000 non-null float64
v_4 50000 non-null float64
v_5 50000 non-null float64
v_6 50000 non-null float64
v_7 50000 non-null float64
v_8 50000 non-null float64
v_9 50000 non-null float64
v_10 50000 non-null float64
v_11 50000 non-null float64
v_12 50000 non-null float64
v_13 50000 non-null float64
v_14 50000 non-null float64
dtypes: float64(20), int64(8), object(1)
memory usage: 11.1+ MB
1.1 ‘notRepairedDamage’列是唯一的非数值型特征,取值只有 0、1 或 ‘-’,应将 ‘-’ 替换为空值,并把该列转换为数值类型
# notRepairedDamage is read as strings because of the '-' placeholder.
# Turn the placeholder into NaN, then cast the column to float64.
for frame in (train, test):
    cleaned = frame['notRepairedDamage'].replace('-', np.nan)
    frame['notRepairedDamage'] = cleaned.astype('float64')
# Show the distinct values left in each set.
train['notRepairedDamage'].unique(), test['notRepairedDamage'].unique()
(array([ 0., nan, 1.]), array([ 0., 1., nan]))
# Summary statistics of the test-set columns.
test. describe( )
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage regionCode seller offerType creatDate v_0 v_1 v_2 v_3 v_4 v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14 count 50000.000000 5.000000e+04 50000.000000 50000.000000 48587.000000 47107.000000 48090.000000 50000.000000 50000.000000 41969.000000 50000.000000 50000.0 50000.0 5.000000e+04 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 50000.000000 mean 68542.223280 2.003393e+07 46.844520 8.056240 1.782185 0.373405 0.224350 119.883620 12.595580 0.112464 2590.604820 0.0 0.0 2.016033e+07 44.418233 -0.037238 0.050534 0.084640 0.015001 0.248669 0.045021 0.122744 0.057997 0.062000 -0.017855 -0.013742 -0.013554 -0.003147 0.001516 std 61052.808133 5.368870e+04 49.469548 7.819477 1.760736 0.546442 0.417158 185.097387 3.908979 0.315940 1876.970263 0.0 0.0 7.951521e+01 2.429950 3.642562 2.856341 2.026510 1.193026 0.044601 0.051766 0.195972 0.029211 0.035653 3.747985 3.231258 2.515962 1.286597 1.027360 min 0.000000 1.991000e+07 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 0.000000 0.000000 0.0 0.0 2.015061e+07 28.987024 -4.137733 -4.205728 -5.638184 -4.287718 0.000000 0.000000 0.000000 0.000000 0.000000 -9.160049 -5.411964 -8.916949 -4.123333 -6.112667 25% 11203.500000 1.999091e+07 10.000000 1.000000 0.000000 0.000000 0.000000 75.000000 12.500000 0.000000 1030.000000 0.0 0.0 2.016031e+07 43.139621 -3.191909 -0.971266 -1.453453 -0.928089 0.243762 0.000044 0.062644 0.035084 0.033714 -3.700121 -1.971325 -1.876703 -1.060428 -0.437920 50% 52248.500000 2.003091e+07 29.000000 6.000000 1.000000 0.000000 0.000000 109.000000 15.000000 0.000000 2219.000000 0.0 0.0 2.016032e+07 44.611084 -3.050756 -0.388117 0.097881 -0.070225 0.257877 0.000815 0.095828 0.057084 0.058764 1.613212 -0.355843 -0.142779 -0.035956 0.138799 75% 118856.500000 2.007110e+07 65.000000 
13.000000 3.000000 1.000000 0.000000 150.000000 15.000000 0.000000 3857.000000 0.0 0.0 2.016033e+07 45.992639 3.997323 0.240548 1.562700 0.863731 0.265328 0.102025 0.125438 0.079077 0.087489 2.832708 1.262914 1.764335 0.941469 0.681163 max 196805.000000 2.015121e+07 246.000000 39.000000 7.000000 6.000000 1.000000 20000.000000 15.000000 1.000000 8121.000000 0.0 0.0 2.016041e+07 51.751684 7.553517 18.394570 9.381599 5.270150 0.291618 0.153265 1.358813 0.156355 0.214775 12.338872 18.856218 12.950498 5.913273 2.624622
# Summary statistics of the training-set columns.
train. describe( )
name regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage regionCode seller offerType creatDate price v_0 v_1 v_2 v_3 v_4 v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14 count 150000.000000 1.500000e+05 149999.000000 150000.000000 145494.000000 141320.000000 144019.000000 150000.000000 150000.000000 125676.000000 150000.000000 150000.000000 150000.0 1.500000e+05 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 150000.000000 mean 68349.172873 2.003417e+07 47.129021 8.052733 1.792369 0.375842 0.224943 119.316547 12.597160 0.113904 2583.077267 0.000007 0.0 2.016033e+07 5923.327333 44.406268 -0.044809 0.080765 0.078833 0.017875 0.248204 0.044923 0.124692 0.058144 0.061996 -0.001000 0.009035 0.004813 0.000313 -0.000688 std 61103.875095 5.364988e+04 49.536040 7.864956 1.760640 0.548677 0.417546 177.168419 3.919576 0.317696 1885.363218 0.002582 0.0 1.067328e+02 7501.998477 2.457548 3.641893 2.929618 2.026514 1.193661 0.045804 0.051743 0.201410 0.029186 0.035692 3.772386 3.286071 2.517478 1.288988 1.038685 min 0.000000 1.991000e+07 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 0.000000 0.000000 0.000000 0.0 2.015062e+07 11.000000 30.451976 -4.295589 -4.470671 -7.275037 -4.364565 0.000000 0.000000 0.000000 0.000000 0.000000 -9.168192 -5.558207 -9.639552 -4.153899 -6.546556 25% 11156.000000 1.999091e+07 10.000000 1.000000 0.000000 0.000000 0.000000 75.000000 12.500000 0.000000 1018.000000 0.000000 0.0 2.016031e+07 1300.000000 43.135799 -3.192349 -0.970671 -1.462580 -0.921191 0.243615 0.000038 0.062474 0.035334 0.033930 -3.722303 -1.951543 -1.871846 -1.057789 -0.437034 50% 51638.000000 2.003091e+07 30.000000 6.000000 1.000000 0.000000 0.000000 110.000000 15.000000 0.000000 2196.000000 0.000000 0.0 2.016032e+07 3250.000000 44.610266 -3.052671 -0.382947 0.099722 -0.075910 
0.257798 0.000812 0.095866 0.057014 0.058484 1.624076 -0.358053 -0.130753 -0.036245 0.141246 75% 118841.250000 2.007111e+07 66.000000 13.000000 3.000000 1.000000 0.000000 150.000000 15.000000 0.000000 3843.000000 0.000000 0.0 2.016033e+07 7700.000000 46.004721 4.000670 0.241335 1.565838 0.868758 0.265297 0.102009 0.125243 0.079382 0.087491 2.844357 1.255022 1.776933 0.942813 0.680378 max 196812.000000 2.015121e+07 247.000000 39.000000 7.000000 6.000000 1.000000 19312.000000 15.000000 1.000000 8120.000000 1.000000 0.0 2.016041e+07 99999.000000 52.304178 7.320308 19.035496 9.854702 6.829352 0.291838 0.151420 1.404936 0.160791 0.222787 12.357011 18.819042 13.847792 11.147669 8.658418
1.2 发现seller特征在训练集和测试集中偏斜极其严重,对预测没有帮助,删去
# Remove the seller column from both working frames, in place.
for frame in (train, test):
    frame.drop(columns=['seller'], inplace=True)
1.3 意外发现两个数据集的offerType列全为0,删去。
# Rebind train / test to copies without the offerType column.
train, test = (frame.drop(columns=['offerType']) for frame in (train, test))
# Show the resulting shapes.
train.shape, test.shape
((150000, 28), (50000, 27))
2. 探索性数据分析
2.1 用图表展示各特征与售价之间的数量关系(事实证明该图表的绘制非常耗时)
2.2 由于数据量过大,受性能限制很难用可视化工具展示数据分布的特征。因地制宜,选用函数及赛题数据描述来完成探索性数据分析
赛题数据描述讲到,power 的取值范围为 [0, 600],然而训练集和测试集中都存在 power 大于 600 的记录,数量如下:
# Number of training rows whose power exceeds 600.
train.loc[train['power'] > 600, 'power'].count()
143
# Number of test rows whose power exceeds 600.
test.loc[test['power'] > 600, 'power'].count()
70
2.3 现在,特征工程能做的只是填充缺失值以及删除某些特征。在开始之前,先看看线性相关系数
# Correlation of every column with price (DataFrame.corr), sorted from the
# strongest positive to the strongest negative.
train. corr( ) . unstack( ) [ 'price' ] . sort_values( ascending= False )
price 1.000000
v_12 0.692823
v_8 0.685798
v_0 0.628397
regDate 0.611959
gearbox 0.329075
bodyType 0.241303
power 0.219834
fuelType 0.200536
v_5 0.164317
model 0.136983
v_2 0.085322
v_6 0.068970
v_1 0.060914
v_14 0.035911
regionCode 0.014036
creatDate 0.002955
name 0.002030
v_13 -0.013993
brand -0.043799
v_7 -0.053024
v_4 -0.147085
notRepairedDamage -0.190623
v_9 -0.206205
v_10 -0.246175
v_11 -0.275320
kilometer -0.440519
v_3 -0.730946
dtype: float64
2.4 删去特征,同时删去测试集中相应的特征
# The same set of columns is removed from both frames so that they keep
# identical feature layouts.
dropped_cols = ['v_2', 'v_6', 'v_1', 'v_14', 'v_13', 'v_7', 'name', 'creatDate']
for frame in (train, test):
    frame.drop(columns=dropped_cols, inplace=True)
# Show the resulting shapes.
train.shape, test.shape
((150000, 20), (50000, 19))
# Re-check the correlation with price for the columns that remain.
train. corr( ) . unstack( ) [ 'price' ] . sort_values( ascending= False )
price 1.000000
v_12 0.692823
v_8 0.685798
v_0 0.628397
regDate 0.611959
gearbox 0.329075
bodyType 0.241303
power 0.219834
fuelType 0.200536
v_5 0.164317
model 0.136983
regionCode 0.014036
brand -0.043799
v_4 -0.147085
notRepairedDamage -0.190623
v_9 -0.206205
v_10 -0.246175
v_11 -0.275320
kilometer -0.440519
v_3 -0.730946
dtype: float64
3. 特征工程
3.1 修正特征power大于600的值
# Replace power readings above POWER_LIMIT with the median of the same data
# set. The median is computed once per frame (over the original column,
# outliers included) instead of once per outlier row inside a lambda.
POWER_LIMIT = 600
for frame in (train, test):
    power_median = frame['power'].median()
    # mask() only touches rows where the condition is True, so any NaN is
    # left alone. The explicit cast keeps the column float64 even when the
    # median happens to be a whole number.
    frame['power'] = (
        frame['power']
        .mask(frame['power'] > POWER_LIMIT, power_median)
        .astype('float64')
    )
# Histograms of the corrected columns.
train['power'].plot.hist()
test['power'].plot.hist()
3.2 填充缺失值
# Per-column count of missing values in train, showing only affected columns.
train_missing = train.isnull().sum()
train_missing[train_missing > 0]
model 1
bodyType 4506
fuelType 8680
gearbox 5981
notRepairedDamage 24324
dtype: int64
# Per-column count of missing values in test, showing only affected columns.
test_missing = test.isnull().sum()
test_missing[test_missing > 0]
bodyType 1413
fuelType 2893
gearbox 1910
notRepairedDamage 8031
dtype: int64
3.2.1 处理训练集特征model的唯一缺失值
# Show the training rows whose model is missing.
train.loc[train['model'].isnull()]
regDate model brand bodyType fuelType gearbox power kilometer notRepairedDamage regionCode price v_0 v_3 v_4 v_5 v_8 v_9 v_10 v_11 v_12 38424 20150809 NaN 37 6.0 1.0 1.0 190.0 2.0 0.0 1425 47950 41.139365 -7.275037 6.829352 0.181562 0.148487 0.222787 1.6757 -3.25056 0.876001
# Rows that share brand, bodyType, gearbox and power with the row whose
# model is missing; their model frequencies suggest a value to impute.
same_spec = (
    train['brand'].eq(37)
    & train['bodyType'].eq(6.0)
    & train['gearbox'].eq(1.0)
    & train['power'].eq(190)
)
train.loc[same_spec, 'model'].value_counts()
157.0 17
199.0 16
202.0 8
200.0 1
Name: model, dtype: int64
# Fill the missing model of row 38424 with 157.0, then show the row.
train.at[38424, 'model'] = 157.0
train.loc[38424]
regDate 2.015081e+07
model 1.570000e+02
brand 3.700000e+01
bodyType 6.000000e+00
fuelType 1.000000e+00
gearbox 1.000000e+00
power 1.900000e+02
kilometer 2.000000e+00
notRepairedDamage 0.000000e+00
regionCode 1.425000e+03
price 4.795000e+04
v_0 4.113937e+01
v_3 -7.275037e+00
v_4 6.829352e+00
v_5 1.815618e-01
v_8 1.484868e-01
v_9 2.227875e-01
v_10 1.675700e+00
v_11 -3.250560e+00
v_12 8.760013e-01
Name: 38424, dtype: float64
# Re-check non-null counts after filling the missing model value.
train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
regDate 150000 non-null int64
model 150000 non-null float64
brand 150000 non-null int64
bodyType 145494 non-null float64
fuelType 141320 non-null float64
gearbox 144019 non-null float64
power 150000 non-null float64
kilometer 150000 non-null float64
notRepairedDamage 125676 non-null float64
regionCode 150000 non-null int64
price 150000 non-null int64
v_0 150000 non-null float64
v_3 150000 non-null float64
v_4 150000 non-null float64
v_5 150000 non-null float64
v_8 150000 non-null float64
v_9 150000 non-null float64
v_10 150000 non-null float64
v_11 150000 non-null float64
v_12 150000 non-null float64
dtypes: float64(16), int64(4)
memory usage: 22.9 MB
3.2.2 处理bodyType的缺失值
# How many bodyType values are missing (True) or present (False) per set.
train_body_missing = train['bodyType'].isnull().value_counts()
test_body_missing = test['bodyType'].isnull().value_counts()
print(train_body_missing)
print('\n')
print(test_body_missing)
False 145494
True 4506
Name: bodyType, dtype: int64
False 48587
True 1413
Name: bodyType, dtype: int64
# Scatter plus linear fit of price against bodyType. x and y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=train['bodyType'], y=train['price'])
# Frequency of each bodyType value in both sets.
print(train['bodyType'].value_counts())
print('\n')
print(test['bodyType'].value_counts())
0.0 41420
1.0 35272
2.0 30324
3.0 13491
4.0 9609
5.0 7607
6.0 6482
7.0 1289
Name: bodyType, dtype: int64
0.0 13985
1.0 11882
2.0 9900
3.0 4433
4.0 3303
5.0 2537
6.0 2116
7.0 431
Name: bodyType, dtype: int64
# Missing bodyType values are replaced with 0.0 in both sets.
train.loc[:, 'bodyType'] = train['bodyType'].fillna(0.0)
test.loc[:, 'bodyType'] = test['bodyType'].fillna(0.0)
3.2.3 处理fuelType缺失值
# How many fuelType values are missing (True) or present (False) per set.
train_fuel_missing = train['fuelType'].isnull().value_counts()
test_fuel_missing = test['fuelType'].isnull().value_counts()
print(train_fuel_missing)
print('\n')
print(test_fuel_missing)
False 141320
True 8680
Name: fuelType, dtype: int64
False 47107
True 2893
Name: fuelType, dtype: int64
# Scatter plus linear fit of price against fuelType. x and y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=train['fuelType'], y=train['price'])

# Most frequent fuelType for each bodyType, learned from the training set.
# The same mapping is applied to the test set, which is what the original
# loop actually did; the per-test modes it also computed were never used.
dict_enu_train = {}
for body_type in sorted(train['bodyType'].dropna().unique()):
    fuel_modes = train.loc[train['bodyType'] == body_type, 'fuelType'].mode()
    # mode() is empty when every fuelType in the group is missing; such a
    # group gets no entry and its rows stay NaN instead of raising.
    if not fuel_modes.empty:
        dict_enu_train[body_type] = fuel_modes.iloc[0]

# Fill only the missing fuelType cells, looking the value up by the row's
# bodyType. Rows whose bodyType has no entry in the mapping stay NaN.
for frame in (train, test):
    fallback = frame['bodyType'].map(dict_enu_train)
    frame['fuelType'] = frame['fuelType'].fillna(fallback)
3.2.4 填充gearbox的缺失值
# How many gearbox values are missing (True) or present (False) per set.
train_gear_missing = train['gearbox'].isnull().value_counts()
test_gear_missing = test['gearbox'].isnull().value_counts()
print(train_gear_missing)
print('\n')
print(test_gear_missing)
False 144019
True 5981
Name: gearbox, dtype: int64
False 48090
True 1910
Name: gearbox, dtype: int64
# Scatter plus linear fit of price against gearbox. x and y are passed by
# keyword because seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=train['gearbox'], y=train['price'])
# Frequency of each gearbox value in both sets.
print(train['gearbox'].value_counts())
print('\n')
print(test['gearbox'].value_counts())
0.0 111623
1.0 32396
Name: gearbox, dtype: int64
0.0 37301
1.0 10789
Name: gearbox, dtype: int64
# Missing gearbox values are replaced with 0.0 in both sets.
train.loc[:, 'gearbox'] = train['gearbox'].fillna(0.0)
test.loc[:, 'gearbox'] = test['gearbox'].fillna(0.0)
# Non-null counts of the training set after the imputation steps so far.
train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
regDate 150000 non-null int64
model 150000 non-null float64
brand 150000 non-null int64
bodyType 150000 non-null float64
fuelType 150000 non-null float64
gearbox 150000 non-null float64
power 150000 non-null float64
kilometer 150000 non-null float64
notRepairedDamage 125676 non-null float64
regionCode 150000 non-null int64
price 150000 non-null int64
v_0 150000 non-null float64
v_3 150000 non-null float64
v_4 150000 non-null float64
v_5 150000 non-null float64
v_8 150000 non-null float64
v_9 150000 non-null float64
v_10 150000 non-null float64
v_11 150000 non-null float64
v_12 150000 non-null float64
dtypes: float64(16), int64(4)
memory usage: 22.9 MB
# Non-null counts of the test set after the imputation steps so far.
test. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
regDate 50000 non-null int64
model 50000 non-null float64
brand 50000 non-null int64
bodyType 50000 non-null float64
fuelType 50000 non-null float64
gearbox 50000 non-null float64
power 50000 non-null float64
kilometer 50000 non-null float64
notRepairedDamage 41969 non-null float64
regionCode 50000 non-null int64
v_0 50000 non-null float64
v_3 50000 non-null float64
v_4 50000 non-null float64
v_5 50000 non-null float64
v_8 50000 non-null float64
v_9 50000 non-null float64
v_10 50000 non-null float64
v_11 50000 non-null float64
v_12 50000 non-null float64
dtypes: float64(16), int64(3)
memory usage: 7.2 MB
3.2.5 最后,处理notRepairedDamage缺失值
# How many notRepairedDamage values are missing (True) or present (False).
train_damage_missing = train['notRepairedDamage'].isnull().value_counts()
test_damage_missing = test['notRepairedDamage'].isnull().value_counts()
print(train_damage_missing)
print('\n')
print(test_damage_missing)
False 125676
True 24324
Name: notRepairedDamage, dtype: int64
False 41969
True 8031
Name: notRepairedDamage, dtype: int64
# Frequency of each notRepairedDamage value in both sets.
train_damage_counts = train['notRepairedDamage'].value_counts()
test_damage_counts = test['notRepairedDamage'].value_counts()
print(train_damage_counts)
print('\n')
print(test_damage_counts)
0.0 111361
1.0 14315
Name: notRepairedDamage, dtype: int64
0.0 37249
1.0 4720
Name: notRepairedDamage, dtype: int64
# Correlation between notRepairedDamage and price (the 'price' column of
# the 2x2 correlation matrix).
train[ [ 'notRepairedDamage' , 'price' ] ] . corr( ) [ 'price' ]
notRepairedDamage -0.190623
price 1.000000
Name: price, dtype: float64
# Scatter plus linear fit of price against notRepairedDamage. x and y are
# passed by keyword because seaborn >= 0.12 no longer accepts them
# positionally.
sns.regplot(x=train['notRepairedDamage'], y=train['price'])
# Missing notRepairedDamage values are replaced with 0.0 in both sets.
train.loc[:, 'notRepairedDamage'] = train['notRepairedDamage'].fillna(0.0)
test.loc[:, 'notRepairedDamage'] = test['notRepairedDamage'].fillna(0.0)
# Non-null counts of the training set after filling notRepairedDamage.
train. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
regDate 150000 non-null int64
model 150000 non-null float64
brand 150000 non-null int64
bodyType 150000 non-null float64
fuelType 150000 non-null float64
gearbox 150000 non-null float64
power 150000 non-null float64
kilometer 150000 non-null float64
notRepairedDamage 150000 non-null float64
regionCode 150000 non-null int64
price 150000 non-null int64
v_0 150000 non-null float64
v_3 150000 non-null float64
v_4 150000 non-null float64
v_5 150000 non-null float64
v_8 150000 non-null float64
v_9 150000 non-null float64
v_10 150000 non-null float64
v_11 150000 non-null float64
v_12 150000 non-null float64
dtypes: float64(16), int64(4)
memory usage: 22.9 MB
# Non-null counts of the test set after filling notRepairedDamage.
test. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
regDate 50000 non-null int64
model 50000 non-null float64
brand 50000 non-null int64
bodyType 50000 non-null float64
fuelType 50000 non-null float64
gearbox 50000 non-null float64
power 50000 non-null float64
kilometer 50000 non-null float64
notRepairedDamage 50000 non-null float64
regionCode 50000 non-null int64
v_0 50000 non-null float64
v_3 50000 non-null float64
v_4 50000 non-null float64
v_5 50000 non-null float64
v_8 50000 non-null float64
v_9 50000 non-null float64
v_10 50000 non-null float64
v_11 50000 non-null float64
v_12 50000 non-null float64
dtypes: float64(16), int64(3)
memory usage: 7.2 MB
4. 建模与调参
4.1 选择三个集成学习模型:随机森林,XGBoost, 梯度提升树GBDT
# Three ensemble regressors to compare, all seeded for reproducibility.
rf = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=1)
# The keyword used to be misspelled `n_stimators`, which XGBoost accepted as
# an unknown extra parameter, so the tree count silently stayed at the
# library default. The correct name is n_estimators.
xgb = XGBRegressor(n_estimators=150, max_depth=8, learning_rate=0.1, random_state=1)
gbdt = GradientBoostingRegressor(subsample=0.8, random_state=1)

# Feature matrix (everything except the target) and target vector.
X = train.drop(columns=['price'])
y = train['price']
4.2 交叉验证,观察模型表现
def _mean_cv_mae(estimator):
    """Return the mean absolute error of `estimator` over 5-fold CV on X, y.

    scikit-learn reports the negated MAE, so the sign is flipped back.
    """
    fold_scores = cross_val_score(
        estimator, X, y, scoring='neg_mean_absolute_error', cv=5
    )
    return -1 * fold_scores.mean()


score_rf = _mean_cv_mae(rf)
print('随机森林模型的平均MAE为:', score_rf)

score_xgb = _mean_cv_mae(xgb)
print('XGBoost模型的平均MAE为:', score_xgb)

score_gbdt = _mean_cv_mae(gbdt)
print('梯度提升树模型的平均MAE为:', score_gbdt)
随机森林模型的平均MAE为: 924.43649869
XGBoost模型的平均MAE为: 616.449663619
梯度提升树模型的平均MAE为: 893.439059092
4.3 选中XGBoost模型,开始调参(网格搜索)
# Hyper-parameter grid for XGBoost: 3 tree counts x 2 subsample ratios at a
# fixed learning rate, scored by (negated) MAE with 3-fold CV.
params = dict(
    n_estimators=[150, 200, 250],
    learning_rate=[0.1],
    subsample=[0.5, 0.8],
)
model = GridSearchCV(xgb, params, scoring='neg_mean_absolute_error', cv=3)
model.fit(X, y)

# Report the winning configuration.
search_report = (
    ('最佳参数为:\n', model.best_params_),
    ('最佳分数为:\n', model.best_score_),
    ('最佳模型为:\n', model.best_estimator_),
)
for label, value in search_report:
    print(label, value)
最佳参数为:
{'learning_rate': 0.1, 'n_estimators': 250, 'subsample': 0.8}
最佳分数为:
-587.043780247
最佳模型为:
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=8, min_child_weight=1, missing=None, n_estimators=250,
n_jobs=1, n_stimators=150, nthread=None, objective='reg:linear',
random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
seed=None, silent=True, subsample=0.8)
5. 提交结果
# Predict test-set prices with the fitted grid search object.
predictions = model.predict(test)
# Pair each prediction with the SaleID from the raw test frame; the working
# copy `test` no longer has that column.
result = df_test[['SaleID']].assign(price=predictions)
result.to_csv('/home/myspace/My_submission.csv', index=False)