二手车交易价格预测-CSDN博客

本文链接：https://blog.csdn.net/qq_38663663/article/details/112555878

该博客围绕数据处理与建模展开。先对数据进行初步检查，转换‘notRepairedDamage’列类型、删除无帮助特征；接着进行探索性数据分析，因数据量大选用函数和赛题描述完成；然后开展特征工程，修正power值、填充缺失值；再选择随机森林、XGBoost、GBDT三个模型，交叉验证后选中XGBoost调参；最后提交结果。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

# 导入相关库及配置
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, GridSearchCV  # 交叉验证，网格搜索
pd.options.display.max_columns = None  # 取消最大列显示限制
warnings.filterwarnings('ignore')  # 过滤警告信息，保证清爽输出
%matplotlib inline

# Read the raw training and test splits (space-separated CSV files).
df_train = pd.read_csv('datalab/231784/used_car_train_20200313.csv', sep=' ')
df_test = pd.read_csv('datalab/231784/used_car_testA_20200313.csv', sep=' ')

# Working copies without the SaleID identifier; the raw frames keep it because
# the submission step reads SaleID from df_test.
train, test = (raw.drop(columns=['SaleID']) for raw in (df_train, df_test))

1. 数据初瞥

train.head()

	name	regDate	model	brand	bodyType	gearbox	power	kilometer	notRepairedDamage	regionCode	creatDate	price	v_0	v_1	v_2	v_3	v_4	v_5	v_6	v_7	v_8	v_9	v_10	v_11	v_12	v_13	v_14
0	736	20040402	30.0	6	1.0	0.0	60	12.5	0.0	1046	20160404	1850	43.357796	3.966344	0.050257	2.159744	1.143786	0.235676	0.101988	0.129549	0.022816	0.097462	-2.881803	2.804097	-2.420821	0.795292	0.914762
1	2262	20030301	40.0	1	2.0	0.0	0	15.0	-	4366	20160309	3600	45.305273	5.236112	0.137925	1.380657	-1.422165	0.264777	0.121004	0.135731	0.026597	0.020582	-4.900482	2.096338	-1.030483	-1.722674	0.245522
2	14874	20040403	115.0	15	1.0	0.0	163	12.5	0.0	2806	20160402	6222	45.978359	4.823792	1.319524	-0.998467	-0.996911	0.251410	0.114912	0.165147	0.062173	0.027075	-4.846749	1.803559	1.565330	-0.832687	-0.229963
3	71865	19960908	109.0	10	0.0	1.0	193	15.0	0.0	434	20160312	2400	45.687478	4.492574	-0.050616	0.883600	-2.228079	0.274293	0.110300	0.121964	0.033395	0.000000	-4.509599	1.285940	-0.501868	-2.438353	-0.478699
4	111080	20120103	110.0	5	1.0	0.0	68	5.0	0.0	6977	20160313	5200	44.383511	2.031433	0.572169	-1.571239	2.246088	0.228036	0.073205	0.091880	0.078819	0.121534	-1.896240	0.910783	0.931110	2.834518	1.923482

test.head()

	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	notRepairedDamage	regionCode	creatDate	v_0	v_1	v_2	v_3	v_4	v_5	v_6	v_7	v_8	v_9	v_10	v_11	v_12	v_13	v_14
0	66932	20111212	222.0	4	5.0	1.0	1.0	313	15.0	0.0	1440	20160329	49.593127	5.246568	1.001130	-4.122264	0.737532	0.264405	0.121800	0.070899	0.106558	0.078867	-7.050969	-0.854626	4.800151	0.620011	-3.664654
1	174960	19990211	19.0	21	0.0	0.0	0.0	75	12.5	1.0	5419	20160404	42.395926	-3.253950	-1.753754	3.646605	-0.725597	0.261745	0.000000	0.096733	0.013705	0.052383	3.679418	-0.729039	-3.796107	-1.541230	-0.757055
2	5356	20090304	82.0	21	0.0	0.0	0.0	109	7.0	0.0	5045	20160308	45.841370	4.704178	0.155391	-1.118443	-0.229160	0.260216	0.112081	0.078082	0.062078	0.050540	-4.926690	1.001106	0.826562	0.138226	0.754033
3	50688	20100405	0.0	0	0.0	0.0	1.0	160	7.0	0.0	4023	20160325	46.440649	4.319155	0.428897	-2.037916	-0.234757	0.260466	0.106727	0.081146	0.075971	0.048268	-4.864637	0.505493	1.870379	0.366038	1.312775
4	161428	19970703	26.0	14	2.0	0.0	0.0	75	15.0	0.0	3103	20160309	42.184604	-3.166234	-1.572058	2.604143	0.387498	0.250999	0.000000	0.077806	0.028600	0.081709	3.616475	-0.673236	-3.197685	-0.025678	-0.101290

# 查看总览 - 训练集
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 30 columns):
name                 150000 non-null int64
regDate              150000 non-null int64
model                149999 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null int64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null object
regionCode           150000 non-null int64
seller               150000 non-null int64
offerType            150000 non-null int64
creatDate            150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_1                  150000 non-null float64
v_2                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_6                  150000 non-null float64
v_7                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
v_13                 150000 non-null float64
v_14                 150000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 34.3+ MB

# 查看总览 - 测试集
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 29 columns):
name                 50000 non-null int64
regDate              50000 non-null int64
model                50000 non-null float64
brand                50000 non-null int64
bodyType             48587 non-null float64
fuelType             47107 non-null float64
gearbox              48090 non-null float64
power                50000 non-null int64
kilometer            50000 non-null float64
notRepairedDamage    50000 non-null object
regionCode           50000 non-null int64
seller               50000 non-null int64
offerType            50000 non-null int64
creatDate            50000 non-null int64
v_0                  50000 non-null float64
v_1                  50000 non-null float64
v_2                  50000 non-null float64
v_3                  50000 non-null float64
v_4                  50000 non-null float64
v_5                  50000 non-null float64
v_6                  50000 non-null float64
v_7                  50000 non-null float64
v_8                  50000 non-null float64
v_9                  50000 non-null float64
v_10                 50000 non-null float64
v_11                 50000 non-null float64
v_12                 50000 non-null float64
v_13                 50000 non-null float64
v_14                 50000 non-null float64
dtypes: float64(20), int64(8), object(1)
memory usage: 11.1+ MB

1.1 ‘notRepairedDamage’列是唯一的非数值型特征，取值只有 0、1 或 ‘-’。应先将 ‘-’ 替换为空值，再把该列转换为数值类型

# 'notRepairedDamage' is loaded as an object column: replace the '-' placeholder
# with NaN first, then cast the column to float64, in both data sets.
for frame in (train, test):
    frame['notRepairedDamage'] = (
        frame['notRepairedDamage'].replace('-', np.nan).astype('float64')
    )

# Check the conversion: show the distinct values left in each data set.
train['notRepairedDamage'].unique(), test['notRepairedDamage'].unique()

(array([  0.,  nan,   1.]), array([  0.,   1.,  nan]))

# 查看数值统计描述 - 测试集
test.describe()

	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	notRepairedDamage	regionCode	seller	offerType	creatDate	v_0	v_1	v_2	v_3	v_4	v_5	v_6	v_7	v_8	v_9	v_10	v_11	v_12	v_13	v_14
count	50000.000000	5.000000e+04	50000.000000	50000.000000	48587.000000	47107.000000	48090.000000	50000.000000	50000.000000	41969.000000	50000.000000	50000.0	50000.0	5.000000e+04	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000
mean	68542.223280	2.003393e+07	46.844520	8.056240	1.782185	0.373405	0.224350	119.883620	12.595580	0.112464	2590.604820	0.0	0.0	2.016033e+07	44.418233	-0.037238	0.050534	0.084640	0.015001	0.248669	0.045021	0.122744	0.057997	0.062000	-0.017855	-0.013742	-0.013554	-0.003147	0.001516
std	61052.808133	5.368870e+04	49.469548	7.819477	1.760736	0.546442	0.417158	185.097387	3.908979	0.315940	1876.970263	0.0	0.0	7.951521e+01	2.429950	3.642562	2.856341	2.026510	1.193026	0.044601	0.051766	0.195972	0.029211	0.035653	3.747985	3.231258	2.515962	1.286597	1.027360
min	0.000000	1.991000e+07	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.500000	0.000000	0.000000	0.0	0.0	2.015061e+07	28.987024	-4.137733	-4.205728	-5.638184	-4.287718	0.000000	0.000000	0.000000	0.000000	0.000000	-9.160049	-5.411964	-8.916949	-4.123333	-6.112667
25%	11203.500000	1.999091e+07	10.000000	1.000000	0.000000	0.000000	0.000000	75.000000	12.500000	0.000000	1030.000000	0.0	0.0	2.016031e+07	43.139621	-3.191909	-0.971266	-1.453453	-0.928089	0.243762	0.000044	0.062644	0.035084	0.033714	-3.700121	-1.971325	-1.876703	-1.060428	-0.437920
50%	52248.500000	2.003091e+07	29.000000	6.000000	1.000000	0.000000	0.000000	109.000000	15.000000	0.000000	2219.000000	0.0	0.0	2.016032e+07	44.611084	-3.050756	-0.388117	0.097881	-0.070225	0.257877	0.000815	0.095828	0.057084	0.058764	1.613212	-0.355843	-0.142779	-0.035956	0.138799
75%	118856.500000	2.007110e+07	65.000000	13.000000	3.000000	1.000000	0.000000	150.000000	15.000000	0.000000	3857.000000	0.0	0.0	2.016033e+07	45.992639	3.997323	0.240548	1.562700	0.863731	0.265328	0.102025	0.125438	0.079077	0.087489	2.832708	1.262914	1.764335	0.941469	0.681163
max	196805.000000	2.015121e+07	246.000000	39.000000	7.000000	6.000000	1.000000	20000.000000	15.000000	1.000000	8121.000000	0.0	0.0	2.016041e+07	51.751684	7.553517	18.394570	9.381599	5.270150	0.291618	0.153265	1.358813	0.156355	0.214775	12.338872	18.856218	12.950498	5.913273	2.624622

# 查看数值统计描述 - 训练集
train.describe()

	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	notRepairedDamage	regionCode	seller	offerType	creatDate	price	v_0	v_1	v_2	v_3	v_4	v_5	v_6	v_7	v_8	v_9	v_10	v_11	v_12	v_13	v_14
count	150000.000000	1.500000e+05	149999.000000	150000.000000	145494.000000	141320.000000	144019.000000	150000.000000	150000.000000	125676.000000	150000.000000	150000.000000	150000.0	1.500000e+05	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000	150000.000000
mean	68349.172873	2.003417e+07	47.129021	8.052733	1.792369	0.375842	0.224943	119.316547	12.597160	0.113904	2583.077267	0.000007	0.0	2.016033e+07	5923.327333	44.406268	-0.044809	0.080765	0.078833	0.017875	0.248204	0.044923	0.124692	0.058144	0.061996	-0.001000	0.009035	0.004813	0.000313	-0.000688
std	61103.875095	5.364988e+04	49.536040	7.864956	1.760640	0.548677	0.417546	177.168419	3.919576	0.317696	1885.363218	0.002582	0.0	1.067328e+02	7501.998477	2.457548	3.641893	2.929618	2.026514	1.193661	0.045804	0.051743	0.201410	0.029186	0.035692	3.772386	3.286071	2.517478	1.288988	1.038685
min	0.000000	1.991000e+07	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.500000	0.000000	0.000000	0.000000	0.0	2.015062e+07	11.000000	30.451976	-4.295589	-4.470671	-7.275037	-4.364565	0.000000	0.000000	0.000000	0.000000	0.000000	-9.168192	-5.558207	-9.639552	-4.153899	-6.546556
25%	11156.000000	1.999091e+07	10.000000	1.000000	0.000000	0.000000	0.000000	75.000000	12.500000	0.000000	1018.000000	0.000000	0.0	2.016031e+07	1300.000000	43.135799	-3.192349	-0.970671	-1.462580	-0.921191	0.243615	0.000038	0.062474	0.035334	0.033930	-3.722303	-1.951543	-1.871846	-1.057789	-0.437034
50%	51638.000000	2.003091e+07	30.000000	6.000000	1.000000	0.000000	0.000000	110.000000	15.000000	0.000000	2196.000000	0.000000	0.0	2.016032e+07	3250.000000	44.610266	-3.052671	-0.382947	0.099722	-0.075910	0.257798	0.000812	0.095866	0.057014	0.058484	1.624076	-0.358053	-0.130753	-0.036245	0.141246
75%	118841.250000	2.007111e+07	66.000000	13.000000	3.000000	1.000000	0.000000	150.000000	15.000000	0.000000	3843.000000	0.000000	0.0	2.016033e+07	7700.000000	46.004721	4.000670	0.241335	1.565838	0.868758	0.265297	0.102009	0.125243	0.079382	0.087491	2.844357	1.255022	1.776933	0.942813	0.680378
max	196812.000000	2.015121e+07	247.000000	39.000000	7.000000	6.000000	1.000000	19312.000000	15.000000	1.000000	8120.000000	1.000000	0.0	2.016041e+07	99999.000000	52.304178	7.320308	19.035496	9.854702	6.829352	0.291838	0.151420	1.404936	0.160791	0.222787	12.357011	18.819042	13.847792	11.147669	8.658418

1.2 发现seller特征在训练集和测试集中偏斜极其严重，对预测没有帮助，删去

# Remove the 'seller' column from both data sets, in place.
for frame in (train, test):
    frame.drop(columns=['seller'], inplace=True)

1.3 意外发现两个数据集的offerType列全为0，删去。

# Rebind both frames to copies without the 'offerType' column.
train, test = (frame.drop(columns=['offerType']) for frame in (train, test))

train.shape, test.shape

((150000, 28), (50000, 27))

2. 探索性数据分析

2.1 用图表展示各特征与售价之间的数量关系（事实证明该图表的绘制非常耗时）

# fig = plt.figure(figsize=(10, 50))

# for i in range(len(train.columns)-1):  # 要减去price列
#     fig.add_subplot(10, 2, i+1)
#     sns.regplot(train.drop(['price'], axis=1).iloc[:, i], train['price'])

# plt.tight_layout()
# plt.show()

2.2 由于数据量过大，受性能限制很难用可视化工具展示数据分布的特征。因地制宜，选用函数及赛题数据描述来完成探索性数据分析

赛题数据描述讲到，power 的取值范围为 [0, 600]，然而训练集和测试集中都存在大于 600 的值：


# Count the power values above 600. These are invalid and have to be replaced
# (the original analysis reported 143 such values).
(train['power'] > 600).sum()

(test['power'] > 600).sum()

2.3 现在，特征工程能做的只是填充缺失值以及删除某些特征。在开始之前，先看看线性相关系数

# Linear correlation coefficient between every feature and the sale price,
# sorted from the strongest positive to the strongest negative.
train.corr().unstack()['price'].sort_values(ascending=False)

price                1.000000
v_12                 0.692823
v_8                  0.685798
v_0                  0.628397
regDate              0.611959
gearbox              0.329075
bodyType             0.241303
power                0.219834
fuelType             0.200536
v_5                  0.164317
model                0.136983
v_2                  0.085322
v_6                  0.068970
v_1                  0.060914
v_14                 0.035911
regionCode           0.014036
creatDate            0.002955
name                 0.002030
v_13                -0.013993
brand               -0.043799
v_7                 -0.053024
v_4                 -0.147085
notRepairedDamage   -0.190623
v_9                 -0.206205
v_10                -0.246175
v_11                -0.275320
kilometer           -0.440519
v_3                 -0.730946
dtype: float64

# 在选择需要删除的特征之前，考虑线性相关系数低的。第一步选中系数绝对值小于0.1的特征， 第二步，抛开线性相关系数，从现实角度思考每个特征对售价的影响

# 特征v_2, v_6, v_1, v_14, v_13, v_7：由于是连续型变量，理论上具有数学意义。既然跟售价的线性相关系数极低，为降低噪声，避免过拟合，考虑删去；

# 特征regionCode, brand：并非连续型变量，不具备数学上的可比较性。与售价的线性相关系数低无法说明各自的取值对售价影响不大，保留。

# 特征name：汽车交易名称，训练集共有99662条不重复值，取值不影响售价，删去。

# 特征creatDate：（二手）汽车开始售卖时间，范围在 [20150618, 20160407]，间隔短，且与regDate（汽车注册时间）线性相关系数仅为-0.001293，其取值显然对售价影响很小，删去。

2.4 删去特征，同时删去测试集中相应的特征

# Drop the rejected features from both data sets in place, using one shared
# column list so the two frames stay aligned.
for frame in (train, test):
    frame.drop(['v_2', 'v_6', 'v_1', 'v_14', 'v_13', 'v_7', 'name', 'creatDate'],
               axis=1, inplace=True)

train.shape, test.shape

((150000, 20), (50000, 19))

# Look again at the linear correlation coefficient between each remaining
# feature and the sale price, sorted in descending order.
train.corr().unstack()['price'].sort_values(ascending=False)

price                1.000000
v_12                 0.692823
v_8                  0.685798
v_0                  0.628397
regDate              0.611959
gearbox              0.329075
bodyType             0.241303
power                0.219834
fuelType             0.200536
v_5                  0.164317
model                0.136983
regionCode           0.014036
brand               -0.043799
v_4                 -0.147085
notRepairedDamage   -0.190623
v_9                 -0.206205
v_10                -0.246175
v_11                -0.275320
kilometer           -0.440519
v_3                 -0.730946
dtype: float64

3. 特征工程

3.1 修正特征power大于600的值

# Replace out-of-range power values (> 600) with the median of the same column.
# Each frame's median is computed once, before any replacement, and applied with
# a vectorised mask instead of a per-element lambda that recomputed the median
# for every outlier. The explicit float64 cast keeps the resulting dtype the
# same as the float median that is written into the column.
for frame in (train, test):
    power_median = frame['power'].median()
    out_of_range = frame['power'] > 600
    frame['power'] = frame['power'].astype('float64').mask(out_of_range, power_median)

# 检查是否替换成功
train['power'].plot.hist()

在这里插入图片描述

test['power'].plot.hist()

在这里插入图片描述

3.2 填充缺失值

# Training-set columns that still contain missing values, with their counts.
train_missing = train.isnull().sum()
train_missing[train_missing > 0]

model                    1
bodyType              4506
fuelType              8680
gearbox               5981
notRepairedDamage    24324
dtype: int64

# Test-set columns that still contain missing values, with their counts.
test_missing = test.isnull().sum()
test_missing[test_missing > 0]

bodyType             1413
fuelType             2893
gearbox              1910
notRepairedDamage    8031
dtype: int64

3.2.1 处理训练集特征model的唯一缺失值

train[train['model'].isnull()]

	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	notRepairedDamage	regionCode	price	v_0	v_3	v_4	v_5	v_8	v_9	v_10	v_11	v_12
38424	20150809	NaN	37	6.0	1.0	1.0	190.0	2.0	0.0	1425	47950	41.139365	-7.275037	6.829352	0.181562	0.148487	0.222787	1.6757	-3.25056	0.876001

# `model` (vehicle model code) generally depends on brand, bodyType, gearbox and
# power. Among the cars that share those four values with the incomplete row,
# list how often each model code occurs so the most frequent one can be chosen.
same_spec = (
    (train['brand'] == 37)
    & (train['bodyType'] == 6.0)
    & (train['gearbox'] == 1.0)
    & (train['power'] == 190)
)
train.loc[same_spec, 'model'].value_counts()

157.0    17
199.0    16
202.0     8
200.0     1
Name: model, dtype: int64

# Fill the missing `model` value in row 38424 with 157.0.
missing_model_row, chosen_model = 38424, 157.0
train.loc[missing_model_row, 'model'] = chosen_model

train.loc[38424, :]

regDate              2.015081e+07
model                1.570000e+02
brand                3.700000e+01
bodyType             6.000000e+00
fuelType             1.000000e+00
gearbox              1.000000e+00
power                1.900000e+02
kilometer            2.000000e+00
notRepairedDamage    0.000000e+00
regionCode           1.425000e+03
price                4.795000e+04
v_0                  4.113937e+01
v_3                 -7.275037e+00
v_4                  6.829352e+00
v_5                  1.815618e-01
v_8                  1.484868e-01
v_9                  2.227875e-01
v_10                 1.675700e+00
v_11                -3.250560e+00
v_12                 8.760013e-01
Name: 38424, dtype: float64

# 查看填充结果
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
regDate              150000 non-null int64
model                150000 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null float64
kilometer            150000 non-null float64
notRepairedDamage    125676 non-null float64
regionCode           150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
dtypes: float64(16), int64(4)
memory usage: 22.9 MB

3.2.2 处理bodyType的缺失值

# Number of missing (True) versus present (False) bodyType values in each data set.
print(train['bodyType'].isna().value_counts(),
      '\n',
      test['bodyType'].isna().value_counts(),
      sep='\n')

False    145494
True       4506
Name: bodyType, dtype: int64


False    48587
True      1413
Name: bodyType, dtype: int64

# Only a small share of bodyType values is missing, so look at how the feature
# relates to price before deciding whether to drop it.
# Draw a scatter-style linear-relationship plot of the feature against price.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
sns.regplot(x='bodyType', y='price', data=train)

在这里插入图片描述

# Prices differ noticeably between body types, so the feature is kept and its
# missing values are filled. Show how many cars fall into each body type.
print(train['bodyType'].value_counts(),
      '\n',
      test['bodyType'].value_counts(),
      sep='\n')

0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64


0.0    13985
1.0    11882
2.0     9900
3.0     4433
4.0     3303
5.0     2537
6.0     2116
7.0      431
Name: bodyType, dtype: int64

# Body type 0.0 (limousine) is the most frequent value in both data sets, so it
# is used to fill the missing entries. fillna() replaces the per-element
# `map(lambda ...)` null check with the built-in vectorised operation.
train['bodyType'] = train['bodyType'].fillna(0.0)
test['bodyType'] = test['bodyType'].fillna(0.0)

3.2.3 处理fuelType缺失值

# Number of missing (True) versus present (False) fuelType values in each data set.
print(train['fuelType'].isna().value_counts(),
      '\n',
      test['fuelType'].isna().value_counts(),
      sep='\n')

False    141320
True       8680
Name: fuelType, dtype: int64


False    47107
True      2893
Name: fuelType, dtype: int64

# Only a small share of fuelType values is missing, so look at how the feature
# relates to price before deciding whether to drop it.
# Draw a scatter-style linear-relationship plot of the feature against price.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
sns.regplot(x='fuelType', y='price', data=train)

在这里插入图片描述

# Hypothesis: fuel type is related to body type (e.g. a limousine is more likely
# to run on petrol or electricity, while a mixer truck is mostly diesel).
# For every bodyType code, record the most frequent fuelType in each data set.
body_type_codes = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
dict_enu_train = {
    code: train.loc[train['bodyType'] == code, 'fuelType'].mode()[0]
    for code in body_type_codes
}
dict_enu_test = {
    code: test.loc[test['bodyType'] == code, 'fuelType'].mode()[0]
    for code in body_type_codes
}
# The original analysis found dict_enu_train and dict_enu_test to be identical,
# so the training-set mapping is used to fill both data sets.

# Fill each missing fuelType with the mode recorded for that row's bodyType.
# fillna() aligns the mapped Series on the row index and only touches NaN
# entries, so no per-bodyType index lists are needed and no chained-indexing
# assignment (which can silently fail to write) is involved.
train['fuelType'] = train['fuelType'].fillna(train['bodyType'].map(dict_enu_train))
test['fuelType'] = test['fuelType'].fillna(test['bodyType'].map(dict_enu_train))

3.2.4 填充gearbox的缺失值

# Number of missing (True) versus present (False) gearbox values in each data set.
print(train['gearbox'].isna().value_counts(),
      '\n',
      test['gearbox'].isna().value_counts(),
      sep='\n')

False    144019
True       5981
Name: gearbox, dtype: int64


False    48090
True      1910
Name: gearbox, dtype: int64

# Only a small share of gearbox values is missing, so look at how the feature
# relates to price before deciding whether to drop it.
# Draw a scatter-style linear-relationship plot of the feature against price.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
sns.regplot(x='gearbox', y='price', data=train)

在这里插入图片描述

# The gearbox type does not change the price much. Dropping the rows with a
# missing gearbox might be workable, but to avoid the overfitting a smaller
# sample could cause, the feature is kept and its missing values are filled.
# Show how many cars fall into each gearbox type.
print(train['gearbox'].value_counts(),
      '\n',
      test['gearbox'].value_counts(),
      sep='\n')

0.0    111623
1.0     32396
Name: gearbox, dtype: int64


0.0    37301
1.0    10789
Name: gearbox, dtype: int64

# Training set: fill missing gearbox values with 0.0 (manual), the dominant class.
# fillna() replaces the per-element `map(lambda ...)` null check with the
# built-in vectorised operation.
train['gearbox'] = train['gearbox'].fillna(0.0)

# Test set: no row may be dropped, or the prediction file would be incomplete.
# Only 1910 gearbox values are missing, so they are filled with 0.0 as well.
test['gearbox'] = test['gearbox'].fillna(0.0)

# 检查填充是否成功
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
regDate              150000 non-null int64
model                150000 non-null float64
brand                150000 non-null int64
bodyType             150000 non-null float64
fuelType             150000 non-null float64
gearbox              150000 non-null float64
power                150000 non-null float64
kilometer            150000 non-null float64
notRepairedDamage    125676 non-null float64
regionCode           150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
dtypes: float64(16), int64(4)
memory usage: 22.9 MB

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
regDate              50000 non-null int64
model                50000 non-null float64
brand                50000 non-null int64
bodyType             50000 non-null float64
fuelType             50000 non-null float64
gearbox              50000 non-null float64
power                50000 non-null float64
kilometer            50000 non-null float64
notRepairedDamage    41969 non-null float64
regionCode           50000 non-null int64
v_0                  50000 non-null float64
v_3                  50000 non-null float64
v_4                  50000 non-null float64
v_5                  50000 non-null float64
v_8                  50000 non-null float64
v_9                  50000 non-null float64
v_10                 50000 non-null float64
v_11                 50000 non-null float64
v_12                 50000 non-null float64
dtypes: float64(16), int64(3)
memory usage: 7.2 MB

3.2.5 最后，处理notRepairedDamage缺失值

# Number of missing (True) versus present (False) notRepairedDamage values.
# The share of missing values is not small in either data set.
print(train['notRepairedDamage'].isna().value_counts(),
      '\n',
      test['notRepairedDamage'].isna().value_counts(),
      sep='\n')

False    125676
True      24324
Name: notRepairedDamage, dtype: int64


False    41969
True      8031
Name: notRepairedDamage, dtype: int64

# Distribution of the known notRepairedDamage values in each data set.
print(train['notRepairedDamage'].value_counts(),
      '\n',
      test['notRepairedDamage'].value_counts(),
      sep='\n')

0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64


0.0    37249
1.0     4720
Name: notRepairedDamage, dtype: int64

# Linear correlation coefficient between notRepairedDamage and price.
damage_vs_price = train[['notRepairedDamage', 'price']].corr()
damage_vs_price['price']

notRepairedDamage   -0.190623
price                1.000000
Name: price, dtype: float64

# Draw a scatter-style linear-relationship plot of the feature against price.
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
sns.regplot(x='notRepairedDamage', y='price', data=train)

在这里插入图片描述

# Author's observation: oddly, across the whole training set, cars with
# unrepaired damage sell for more than cars whose damage was repaired; with
# close to 20 other features in play this is presumably a coincidence.
# To keep things simple, every missing value is filled with 0.0, the most
# frequent value. fillna() replaces the per-element `map(lambda ...)` null
# check with the built-in vectorised operation.
train['notRepairedDamage'] = train['notRepairedDamage'].fillna(0.0)
test['notRepairedDamage'] = test['notRepairedDamage'].fillna(0.0)

# 最后。检查填充结果
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
regDate              150000 non-null int64
model                150000 non-null float64
brand                150000 non-null int64
bodyType             150000 non-null float64
fuelType             150000 non-null float64
gearbox              150000 non-null float64
power                150000 non-null float64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null float64
regionCode           150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
dtypes: float64(16), int64(4)
memory usage: 22.9 MB

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 19 columns):
regDate              50000 non-null int64
model                50000 non-null float64
brand                50000 non-null int64
bodyType             50000 non-null float64
fuelType             50000 non-null float64
gearbox              50000 non-null float64
power                50000 non-null float64
kilometer            50000 non-null float64
notRepairedDamage    50000 non-null float64
regionCode           50000 non-null int64
v_0                  50000 non-null float64
v_3                  50000 non-null float64
v_4                  50000 non-null float64
v_5                  50000 non-null float64
v_8                  50000 non-null float64
v_9                  50000 non-null float64
v_10                 50000 non-null float64
v_11                 50000 non-null float64
v_12                 50000 non-null float64
dtypes: float64(16), int64(3)
memory usage: 7.2 MB

4. 建模与调参

4.1 选择三个集成学习模型：随机森林，XGBoost, 梯度提升树GBDT

# Three candidate ensemble regressors, each with a fixed seed.
rf = RandomForestRegressor(n_estimators=100, max_depth=8, random_state=1)
# Fixed: the keyword was misspelled `n_stimators`, so the intended 150 boosting
# rounds were never applied and the misspelled name was carried along as an
# unrelated extra parameter.
xgb = XGBRegressor(n_estimators=150, max_depth=8, learning_rate=0.1, random_state=1)
gbdt = GradientBoostingRegressor(subsample=0.8, random_state=1)  # subsample < 1 lowers variance but raises bias

# Feature matrix (every column except the target) and target vector.
X = train.drop(['price'], axis=1)
y = train['price']

4.2 交叉验证，观察模型表现

def _mean_cv_mae(estimator):
    """Return the mean absolute error of `estimator`, averaged over 5 CV folds on (X, y)."""
    fold_scores = cross_val_score(estimator,
                                  X,
                                  y,
                                  scoring='neg_mean_absolute_error',
                                  cv=5)
    # The scorer reports negated MAE, so flip the sign of the fold average.
    return -1 * fold_scores.mean()


# Random forest
score_rf = _mean_cv_mae(rf)
print('随机森林模型的平均MAE为：', score_rf)

# XGBoost
score_xgb = _mean_cv_mae(xgb)
print('XGBoost模型的平均MAE为：', score_xgb)

# Gradient boosted trees (GBDT)
score_gbdt = _mean_cv_mae(gbdt)
print('梯度提升树模型的平均MAE为：', score_gbdt)

随机森林模型的平均MAE为： 924.43649869
XGBoost模型的平均MAE为： 616.449663619
梯度提升树模型的平均MAE为： 893.439059092

4.3 选中XGBoost模型，开始调参（网格搜索）

# Search grid: number of trees and row subsample ratio, learning rate held at 0.1.
params = dict(n_estimators=[150, 200, 250],
              learning_rate=[0.1],
              subsample=[0.5, 0.8])

# 3-fold grid search over the XGBoost regressor, scored by negated MAE.
model = GridSearchCV(xgb, params, scoring='neg_mean_absolute_error', cv=3)
model.fit(X, y)

# Report the best parameters, best score and best estimator.
for heading, value in (('最佳参数为：\n', model.best_params_),
                       ('最佳分数为：\n', model.best_score_),
                       ('最佳模型为：\n', model.best_estimator_)):
    print(heading, value)

最佳参数为：
 {'learning_rate': 0.1, 'n_estimators': 250, 'subsample': 0.8}
最佳分数为：
 -587.043780247
最佳模型为：
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=250,
       n_jobs=1, n_stimators=150, nthread=None, objective='reg:linear',
       random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8)

5. 提交结果

# Predict prices for the test set with the fitted search object, pair them with
# the SaleID values from the raw test frame, and write the submission CSV.
predictions = model.predict(test)
result = df_test[['SaleID']].assign(price=predictions)
result.to_csv('/home/myspace/My_submission.csv', index=False)