河北高校邀请赛——二手车交易价格预测-Task2 EDA

EDA
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time
#导入数据
Train_data = pd.read_csv('car_train_0110.csv', sep=' ')
Test_data = pd.read_csv('car_testA_0110.csv', sep=' ')
# 合并方便后面的操作
df = pd.concat([Train_data, Test_data], ignore_index=True)
# 看下数据的情况,前后各5行
Train_data.head().append(Train_data.tail())
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_14v_15v_16v_17v_18v_19v_20v_21v_22v_23
01348907342016000213.09NaN0.01.0015.0...0.0921390.00000018.763832-1.512063-1.008718-12.100623-0.9470529.0772970.5812143.945923
13066481969732008030772.097.05.01.017315.0...0.0010700.122335-5.685612-0.489963-2.223693-0.226865-0.658246-3.9496214.593618-1.145653
2340675253472002031218.0123.00.01.05012.5...0.0644100.003345-3.2957001.8164993.554439-0.6836750.9714952.625318-0.851922-1.246135
35733253822000061138.087.00.01.05415.0...0.0692310.000000-3.4055211.4978264.7826360.0391011.2276463.040629-0.801854-1.251894
42652351731742003010987.005.05.01.01313.0...0.0000990.001655-4.4754290.1241381.364567-0.319848-1.131568-3.303424-1.998466-1.279368
2499951055693322017000313.09NaNNaN1.05815.0...0.0791190.00144711.78250820.402576-2.7227720.462388-4.4293857.8834130.698405-1.082013
2499961467101021102003051129.0173.00.00.06115.0...0.0000000.002342-2.9882721.5005323.502201-0.761715-2.484556-2.532968-0.940266-1.106426
2499971160668280220130312124.0166.00.01.01223.0...0.0033580.100760-6.939560-1.144959-5.3379490.896026-0.592565-3.8727252.1359843.807554
249998900826597120121212111.047.05.00.01849.0...0.0029740.008251-7.222167-1.383696-5.402794-0.409451-1.891556-3.104789-3.7773743.186218
24999976453569542005111113.093.00.01.05812.5...0.0000000.00907110.491312-11.270043-0.272595-0.026478-2.168249-0.980042-0.955164-1.169593

10 rows × 40 columns

Test_data.head().append(Train_data.tail())
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_15v_16v_17v_18v_19v_20v_21v_22v_23price
07203265052006050519.0137.00.01.0908.0...0.105382-5.9989930.147048-1.9028470.3489902.3249613.3439104.048742-1.431822NaN
17143161836200103015.053.04.01.07515.0...0.000000-3.2872212.0813172.937052-0.1230181.2023953.570743-1.180587-1.348598NaN
2704693212291201706106.018NaN5.00.015015.0...0.0000004.3682188.252188-4.136109-13.334970-4.444620-0.706978-1.7202183.569112NaN
3624972134519820005215.0327.00.01.006.0...0.100883-2.5374860.5139554.4149620.3576852.7007325.3236026.085956-0.900585NaN
466975314282006020530.047.05.01.012215.0...0.002509-6.197633-0.191814-1.224360-0.3269852.2549314.183037-2.5740040.014203NaN
2499951055693322017000313.09NaNNaN1.05815.0...0.00144711.78250820.402576-2.7227720.462388-4.4293857.8834130.698405-1.0820131200.0
2499961467101021102003051129.0173.00.00.06115.0...0.002342-2.9882721.5005323.502201-0.761715-2.484556-2.532968-0.940266-1.1064261200.0
2499971160668280220130312124.0166.00.01.01223.0...0.100760-6.939560-1.144959-5.3379490.896026-0.592565-3.8727252.1359843.80755416500.0
249998900826597120121212111.047.05.00.01849.0...0.008251-7.222167-1.383696-5.402794-0.409451-1.891556-3.104789-3.7773743.18621831950.0
24999976453569542005111113.093.00.01.05812.5...0.00907110.491312-11.270043-0.272595-0.026478-2.168249-0.980042-0.955164-1.1695931990.0

10 rows × 40 columns

# 查看总体情况,个数,平均值,方差等
Train_data.describe()
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_14v_15v_16v_17v_18v_19v_20v_21v_22v_23
count250000.000000250000.0000002.500000e+05250000.000000250000.000000224620.000000227510.000000236487.000000250000.000000250000.000000...250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000250000.000000
mean185351.79076883153.3621722.003401e+0744.9114807.7852364.5632711.6650080.780783115.52841212.577418...0.0324890.0304080.0147250.0009150.0062730.006604-0.0013740.000609-0.0040250.001834
std107121.18876372540.7999647.770250e+0450.6400817.6940101.9125152.3396460.413717196.1418283.990632...0.0387920.0493338.7791635.7710814.8809814.1247223.8036263.5553532.8647132.323680
min1.0000000.0000001.910000e+070.0000000.0000000.0000000.0000000.0000000.0000000.500000...0.0000000.000000-10.412444-15.538236-21.009214-13.989955-9.599285-11.181255-7.671327-2.350888
25%92501.75000014500.0000001.999061e+076.0000001.0000003.0000000.0000001.00000070.00000012.500000...0.0001290.000000-5.552269-0.901181-3.150385-0.478173-1.727237-3.067073-2.092178-1.402804
50%185264.50000065314.5000002.003111e+0727.0000006.0000004.0000000.0000001.000000105.00000015.000000...0.0019610.002567-3.8217700.223181-0.0585020.038427-0.995044-0.880587-1.199807-1.145588
75%278128.500000143761.2500002.008081e+0770.00000011.0000007.0000005.0000001.000000150.00000015.000000...0.0756720.0565683.5997471.2637372.8004750.5691981.5633823.2699872.7376140.044865
max370946.000000233044.0000002.019121e+07250.00000039.0000007.0000006.0000001.00000020000.00000015.000000...0.1307850.18434036.75687826.13456123.05566016.57602720.32457214.0394228.7645978.574730

8 rows × 40 columns

Test_data.describe()
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_14v_15v_16v_17v_18v_19v_20v_21v_22v_23
count50000.00000050000.0000005.000000e+0450000.00000050000.00000044890.00000045598.00000047287.00000050000.00000050000.000000...50000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.00000050000.000000
mean556029.05338082878.2514202.003441e+0744.9228407.7794204.5562261.6811920.781081114.11606012.555210...0.0325700.030773-0.0248190.007051-0.008488-0.0301040.014609-0.0033530.013125-0.011936
std106952.40256572292.0769367.788055e+0450.5762557.6616671.9082912.3448290.413518177.2741544.034901...0.0387790.0495218.7596635.7842994.8252614.1005613.8126673.5489442.8667742.316144
min370951.0000000.0000001.910000e+070.0000000.0000000.0000000.0000000.0000000.0000000.500000...0.0000000.000000-10.196998-15.167961-21.925773-13.682825-9.282567-11.117367-6.365723-2.394516
25%463258.50000014121.2500001.999061e+076.0000001.0000003.0000000.0000001.00000069.00000012.500000...0.0001350.000000-5.575131-0.891030-3.105073-0.481952-1.697763-3.069575-2.089326-1.402958
50%556296.00000065359.0000002.003111e+0727.0000006.0000004.0000000.0000001.000000105.00000015.000000...0.0019490.002593-3.8375720.221379-0.0818360.039376-0.971210-0.877377-1.192502-1.146398
75%648862.250000143083.7500002.008091e+0770.00000011.0000007.0000005.0000001.000000150.00000015.000000...0.0758260.0620633.5312691.2576872.7845380.5600461.5725083.2769182.772742-0.010769
max741887.000000233028.0000002.019040e+07248.00000039.0000007.0000006.0000001.00000017700.00000015.000000...0.1359000.18009136.36498626.04357222.59844116.33305120.27363311.6918517.9703038.749647

8 rows × 39 columns

Train_data.shape
(250000, 40)
Test_data.shape
(50000, 39)
#查看数据信息
df.info()
#查看缺失值
df.isnull().sum()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             300000 non-null  int64  
 1   name               300000 non-null  int64  
 2   regDate            300000 non-null  int64  
 3   model              300000 non-null  float64
 4   brand              300000 non-null  int64  
 5   bodyType           269510 non-null  float64
 6   fuelType           273108 non-null  float64
 7   gearbox            283774 non-null  float64
 8   power              300000 non-null  int64  
 9   kilometer          300000 non-null  float64
 10  notRepairedDamage  241836 non-null  float64
 11  regionCode         300000 non-null  int64  
 12  seller             300000 non-null  int64  
 13  offerType          300000 non-null  int64  
 14  creatDate          300000 non-null  int64  
 15  price              250000 non-null  float64
 16  v_0                300000 non-null  float64
 17  v_1                300000 non-null  float64
 18  v_2                300000 non-null  float64
 19  v_3                300000 non-null  float64
 20  v_4                300000 non-null  float64
 21  v_5                300000 non-null  float64
 22  v_6                300000 non-null  float64
 23  v_7                300000 non-null  float64
 24  v_8                300000 non-null  float64
 25  v_9                300000 non-null  float64
 26  v_10               300000 non-null  float64
 27  v_11               300000 non-null  float64
 28  v_12               300000 non-null  float64
 29  v_13               300000 non-null  float64
 30  v_14               300000 non-null  float64
 31  v_15               300000 non-null  float64
 32  v_16               300000 non-null  float64
 33  v_17               300000 non-null  float64
 34  v_18               300000 non-null  float64
 35  v_19               300000 non-null  float64
 36  v_20               300000 non-null  float64
 37  v_21               300000 non-null  float64
 38  v_22               300000 non-null  float64
 39  v_23               300000 non-null  float64
dtypes: float64(31), int64(9)
memory usage: 91.6 MB





SaleID                   0
name                     0
regDate                  0
model                    0
brand                    0
bodyType             30490
fuelType             26892
gearbox              16226
power                    0
kilometer                0
notRepairedDamage    58164
regionCode               0
seller                   0
offerType                0
creatDate                0
price                50000
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
v_15                     0
v_16                     0
v_17                     0
v_18                     0
v_19                     0
v_20                     0
v_21                     0
v_22                     0
v_23                     0
dtype: int64
# nan可视化
missing = Train_data.isnull().sum()
missing = missing[missing > 0]
missing.sort_values(inplace=True)
missing.plot.bar()
<AxesSubplot:>




![在这里插入图片描述](https://img-blog.csdnimg.cn/20210420093945341.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2xpdV9kb25nZA==,size_16,color_FFFFFF,t_70#pic_center)

字段表如下,可以从上述输出看到,bodyType,fuelType,gearbox,otRepairedDamage这几个项存在着缺失值,缺失值的个数均在10000个以上,缺失值的个数较多,在后续处理中需要考虑是进一步填充(众数,中位数,随机森林),还是删掉。

FieldDescription
SaleID交易ID,唯一编码
name汽车交易名称,已脱敏
regDate汽车注册日期,例如20160101,2016年01月01日
model车型编码,已脱敏
brand汽车品牌,已脱敏
bodyType车身类型:豪华轿车:0,微型车:1,厢型车:2,大巴车:3,敞篷车:4,双门汽车:5,商务车:6,搅拌车:7
fuelType燃油类型:汽油:0,柴油:1,液化石油气:2,天然气:3,混合动力:4,其他:5,电动:6
gearbox变速箱:手动:0,自动:1
power发动机功率:范围 [ 0, 600 ]
kilometer汽车已行驶公里,单位万km
notRepairedDamage汽车有尚未修复的损坏:是:0,否:1
regionCode地区编码,已脱敏
seller销售方:个体:0,非个体:1
offerType报价类型:提供:0,请求:1
creatDate汽车上线时间,即开始售卖时间
price二手车交易价格(预测目标)
v系列特征匿名特征,包含v0-23在内24个匿名特征
# 查看非匿名类别特征nunique分布
cat_fea = ['name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
           'gearbox', 'kilometer', 'notRepairedDamage', 'regionCode',
           'seller', 'offerType', 'creatDate', 'power', 'price']
for fea in cat_fea:
    print(fea + "的特征分布如下:")
    print("{}特征有个{}不同的值".format(fea,df[fea].nunique()))
    print(df[fea].value_counts())
name的特征分布如下:
name特征有个193159不同的值
73        527
451       526
1791      497
821       480
243       423
         ... 
224566      1
204088      1
212284      1
210237      1
84137       1
Name: name, Length: 193159, dtype: int64
regDate的特征分布如下:
regDate特征有个7797不同的值
20000010    356
20000007    353
20000002    351
20000001    336
20000006    328
           ... 
19700511      1
19780406      1
19610603      1
19840603      1
19640202      1
Name: regDate, Length: 7797, dtype: int64
model的特征分布如下:
model特征有个251不同的值
0.0      24260
6.0      21237
4.0      16639
1.0      16440
12.0     10586
         ...  
247.0        7
226.0        7
243.0        5
249.0        4
250.0        1
Name: model, Length: 251, dtype: int64
brand的特征分布如下:
brand特征有个40不同的值
0     64396
4     32573
11    32318
10    28509
1     26534
6     20698
9     14618
5      8877
15     7825
12     5633
7      4621
3      4567
17     4275
13     4181
8      4040
28     3806
19     3095
18     2938
16     2732
22     2694
23     2485
14     2308
24     2068
25     1908
20     1903
27     1657
29     1495
34     1169
30      737
2       671
21      643
31      627
38      608
35      491
32      476
36      450
33      429
37      385
26      379
39      181
Name: brand, dtype: int64
bodyType的特征分布如下:
bodyType特征有个8不同的值
7.0    77319
3.0    64666
4.0    54789
5.0    24518
6.0    18369
2.0    15239
1.0    11862
0.0     2748
Name: bodyType, dtype: int64
fuelType的特征分布如下:
fuelType特征有个7不同的值
0.0    180709
5.0     87139
4.0      4331
3.0       458
2.0       226
1.0       170
6.0        75
Name: fuelType, dtype: int64
gearbox的特征分布如下:
gearbox特征有个2不同的值
1.0    221580
0.0     62194
Name: gearbox, dtype: int64
kilometer的特征分布如下:
kilometer特征有个13不同的值
15.0    194697
12.5     30744
10.0     12840
9.0      10101
8.0       8939
7.0       7905
6.0       7016
5.0       6150
0.5       5588
4.0       5079
3.0       4877
2.0       4553
1.0       1511
Name: kilometer, dtype: int64
notRepairedDamage的特征分布如下:
notRepairedDamage特征有个2不同的值
1.0    212477
0.0     29359
Name: notRepairedDamage, dtype: int64
regionCode的特征分布如下:
regionCode特征有个8122不同的值
487     672
868     517
149     280
539     273
32      262
       ... 
8145      1
8088      1
7989      1
7523      1
6712      1
Name: regionCode, Length: 8122, dtype: int64
seller的特征分布如下:
seller特征有个2不同的值
1    299999
0         1
Name: seller, dtype: int64
offerType的特征分布如下:
offerType特征有个2不同的值
0    299990
1        10
Name: offerType, dtype: int64
creatDate的特征分布如下:
creatDate特征有个110不同的值
20160403    11671
20160404    11387
20160320    10934
20160312    10776
20160321    10744
            ...  
20151123        1
20160114        1
20151014        1
20140310        1
20160115        1
Name: creatDate, Length: 110, dtype: int64
power的特征分布如下:
power特征有个738不同的值
0       32800
75      19385
60      12911
150     12430
140     11012
        ...  
312         1
417         1
672         1
1082        1
1653        1
Name: power, Length: 738, dtype: int64
price的特征分布如下:
price特征有个4585不同的值
0.0        7312
500.0      3815
1500.0     3587
1000.0     3149
1200.0     3071
           ... 
11140.0       1
6165.0        1
5827.0        1
89700.0       1
706.0         1
Name: price, Length: 4585, dtype: int64
#类别特征取值较少的,画出直方图,
#'seller,offerType'数值分布极不平衡
plt.figure()
plt.figure(figsize=(16, 6))
i = 1
for fea in cat_fea:
    if df[fea].nunique()<50:
        plt.subplot(2, 4, i)
        i += 1
        v = df[fea].value_counts()
        fig = sns.barplot(x=v.index, y=v.values)
        for item in fig.get_xticklabels():
            item.set_rotation(90)
        plt.title(fea)
plt.tight_layout()
plt.show()
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210420094014831.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2xpdV9kb25nZA==,size_16,color_FFFFFF,t_70#pic_center)
  • seller特征有个2不同的值分别为1(299999)和0(1)这种情况下可以考虑把该特征去掉。
  • offerType特征有个2不同的值分别为1(10)和0(299990)这种情况下可以考虑把该特征去掉。
# 绘制price的图像
plt.figure()
plt.figure(figsize=(10, 3))
plt.subplot(1, 2,1)
sns.distplot(Train_data['price'])
plt.subplot(1,2,2)
Train_data['price'].plot.box()
plt.tight_layout()

在这里插入图片描述

  • price呈现长尾分布,后续需要处理,对数变换
#'price'转化后的分布
plt.figure()
sns.distplot(np.log1p(Train_data['price']))

在这里插入图片描述

  • 转换后的price并不是一个标准的正态分布的形状,可以看到有两个峰,左边峰值较小,可以考虑进行处理。
# 处理price,去掉为0的行
df.drop(df[df['price'] == 0].index, inplace=True)
plt.figure()
sns.distplot(np.log1p(df['price']))

在这里插入图片描述

#探索匿名特征的数值特征分布
num_fea = ['v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21',
       'v_22', 'v_23']

f = pd.melt(Train_data, value_vars=num_fea)
g = sns.FacetGrid(f, col="variable",  col_wrap=3, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")

在这里插入图片描述

#选取类别特征取值较少的,观察它们与价格的均值分布
plt.figure()
plt.figure(figsize=(20, 18))
i = 1
for f in cat_fea:
    if df[f].nunique() <= 50:
        plt.subplot(5, 3, i)
        i += 1
        v = df[~df['price'].isnull()].groupby(f,as_index=False)['price'].agg({f + '_price_mean': 'mean'}).reset_index()
        fig = sns.barplot(x=f, y=f + '_price_mean', data=v)
        for item in fig.get_xticklabels():
            item.set_rotation(90)
plt.tight_layout()
plt.show()

在这里插入图片描述

# 相关性分析
num_fea.append('price')
corr1 = abs(df[df['price'].notnull()][num_fea].corr())
plt.figure(figsize=(10, 10))
sns.heatmap(corr1, linewidths=0.1, cmap=sns.cm.rocket_r)

在这里插入图片描述


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值