datawhale二手车数据挖掘项目学习task2

task2-(探索性)数据分析

  • 导入需要的包(根据需要逐步增加)
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Image
from datetime import datetime
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

Step 1:数据读取

#查看数据
#!type data\used_car_train_20200313.csv
# 是以tab分割的数据.
# 有header,直接作为列名
# 使用read_csv函数读取数据
# 分隔符参数为 tab 
train = pd.read_csv(r'data\used_car_train_20200313.csv', sep=' ')
test_a = pd.read_csv(r'data\used_car_testA_20200313.csv', sep=' ')

1) 数据浏览

# 检测读取结果
train.shape,train.columns,test_a.shape,test_a.columns,
((150000, 31),
 Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
        'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
        'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
        'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
        'v_13', 'v_14'],
       dtype='object'),
 (50000, 30),
 Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
        'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
        'seller', 'offerType', 'creatDate', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
        'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13',
        'v_14'],
       dtype='object'))
[x for x in train.columns if x not in test_a.columns]
# 测试数据没有price列,其他列名都一样
['price']
# 浏览读取到的数据的形式
train.head().T
01234
SaleID01234
name73622621487471865111080
regDate2004040220030301200404031996090820120103
model3040115109110
brand6115105
bodyType12101
fuelType00000
gearbox00010
power60016319368
kilometer12.51512.5155
notRepairedDamage0.0-0.00.00.0
regionCode1046436628064346977
seller00000
offerType00000
creatDate2016040420160309201604022016031220160313
price18503600622224005200
v_043.357845.305345.978445.687544.3835
v_13.966345.236114.823794.492572.03143
v_20.05025710.1379251.31952-0.05061580.572169
v_32.159741.38066-0.9984670.8836-1.57124
v_41.14379-1.42216-0.996911-2.228082.24609
v_50.2356760.2647770.251410.2742930.228036
v_60.1019880.1210040.1149120.11030.0732051
v_70.1295490.1357310.1651470.1219640.0918805
v_80.02281640.02659740.06217280.03339450.0788194
v_90.09746180.02058170.027074800.121534
v_10-2.8818-4.90048-4.84675-4.5096-1.89624
v_112.80412.096341.803561.285940.910783
v_12-2.42082-1.030481.56533-0.5018680.93111
v_130.795292-1.72267-0.832687-2.438352.83452
v_140.9147620.245522-0.229963-0.4786991.92348

2) 数据信息查看

# 通过 .info() 简要可以看到对应一些数据列名,以及NAN缺失信息
train.info()
# 有缺失值的列: bodyType             145494 non-null float64
#              fuelType             141320 non-null float64
#              gearbox              144019 non-null float64
# 此外,通过上边的浏览发现,有些列明明有缺失值(例如以 - 表示的notRepairedDamage),但info()结果没有
# 需要统计各列的唯一值情况,将明显属于缺失值的非标准缺失值表示法统一,或替换为 NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
SaleID               150000 non-null int64
name                 150000 non-null int64
regDate              150000 non-null int64
model                149999 non-null float64
brand                150000 non-null int64
bodyType             145494 non-null float64
fuelType             141320 non-null float64
gearbox              144019 non-null float64
power                150000 non-null int64
kilometer            150000 non-null float64
notRepairedDamage    150000 non-null object
regionCode           150000 non-null int64
seller               150000 non-null int64
offerType            150000 non-null int64
creatDate            150000 non-null int64
price                150000 non-null int64
v_0                  150000 non-null float64
v_1                  150000 non-null float64
v_2                  150000 non-null float64
v_3                  150000 non-null float64
v_4                  150000 non-null float64
v_5                  150000 non-null float64
v_6                  150000 non-null float64
v_7                  150000 non-null float64
v_8                  150000 non-null float64
v_9                  150000 non-null float64
v_10                 150000 non-null float64
v_11                 150000 non-null float64
v_12                 150000 non-null float64
v_13                 150000 non-null float64
v_14                 150000 non-null float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
#测试集
test_a.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
SaleID               50000 non-null int64
name                 50000 non-null int64
regDate              50000 non-null int64
model                50000 non-null float64
brand                50000 non-null int64
bodyType             48587 non-null float64
fuelType             47107 non-null float64
gearbox              48090 non-null float64
power                50000 non-null int64
kilometer            50000 non-null float64
notRepairedDamage    50000 non-null object
regionCode           50000 non-null int64
seller               50000 non-null int64
offerType            50000 non-null int64
creatDate            50000 non-null int64
v_0                  50000 non-null float64
v_1                  50000 non-null float64
v_2                  50000 non-null float64
v_3                  50000 non-null float64
v_4                  50000 non-null float64
v_5                  50000 non-null float64
v_6                  50000 non-null float64
v_7                  50000 non-null float64
v_8                  50000 non-null float64
v_9                  50000 non-null float64
v_10                 50000 non-null float64
v_11                 50000 non-null float64
v_12                 50000 non-null float64
v_13                 50000 non-null float64
v_14                 50000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
train.describe().T
# 查看描述信息
countmeanstdmin25%50%75%max
SaleID150000.07.499950e+0443301.4145270.000000e+003.749975e+047.499950e+041.124992e+051.499990e+05
name150000.06.834917e+0461103.8750950.000000e+001.115600e+045.163800e+041.188412e+051.968120e+05
regDate150000.02.003417e+0753649.8792551.991000e+071.999091e+072.003091e+072.007111e+072.015121e+07
model149999.04.712902e+0149.5360400.000000e+001.000000e+013.000000e+016.600000e+012.470000e+02
brand150000.08.052733e+007.8649560.000000e+001.000000e+006.000000e+001.300000e+013.900000e+01
bodyType145494.01.792369e+001.7606400.000000e+000.000000e+001.000000e+003.000000e+007.000000e+00
fuelType141320.03.758421e-010.5486770.000000e+000.000000e+000.000000e+001.000000e+006.000000e+00
gearbox144019.02.249425e-010.4175460.000000e+000.000000e+000.000000e+000.000000e+001.000000e+00
power150000.01.193165e+02177.1684190.000000e+007.500000e+011.100000e+021.500000e+021.931200e+04
kilometer150000.01.259716e+013.9195765.000000e-011.250000e+011.500000e+011.500000e+011.500000e+01
regionCode150000.02.583077e+031885.3632180.000000e+001.018000e+032.196000e+033.843000e+038.120000e+03
seller150000.06.666667e-060.0025820.000000e+000.000000e+000.000000e+000.000000e+001.000000e+00
offerType150000.00.000000e+000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
creatDate150000.02.016033e+07106.7328092.015062e+072.016031e+072.016032e+072.016033e+072.016041e+07
price150000.05.923327e+037501.9984771.100000e+011.300000e+033.250000e+037.700000e+039.999900e+04
v_0150000.04.440627e+012.4575483.045198e+014.313580e+014.461027e+014.600472e+015.230418e+01
v_1150000.0-4.480912e-023.641893-4.295589e+00-3.192349e+00-3.052671e+004.000670e+007.320308e+00
v_2150000.08.076506e-022.929618-4.470671e+00-9.706712e-01-3.829469e-012.413349e-011.903550e+01
v_3150000.07.883342e-022.026514-7.275037e+00-1.462580e+009.972198e-021.565838e+009.854702e+00
v_4150000.01.787461e-021.193661-4.364565e+00-9.211915e-01-7.591043e-028.687584e-016.829352e+00
v_5150000.02.482035e-010.0458040.000000e+002.436154e-012.577980e-012.652973e-012.918381e-01
v_6150000.04.492300e-020.0517430.000000e+003.811100e-058.120586e-041.020093e-011.514196e-01
v_7150000.01.246925e-010.2014100.000000e+006.247353e-029.586590e-021.252429e-011.404936e+00
v_8150000.05.814385e-020.0291860.000000e+003.533369e-025.701360e-027.938157e-021.607910e-01
v_9150000.06.199589e-020.0356920.000000e+003.393018e-025.848367e-028.749055e-022.227875e-01
v_10150000.0-1.000239e-033.772386-9.168192e+00-3.722303e+001.624076e+002.844357e+001.235701e+01
v_11150000.09.034543e-033.286071-5.558207e+00-1.951543e+00-3.580527e-011.255022e+001.881904e+01
v_12150000.04.812595e-032.517478-9.639552e+00-1.871846e+00-1.307533e-011.776933e+001.384779e+01
v_13150000.03.126119e-041.288988-4.153899e+00-1.057789e+00-3.624460e-029.428131e-011.114767e+01
v_14150000.0-6.882314e-041.038685-6.546556e+00-4.370337e-011.412460e-016.803781e-018.658418e+00
#测试集
test_a.describe().T
countmeanstdmin25%50%75%max
SaleID50000.01.749995e+0514433.9010671.500000e+051.624998e+051.749995e+051.874992e+051.999990e+05
name50000.06.854222e+0461052.8081330.000000e+001.120350e+045.224850e+041.188565e+051.968050e+05
regDate50000.02.003393e+0753688.6968521.991000e+071.999091e+072.003091e+072.007110e+072.015121e+07
model50000.04.684452e+0149.4695480.000000e+001.000000e+012.900000e+016.500000e+012.460000e+02
brand50000.08.056240e+007.8194770.000000e+001.000000e+006.000000e+001.300000e+013.900000e+01
bodyType48587.01.782185e+001.7607360.000000e+000.000000e+001.000000e+003.000000e+007.000000e+00
fuelType47107.03.734052e-010.5464420.000000e+000.000000e+000.000000e+001.000000e+006.000000e+00
gearbox48090.02.243502e-010.4171580.000000e+000.000000e+000.000000e+000.000000e+001.000000e+00
power50000.01.198836e+02185.0973870.000000e+007.500000e+011.090000e+021.500000e+022.000000e+04
kilometer50000.01.259558e+013.9089795.000000e-011.250000e+011.500000e+011.500000e+011.500000e+01
regionCode50000.02.590605e+031876.9702630.000000e+001.030000e+032.219000e+033.857000e+038.121000e+03
seller50000.00.000000e+000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
offerType50000.00.000000e+000.0000000.000000e+000.000000e+000.000000e+000.000000e+000.000000e+00
creatDate50000.02.016033e+0779.5152062.015061e+072.016031e+072.016032e+072.016033e+072.016041e+07
v_050000.04.441823e+012.4299502.898702e+014.313962e+014.461108e+014.599264e+015.175168e+01
v_150000.0-3.723779e-023.642562-4.137733e+00-3.191909e+00-3.050756e+003.997323e+007.553517e+00
v_250000.05.053403e-022.856341-4.205728e+00-9.712663e-01-3.881169e-012.405479e-011.839457e+01
v_350000.08.463990e-022.026510-5.638184e+00-1.453453e+009.788124e-021.562700e+009.381599e+00
v_450000.01.500124e-021.193026-4.287718e+00-9.280888e-01-7.022500e-028.637309e-015.270150e+00
v_550000.02.486691e-010.0446010.000000e+002.437621e-012.578770e-012.653276e-012.916184e-01
v_650000.04.502089e-020.0517660.000000e+004.439681e-058.147688e-041.020253e-011.532654e-01
v_750000.01.227444e-010.1959720.000000e+006.264359e-029.582813e-021.254379e-011.358813e+00
v_850000.05.799740e-020.0292110.000000e+003.508446e-025.708391e-027.907748e-021.563546e-01
v_950000.06.199954e-020.0356530.000000e+003.371386e-025.876382e-028.748899e-022.147753e-01
v_1050000.0-1.785456e-023.747985-9.160049e+00-3.700121e+001.613212e+002.832708e+001.233887e+01
v_1150000.0-1.374166e-023.231258-5.411964e+00-1.971325e+00-3.558430e-011.262914e+001.885622e+01
v_1250000.0-1.355400e-022.515962-8.916949e+00-1.876703e+00-1.427786e-011.764335e+001.295050e+01
v_1350000.0-3.146596e-031.286597-4.123333e+00-1.060428e+00-3.595595e-029.414688e-015.913273e+00
v_1450000.01.516081e-031.027360-6.112667e+00-4.379205e-011.387993e-016.811630e-012.624622e+00
for col in train.columns[:16]:
    print(train[col].value_counts())
2047      1
113949    1
15661     1
13612     1
3371      1
         ..
8913      1
10960     1
53967     1
56014     1
0         1
Name: SaleID, Length: 150000, dtype: int64
708       282
387       282
55        280
1541      263
203       233
         ... 
5074        1
7123        1
11221       1
13270       1
174485      1
Name: name, Length: 99662, dtype: int64
20000008    180
20000011    158
20000004    157
20000010    157
20000002    155
           ... 
19910807      1
19910902      1
20151209      1
19911011      1
20151201      1
Name: regDate, Length: 3894, dtype: int64
0.0      11762
19.0      9573
4.0       8445
1.0       6038
29.0      5186
         ...  
245.0        2
209.0        2
240.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64
0     31480
4     16737
14    16089
10    14249
1     13794
6     10217
9      7306
5      4665
13     3817
11     2945
3      2461
7      2361
16     2223
8      2077
25     2064
27     2053
21     1547
15     1458
19     1388
20     1236
12     1109
22     1085
26      966
30      940
17      913
24      772
28      649
32      592
29      406
37      333
2       321
31      318
18      316
36      228
34      227
33      218
23      186
35      180
38       65
39        9
Name: brand, dtype: int64
0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64
0.0    91656
1.0    46991
2.0     2212
3.0      262
4.0      118
5.0       45
6.0       36
Name: fuelType, dtype: int64
0.0    111623
1.0     32396
Name: gearbox, dtype: int64
75.0      9593
150.0     6495
60.0      6374
140.0     5963
101.0     5537
          ... 
1401.0       1
1164.0       1
1162.0       1
470.0        1
1149.0       1
Name: power, Length: 565, dtype: int64
15.0    96877
12.5    15722
10.0     6459
9.0      5257
8.0      4573
7.0      4084
6.0      3725
5.0      3144
4.0      2718
3.0      2501
2.0      2354
0.5      1840
1.0       746
Name: kilometer, dtype: int64
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
419     369
764     258
125     137
176     136
462     134
       ... 
6414      1
7063      1
4239      1
5931      1
7267      1
Name: regionCode, Length: 7905, dtype: int64
0    149999
1         1
Name: seller, dtype: int64
0    150000
Name: offerType, dtype: int64
20160403    5848
20160404    5606
20160320    5485
20160312    5383
20160402    5382
            ... 
20151227       1
20151217       1
20160131       1
20160130       1
20160115       1
Name: creatDate, Length: 96, dtype: int64
500      2337
1500     2158
1200     1922
1000     1850
2500     1821
         ... 
25321       1
8886        1
8801        1
37920       1
8188        1
Name: price, Length: 3763, dtype: int64
for col in test_a.columns[:13]:
    print(test_a[col].value_counts())
198655    1
167227    1
171353    1
169304    1
191831    1
         ..
187054    1
180909    1
182956    1
193195    1
196608    1
Name: SaleID, Length: 50000, dtype: int64
55       97
708      96
387      95
1541     88
713      74
         ..
22270     1
89855     1
42752     1
48899     1
11808     1
Name: name, Length: 37453, dtype: int64
20000004    60
20000011    56
20000012    54
20000009    53
20000005    50
            ..
19921208     1
20120007     1
19920804     1
19930905     1
19920801     1
Name: regDate, Length: 3835, dtype: int64
0.0      3896
19.0     3245
4.0      3007
1.0      1981
29.0     1742
         ... 
242.0       1
240.0       1
244.0       1
243.0       1
246.0       1
Name: model, Length: 247, dtype: int64
0     10348
4      5763
14     5314
10     4766
1      4532
6      3502
9      2423
5      1569
13     1245
11      919
7       795
3       773
16      771
8       704
25      695
27      650
21      544
15      511
20      450
19      450
12      389
22      363
30      324
17      317
26      303
24      268
28      225
32      193
29      117
31      115
18      106
2       104
37       92
34       77
33       76
36       67
23       62
35       53
38       23
39        2
Name: brand, dtype: int64
0.0    13985
1.0    11882
2.0     9900
3.0     4433
4.0     3303
5.0     2537
6.0     2116
7.0      431
Name: bodyType, dtype: int64
0.0    30656
1.0    15544
2.0      774
3.0       72
4.0       37
6.0       14
5.0       10
Name: fuelType, dtype: int64
0.0    37301
1.0    10789
Name: gearbox, dtype: int64
0       4195
75      3226
150     2180
60      2164
140     1949
        ... 
1000       1
392        1
296        1
375        1
6045       1
Name: power, Length: 445, dtype: int64
15.0    32189
12.5     5346
10.0     2106
9.0      1791
8.0      1523
7.0      1408
6.0      1218
5.0      1093
4.0       898
3.0       849
2.0       717
0.5       616
1.0       246
Name: kilometer, dtype: int64
0.0    37249
-       8031
1.0     4720
Name: notRepairedDamage, dtype: int64
419     146
764      78
188      52
125      51
759      51
       ... 
7753      1
7463      1
7230      1
826       1
112       1
Name: regionCode, Length: 6971, dtype: int64
0    50000
Name: seller, dtype: int64

进行变量划分

  • 通过观察上述统计信息发现:

    • v0-v14这15个匿名变量的不同取值数量基本等同于样本数量,可以视为连续型变量
      非匿名变量里的变量类型划分
    • 01.SaleID-具有唯一性的索引变量
  • 连续型变量:

    • 02.regDate-汽车注册日期:用整数表示的日期,观察发现有一些非法的日期表示(如20000008)需先处理为合法日期,再用creatDatae去减,得到交易时的使用时长
    • 04.power-发动机功率:取值范围[0,600],功率为0的怎么理解?把0当作缺失值?
    • 05.createdDate-汽车开始售卖时间:用整数表示的日期
  • 离散型数值变量:

    • 03.kilometer-已行驶里程(万公里):取值范围[0.5,15],可视为数值型离散变量
  • 分类变量

    • 06.name-汽车交易名称:绝大多数(超过75%)情况下具有唯一性,
    • 07.brand-汽车品牌:分类变量
    • 08.model-车型编码:
    • 09.bodyType-车身类型:分类变量,七种不同取值
    • 10.fuelType-燃油类型:分类变量七种不同取值
    • 11.gearbox-变速箱:分类变量,两种取值
    • 12.notRepairedDamage-是否尚有未修复的损坏:分类变量,正常值为0或1.取值为"-"的有 24324 个,应视为缺失值.
    • 13.regionCode-地区编码: 7905种不同取值,分类变量.
    • 14.seller-销售方:分类变量,两种取值,个体或非个体.
    • 15.offerType-报价类型:

price-需要预测的连续型变量

# 借助baseline的字段说明
Image('1584102211.png')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gBLQC79o-1585055047038)(output_18_0.png)]

train.bodyType.dtype
dtype('float64')

# 统计
变量名称,值域(唯一值数量),

3) 数据清洗

  • 根据上述对各个字段的理解,进行缺失值替换等数据清洗工作.
# 替换 notRepairedDamage 的 '-'
train.notRepairedDamage.replace('-', np.nan, inplace=True)
# 替换 power 列的0值为缺失值
train.power.replace(0, np.nan, inplace=True)
# 将 regDate字段用当前日期减,得到注册日期距今的时长(月份)
train.regDate.sort_values()
103842    19910001
56687     19910001
27475     19910001
68835     19910001
81940     19910001
            ...   
7698      20151210
80851     20151211
133533    20151211
12482     20151211
12617     20151212
Name: regDate, Length: 150000, dtype: int64
train.regDate.value_counts()
20000008    180
20000011    158
20000004    157
20000010    157
20000002    155
           ... 
19910807      1
19910902      1
20151209      1
19911011      1
20151201      1
Name: regDate, Length: 3894, dtype: int64
train.creatDate.value_counts()
20160403    5848
20160404    5606
20160320    5485
20160312    5383
20160402    5382
            ... 
20151227       1
20151217       1
20160131       1
20160130       1
20160115       1
Name: creatDate, Length: 96, dtype: int64
# 定义一个函数做日期转换,对非法的日期格式进行处理
def todate(strdate):
    try:
        strdate=datetime.strptime(str(strdate),'%Y%m%d').date()
        """将合法的整数型日期格式转换为日期"""
    except ValueError:
        strdate=datetime.strptime(str(strdate)[:4],'%Y').date()
        """统一将非法的日期格式转换为该年第一天"""
    return strdate
todate(20150024)
datetime.date(2015, 1, 1)
#测试成功
train.regDate.apply(lambda x:todate(x))
0         2004-04-02
1         2003-03-01
2         2004-04-03
3         1996-09-08
4         2012-01-03
             ...    
149995    2000-06-07
149996    2009-11-02
149997    2010-10-03
149998    2006-03-12
149999    1999-02-04
Name: regDate, Length: 150000, dtype: object
# 将 regDate 和 creatDate 使用上述转换函数进行替换
train.regDate=train.regDate.apply(lambda x:todate(x))
train.creatDate=train.creatDate.apply(lambda x:todate(x))
# 增加计算列:creatDate 减去 regDate
train['usedDay']=train.creatDate-train.regDate
# 得到一个连续变量,表示车辆使用天数
train['usedDay'].value_counts()
5933 days    115
5911 days    113
5932 days     99
5918 days     99
5938 days     95
            ... 
276 days       1
459 days       1
8772 days      1
8562 days      1
8924 days      1
Name: usedDay, Length: 9054, dtype: int64

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值