task2-(探索性)数据分析
- 导入需要的包(根据需要逐步增加)
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Image
from datetime import datetime
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time
warnings.filterwarnings('ignore')
%matplotlib inline
## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA
import lightgbm as lgb
import xgboost as xgb
## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
Step 1:数据读取
#查看数据
#!type data\used_car_train_20200313.csv
# 是以tab分割的数据.
# 有header,直接作为列名
# 使用read_csv函数读取数据
# 分隔符参数为 tab
train = pd.read_csv(r'data\used_car_train_20200313.csv', sep=' ')
test_a = pd.read_csv(r'data\used_car_testA_20200313.csv', sep=' ')
1) 数据浏览
# 检测读取结果
train.shape,train.columns,test_a.shape,test_a.columns,
((150000, 31),
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14'],
dtype='object'),
(50000, 30),
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
'seller', 'offerType', 'creatDate', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4',
'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13',
'v_14'],
dtype='object'))
[x for x in train.columns if x not in test_a.columns]
# 测试数据没有price列,其他列名都一样
['price']
# 浏览读取到的数据的形式
train.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
SaleID | 0 | 1 | 2 | 3 | 4 |
name | 736 | 2262 | 14874 | 71865 | 111080 |
regDate | 20040402 | 20030301 | 20040403 | 19960908 | 20120103 |
model | 30 | 40 | 115 | 109 | 110 |
brand | 6 | 1 | 15 | 10 | 5 |
bodyType | 1 | 2 | 1 | 0 | 1 |
fuelType | 0 | 0 | 0 | 0 | 0 |
gearbox | 0 | 0 | 0 | 1 | 0 |
power | 60 | 0 | 163 | 193 | 68 |
kilometer | 12.5 | 15 | 12.5 | 15 | 5 |
notRepairedDamage | 0.0 | - | 0.0 | 0.0 | 0.0 |
regionCode | 1046 | 4366 | 2806 | 434 | 6977 |
seller | 0 | 0 | 0 | 0 | 0 |
offerType | 0 | 0 | 0 | 0 | 0 |
creatDate | 20160404 | 20160309 | 20160402 | 20160312 | 20160313 |
price | 1850 | 3600 | 6222 | 2400 | 5200 |
v_0 | 43.3578 | 45.3053 | 45.9784 | 45.6875 | 44.3835 |
v_1 | 3.96634 | 5.23611 | 4.82379 | 4.49257 | 2.03143 |
v_2 | 0.0502571 | 0.137925 | 1.31952 | -0.0506158 | 0.572169 |
v_3 | 2.15974 | 1.38066 | -0.998467 | 0.8836 | -1.57124 |
v_4 | 1.14379 | -1.42216 | -0.996911 | -2.22808 | 2.24609 |
v_5 | 0.235676 | 0.264777 | 0.25141 | 0.274293 | 0.228036 |
v_6 | 0.101988 | 0.121004 | 0.114912 | 0.1103 | 0.0732051 |
v_7 | 0.129549 | 0.135731 | 0.165147 | 0.121964 | 0.0918805 |
v_8 | 0.0228164 | 0.0265974 | 0.0621728 | 0.0333945 | 0.0788194 |
v_9 | 0.0974618 | 0.0205817 | 0.0270748 | 0 | 0.121534 |
v_10 | -2.8818 | -4.90048 | -4.84675 | -4.5096 | -1.89624 |
v_11 | 2.8041 | 2.09634 | 1.80356 | 1.28594 | 0.910783 |
v_12 | -2.42082 | -1.03048 | 1.56533 | -0.501868 | 0.93111 |
v_13 | 0.795292 | -1.72267 | -0.832687 | -2.43835 | 2.83452 |
v_14 | 0.914762 | 0.245522 | -0.229963 | -0.478699 | 1.92348 |
2) 数据信息查看
# 通过 .info() 简要可以看到对应一些数据列名,以及NAN缺失信息
train.info()
# 有缺失值的列: bodyType 145494 non-null float64
# fuelType 141320 non-null float64
# gearbox 144019 non-null float64
# 此外,通过上边的浏览发现,有些列明明有缺失值(例如以 - 表示的notRepairedDamage),但info()结果没有
# 需要统计各列的唯一值情况,将明显属于缺失值的非标准缺失值表示法统一,或替换为 NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
SaleID 150000 non-null int64
name 150000 non-null int64
regDate 150000 non-null int64
model 149999 non-null float64
brand 150000 non-null int64
bodyType 145494 non-null float64
fuelType 141320 non-null float64
gearbox 144019 non-null float64
power 150000 non-null int64
kilometer 150000 non-null float64
notRepairedDamage 150000 non-null object
regionCode 150000 non-null int64
seller 150000 non-null int64
offerType 150000 non-null int64
creatDate 150000 non-null int64
price 150000 non-null int64
v_0 150000 non-null float64
v_1 150000 non-null float64
v_2 150000 non-null float64
v_3 150000 non-null float64
v_4 150000 non-null float64
v_5 150000 non-null float64
v_6 150000 non-null float64
v_7 150000 non-null float64
v_8 150000 non-null float64
v_9 150000 non-null float64
v_10 150000 non-null float64
v_11 150000 non-null float64
v_12 150000 non-null float64
v_13 150000 non-null float64
v_14 150000 non-null float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
#测试集
test_a.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
SaleID 50000 non-null int64
name 50000 non-null int64
regDate 50000 non-null int64
model 50000 non-null float64
brand 50000 non-null int64
bodyType 48587 non-null float64
fuelType 47107 non-null float64
gearbox 48090 non-null float64
power 50000 non-null int64
kilometer 50000 non-null float64
notRepairedDamage 50000 non-null object
regionCode 50000 non-null int64
seller 50000 non-null int64
offerType 50000 non-null int64
creatDate 50000 non-null int64
v_0 50000 non-null float64
v_1 50000 non-null float64
v_2 50000 non-null float64
v_3 50000 non-null float64
v_4 50000 non-null float64
v_5 50000 non-null float64
v_6 50000 non-null float64
v_7 50000 non-null float64
v_8 50000 non-null float64
v_9 50000 non-null float64
v_10 50000 non-null float64
v_11 50000 non-null float64
v_12 50000 non-null float64
v_13 50000 non-null float64
v_14 50000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
train.describe().T
# 查看描述信息
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
SaleID | 150000.0 | 7.499950e+04 | 43301.414527 | 0.000000e+00 | 3.749975e+04 | 7.499950e+04 | 1.124992e+05 | 1.499990e+05 |
name | 150000.0 | 6.834917e+04 | 61103.875095 | 0.000000e+00 | 1.115600e+04 | 5.163800e+04 | 1.188412e+05 | 1.968120e+05 |
regDate | 150000.0 | 2.003417e+07 | 53649.879255 | 1.991000e+07 | 1.999091e+07 | 2.003091e+07 | 2.007111e+07 | 2.015121e+07 |
model | 149999.0 | 4.712902e+01 | 49.536040 | 0.000000e+00 | 1.000000e+01 | 3.000000e+01 | 6.600000e+01 | 2.470000e+02 |
brand | 150000.0 | 8.052733e+00 | 7.864956 | 0.000000e+00 | 1.000000e+00 | 6.000000e+00 | 1.300000e+01 | 3.900000e+01 |
bodyType | 145494.0 | 1.792369e+00 | 1.760640 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 3.000000e+00 | 7.000000e+00 |
fuelType | 141320.0 | 3.758421e-01 | 0.548677 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 6.000000e+00 |
gearbox | 144019.0 | 2.249425e-01 | 0.417546 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
power | 150000.0 | 1.193165e+02 | 177.168419 | 0.000000e+00 | 7.500000e+01 | 1.100000e+02 | 1.500000e+02 | 1.931200e+04 |
kilometer | 150000.0 | 1.259716e+01 | 3.919576 | 5.000000e-01 | 1.250000e+01 | 1.500000e+01 | 1.500000e+01 | 1.500000e+01 |
regionCode | 150000.0 | 2.583077e+03 | 1885.363218 | 0.000000e+00 | 1.018000e+03 | 2.196000e+03 | 3.843000e+03 | 8.120000e+03 |
seller | 150000.0 | 6.666667e-06 | 0.002582 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
offerType | 150000.0 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
creatDate | 150000.0 | 2.016033e+07 | 106.732809 | 2.015062e+07 | 2.016031e+07 | 2.016032e+07 | 2.016033e+07 | 2.016041e+07 |
price | 150000.0 | 5.923327e+03 | 7501.998477 | 1.100000e+01 | 1.300000e+03 | 3.250000e+03 | 7.700000e+03 | 9.999900e+04 |
v_0 | 150000.0 | 4.440627e+01 | 2.457548 | 3.045198e+01 | 4.313580e+01 | 4.461027e+01 | 4.600472e+01 | 5.230418e+01 |
v_1 | 150000.0 | -4.480912e-02 | 3.641893 | -4.295589e+00 | -3.192349e+00 | -3.052671e+00 | 4.000670e+00 | 7.320308e+00 |
v_2 | 150000.0 | 8.076506e-02 | 2.929618 | -4.470671e+00 | -9.706712e-01 | -3.829469e-01 | 2.413349e-01 | 1.903550e+01 |
v_3 | 150000.0 | 7.883342e-02 | 2.026514 | -7.275037e+00 | -1.462580e+00 | 9.972198e-02 | 1.565838e+00 | 9.854702e+00 |
v_4 | 150000.0 | 1.787461e-02 | 1.193661 | -4.364565e+00 | -9.211915e-01 | -7.591043e-02 | 8.687584e-01 | 6.829352e+00 |
v_5 | 150000.0 | 2.482035e-01 | 0.045804 | 0.000000e+00 | 2.436154e-01 | 2.577980e-01 | 2.652973e-01 | 2.918381e-01 |
v_6 | 150000.0 | 4.492300e-02 | 0.051743 | 0.000000e+00 | 3.811100e-05 | 8.120586e-04 | 1.020093e-01 | 1.514196e-01 |
v_7 | 150000.0 | 1.246925e-01 | 0.201410 | 0.000000e+00 | 6.247353e-02 | 9.586590e-02 | 1.252429e-01 | 1.404936e+00 |
v_8 | 150000.0 | 5.814385e-02 | 0.029186 | 0.000000e+00 | 3.533369e-02 | 5.701360e-02 | 7.938157e-02 | 1.607910e-01 |
v_9 | 150000.0 | 6.199589e-02 | 0.035692 | 0.000000e+00 | 3.393018e-02 | 5.848367e-02 | 8.749055e-02 | 2.227875e-01 |
v_10 | 150000.0 | -1.000239e-03 | 3.772386 | -9.168192e+00 | -3.722303e+00 | 1.624076e+00 | 2.844357e+00 | 1.235701e+01 |
v_11 | 150000.0 | 9.034543e-03 | 3.286071 | -5.558207e+00 | -1.951543e+00 | -3.580527e-01 | 1.255022e+00 | 1.881904e+01 |
v_12 | 150000.0 | 4.812595e-03 | 2.517478 | -9.639552e+00 | -1.871846e+00 | -1.307533e-01 | 1.776933e+00 | 1.384779e+01 |
v_13 | 150000.0 | 3.126119e-04 | 1.288988 | -4.153899e+00 | -1.057789e+00 | -3.624460e-02 | 9.428131e-01 | 1.114767e+01 |
v_14 | 150000.0 | -6.882314e-04 | 1.038685 | -6.546556e+00 | -4.370337e-01 | 1.412460e-01 | 6.803781e-01 | 8.658418e+00 |
#测试集
test_a.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
SaleID | 50000.0 | 1.749995e+05 | 14433.901067 | 1.500000e+05 | 1.624998e+05 | 1.749995e+05 | 1.874992e+05 | 1.999990e+05 |
name | 50000.0 | 6.854222e+04 | 61052.808133 | 0.000000e+00 | 1.120350e+04 | 5.224850e+04 | 1.188565e+05 | 1.968050e+05 |
regDate | 50000.0 | 2.003393e+07 | 53688.696852 | 1.991000e+07 | 1.999091e+07 | 2.003091e+07 | 2.007110e+07 | 2.015121e+07 |
model | 50000.0 | 4.684452e+01 | 49.469548 | 0.000000e+00 | 1.000000e+01 | 2.900000e+01 | 6.500000e+01 | 2.460000e+02 |
brand | 50000.0 | 8.056240e+00 | 7.819477 | 0.000000e+00 | 1.000000e+00 | 6.000000e+00 | 1.300000e+01 | 3.900000e+01 |
bodyType | 48587.0 | 1.782185e+00 | 1.760736 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 3.000000e+00 | 7.000000e+00 |
fuelType | 47107.0 | 3.734052e-01 | 0.546442 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 | 6.000000e+00 |
gearbox | 48090.0 | 2.243502e-01 | 0.417158 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 1.000000e+00 |
power | 50000.0 | 1.198836e+02 | 185.097387 | 0.000000e+00 | 7.500000e+01 | 1.090000e+02 | 1.500000e+02 | 2.000000e+04 |
kilometer | 50000.0 | 1.259558e+01 | 3.908979 | 5.000000e-01 | 1.250000e+01 | 1.500000e+01 | 1.500000e+01 | 1.500000e+01 |
regionCode | 50000.0 | 2.590605e+03 | 1876.970263 | 0.000000e+00 | 1.030000e+03 | 2.219000e+03 | 3.857000e+03 | 8.121000e+03 |
seller | 50000.0 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
offerType | 50000.0 | 0.000000e+00 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
creatDate | 50000.0 | 2.016033e+07 | 79.515206 | 2.015061e+07 | 2.016031e+07 | 2.016032e+07 | 2.016033e+07 | 2.016041e+07 |
v_0 | 50000.0 | 4.441823e+01 | 2.429950 | 2.898702e+01 | 4.313962e+01 | 4.461108e+01 | 4.599264e+01 | 5.175168e+01 |
v_1 | 50000.0 | -3.723779e-02 | 3.642562 | -4.137733e+00 | -3.191909e+00 | -3.050756e+00 | 3.997323e+00 | 7.553517e+00 |
v_2 | 50000.0 | 5.053403e-02 | 2.856341 | -4.205728e+00 | -9.712663e-01 | -3.881169e-01 | 2.405479e-01 | 1.839457e+01 |
v_3 | 50000.0 | 8.463990e-02 | 2.026510 | -5.638184e+00 | -1.453453e+00 | 9.788124e-02 | 1.562700e+00 | 9.381599e+00 |
v_4 | 50000.0 | 1.500124e-02 | 1.193026 | -4.287718e+00 | -9.280888e-01 | -7.022500e-02 | 8.637309e-01 | 5.270150e+00 |
v_5 | 50000.0 | 2.486691e-01 | 0.044601 | 0.000000e+00 | 2.437621e-01 | 2.578770e-01 | 2.653276e-01 | 2.916184e-01 |
v_6 | 50000.0 | 4.502089e-02 | 0.051766 | 0.000000e+00 | 4.439681e-05 | 8.147688e-04 | 1.020253e-01 | 1.532654e-01 |
v_7 | 50000.0 | 1.227444e-01 | 0.195972 | 0.000000e+00 | 6.264359e-02 | 9.582813e-02 | 1.254379e-01 | 1.358813e+00 |
v_8 | 50000.0 | 5.799740e-02 | 0.029211 | 0.000000e+00 | 3.508446e-02 | 5.708391e-02 | 7.907748e-02 | 1.563546e-01 |
v_9 | 50000.0 | 6.199954e-02 | 0.035653 | 0.000000e+00 | 3.371386e-02 | 5.876382e-02 | 8.748899e-02 | 2.147753e-01 |
v_10 | 50000.0 | -1.785456e-02 | 3.747985 | -9.160049e+00 | -3.700121e+00 | 1.613212e+00 | 2.832708e+00 | 1.233887e+01 |
v_11 | 50000.0 | -1.374166e-02 | 3.231258 | -5.411964e+00 | -1.971325e+00 | -3.558430e-01 | 1.262914e+00 | 1.885622e+01 |
v_12 | 50000.0 | -1.355400e-02 | 2.515962 | -8.916949e+00 | -1.876703e+00 | -1.427786e-01 | 1.764335e+00 | 1.295050e+01 |
v_13 | 50000.0 | -3.146596e-03 | 1.286597 | -4.123333e+00 | -1.060428e+00 | -3.595595e-02 | 9.414688e-01 | 5.913273e+00 |
v_14 | 50000.0 | 1.516081e-03 | 1.027360 | -6.112667e+00 | -4.379205e-01 | 1.387993e-01 | 6.811630e-01 | 2.624622e+00 |
for col in train.columns[:16]:
print(train[col].value_counts())
2047 1
113949 1
15661 1
13612 1
3371 1
..
8913 1
10960 1
53967 1
56014 1
0 1
Name: SaleID, Length: 150000, dtype: int64
708 282
387 282
55 280
1541 263
203 233
...
5074 1
7123 1
11221 1
13270 1
174485 1
Name: name, Length: 99662, dtype: int64
20000008 180
20000011 158
20000004 157
20000010 157
20000002 155
...
19910807 1
19910902 1
20151209 1
19911011 1
20151201 1
Name: regDate, Length: 3894, dtype: int64
0.0 11762
19.0 9573
4.0 8445
1.0 6038
29.0 5186
...
245.0 2
209.0 2
240.0 2
242.0 2
247.0 1
Name: model, Length: 248, dtype: int64
0 31480
4 16737
14 16089
10 14249
1 13794
6 10217
9 7306
5 4665
13 3817
11 2945
3 2461
7 2361
16 2223
8 2077
25 2064
27 2053
21 1547
15 1458
19 1388
20 1236
12 1109
22 1085
26 966
30 940
17 913
24 772
28 649
32 592
29 406
37 333
2 321
31 318
18 316
36 228
34 227
33 218
23 186
35 180
38 65
39 9
Name: brand, dtype: int64
0.0 41420
1.0 35272
2.0 30324
3.0 13491
4.0 9609
5.0 7607
6.0 6482
7.0 1289
Name: bodyType, dtype: int64
0.0 91656
1.0 46991
2.0 2212
3.0 262
4.0 118
5.0 45
6.0 36
Name: fuelType, dtype: int64
0.0 111623
1.0 32396
Name: gearbox, dtype: int64
75.0 9593
150.0 6495
60.0 6374
140.0 5963
101.0 5537
...
1401.0 1
1164.0 1
1162.0 1
470.0 1
1149.0 1
Name: power, Length: 565, dtype: int64
15.0 96877
12.5 15722
10.0 6459
9.0 5257
8.0 4573
7.0 4084
6.0 3725
5.0 3144
4.0 2718
3.0 2501
2.0 2354
0.5 1840
1.0 746
Name: kilometer, dtype: int64
0.0 111361
1.0 14315
Name: notRepairedDamage, dtype: int64
419 369
764 258
125 137
176 136
462 134
...
6414 1
7063 1
4239 1
5931 1
7267 1
Name: regionCode, Length: 7905, dtype: int64
0 149999
1 1
Name: seller, dtype: int64
0 150000
Name: offerType, dtype: int64
20160403 5848
20160404 5606
20160320 5485
20160312 5383
20160402 5382
...
20151227 1
20151217 1
20160131 1
20160130 1
20160115 1
Name: creatDate, Length: 96, dtype: int64
500 2337
1500 2158
1200 1922
1000 1850
2500 1821
...
25321 1
8886 1
8801 1
37920 1
8188 1
Name: price, Length: 3763, dtype: int64
for col in test_a.columns[:13]:
print(test_a[col].value_counts())
198655 1
167227 1
171353 1
169304 1
191831 1
..
187054 1
180909 1
182956 1
193195 1
196608 1
Name: SaleID, Length: 50000, dtype: int64
55 97
708 96
387 95
1541 88
713 74
..
22270 1
89855 1
42752 1
48899 1
11808 1
Name: name, Length: 37453, dtype: int64
20000004 60
20000011 56
20000012 54
20000009 53
20000005 50
..
19921208 1
20120007 1
19920804 1
19930905 1
19920801 1
Name: regDate, Length: 3835, dtype: int64
0.0 3896
19.0 3245
4.0 3007
1.0 1981
29.0 1742
...
242.0 1
240.0 1
244.0 1
243.0 1
246.0 1
Name: model, Length: 247, dtype: int64
0 10348
4 5763
14 5314
10 4766
1 4532
6 3502
9 2423
5 1569
13 1245
11 919
7 795
3 773
16 771
8 704
25 695
27 650
21 544
15 511
20 450
19 450
12 389
22 363
30 324
17 317
26 303
24 268
28 225
32 193
29 117
31 115
18 106
2 104
37 92
34 77
33 76
36 67
23 62
35 53
38 23
39 2
Name: brand, dtype: int64
0.0 13985
1.0 11882
2.0 9900
3.0 4433
4.0 3303
5.0 2537
6.0 2116
7.0 431
Name: bodyType, dtype: int64
0.0 30656
1.0 15544
2.0 774
3.0 72
4.0 37
6.0 14
5.0 10
Name: fuelType, dtype: int64
0.0 37301
1.0 10789
Name: gearbox, dtype: int64
0 4195
75 3226
150 2180
60 2164
140 1949
...
1000 1
392 1
296 1
375 1
6045 1
Name: power, Length: 445, dtype: int64
15.0 32189
12.5 5346
10.0 2106
9.0 1791
8.0 1523
7.0 1408
6.0 1218
5.0 1093
4.0 898
3.0 849
2.0 717
0.5 616
1.0 246
Name: kilometer, dtype: int64
0.0 37249
- 8031
1.0 4720
Name: notRepairedDamage, dtype: int64
419 146
764 78
188 52
125 51
759 51
...
7753 1
7463 1
7230 1
826 1
112 1
Name: regionCode, Length: 6971, dtype: int64
0 50000
Name: seller, dtype: int64
进行变量划分
-
通过观察上述统计信息发现:
- v0-v14这15个匿名变量的不同取值数量基本等同于样本数量,可以视为连续型变量
非匿名变量里的变量类型划分 - 01.SaleID-具有唯一性的索引变量
- v0-v14这15个匿名变量的不同取值数量基本等同于样本数量,可以视为连续型变量
-
连续型变量:
- 02.regDate-汽车注册日期:用整数表示的日期,观察发现有一些非法的日期表示(如20000008)需先处理为合法日期,再用creatDatae去减,得到交易时的使用时长
- 04.power-发动机功率:取值范围[0,600],功率为0的怎么理解?把0当作缺失值?
- 05.createdDate-汽车开始售卖时间:用整数表示的日期
-
离散型数值变量:
- 03.kilometer-已行驶里程(万公里):取值范围[0.5,15],可视为数值型离散变量
-
分类变量
- 06.name-汽车交易名称:绝大多数(超过75%)情况下具有唯一性,
- 07.brand-汽车品牌:分类变量
- 08.model-车型编码:
- 09.bodyType-车身类型:分类变量,七种不同取值
- 10.fuelType-燃油类型:分类变量七种不同取值
- 11.gearbox-变速箱:分类变量,两种取值
- 12.notRepairedDamage-是否尚有未修复的损坏:分类变量,正常值为0或1.取值为"-"的有 24324 个,应视为缺失值.
- 13.regionCode-地区编码: 7905种不同取值,分类变量.
- 14.seller-销售方:分类变量,两种取值,个体或非个体.
- 15.offerType-报价类型:
price-需要预测的连续型变量
# 借助baseline的字段说明
Image('1584102211.png')
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gBLQC79o-1585055047038)(output_18_0.png)]
train.bodyType.dtype
dtype('float64')
# 统计
变量名称,值域(唯一值数量),
3) 数据清洗
- 根据上述对各个字段的理解,进行缺失值替换等数据清洗工作.
# 替换 notRepairedDamage 的 '-'
train.notRepairedDamage.replace('-', np.nan, inplace=True)
# 替换 power 列的0值为缺失值
train.power.replace(0, np.nan, inplace=True)
# 将 regDate字段用当前日期减,得到注册日期距今的时长(月份)
train.regDate.sort_values()
103842 19910001
56687 19910001
27475 19910001
68835 19910001
81940 19910001
...
7698 20151210
80851 20151211
133533 20151211
12482 20151211
12617 20151212
Name: regDate, Length: 150000, dtype: int64
train.regDate.value_counts()
20000008 180
20000011 158
20000004 157
20000010 157
20000002 155
...
19910807 1
19910902 1
20151209 1
19911011 1
20151201 1
Name: regDate, Length: 3894, dtype: int64
train.creatDate.value_counts()
20160403 5848
20160404 5606
20160320 5485
20160312 5383
20160402 5382
...
20151227 1
20151217 1
20160131 1
20160130 1
20160115 1
Name: creatDate, Length: 96, dtype: int64
# 定义一个函数做日期转换,对非法的日期格式进行处理
def todate(strdate):
try:
strdate=datetime.strptime(str(strdate),'%Y%m%d').date()
"""将合法的整数型日期格式转换为日期"""
except ValueError:
strdate=datetime.strptime(str(strdate)[:4],'%Y').date()
"""统一将非法的日期格式转换为该年第一天"""
return strdate
todate(20150024)
datetime.date(2015, 1, 1)
#测试成功
train.regDate.apply(lambda x:todate(x))
0 2004-04-02
1 2003-03-01
2 2004-04-03
3 1996-09-08
4 2012-01-03
...
149995 2000-06-07
149996 2009-11-02
149997 2010-10-03
149998 2006-03-12
149999 1999-02-04
Name: regDate, Length: 150000, dtype: object
# 将 regDate 和 creatDate 使用上述转换函数进行替换
train.regDate=train.regDate.apply(lambda x:todate(x))
train.creatDate=train.creatDate.apply(lambda x:todate(x))
# 增加计算列:creatDate 减去 regDate
train['usedDay']=train.creatDate-train.regDate
# 得到一个连续变量,表示车辆使用天数
train['usedDay'].value_counts()
5933 days 115
5911 days 113
5932 days 99
5918 days 99
5938 days 95
...
276 days 1
459 days 1
8772 days 1
8562 days 1
8924 days 1
Name: usedDay, Length: 9054, dtype: int64