# 导入需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_regression as MIC
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
# 读取文件
train = pd.read_csv('used_car_train_20200313.csv',sep=' ')
test = pd.read_csv('used_car_testB_20200421.csv',sep=' ')
print('train:{}'.format(train.shape))
print('test:{}'.format(test.shape))
train:(150000, 31)
test:(50000, 30)
# 查看数据信息
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 150000 non-null int64
1 name 150000 non-null int64
2 regDate 150000 non-null int64
3 model 149999 non-null float64
4 brand 150000 non-null int64
5 bodyType 145494 non-null float64
6 fuelType 141320 non-null float64
7 gearbox 144019 non-null float64
8 power 150000 non-null int64
9 kilometer 150000 non-null float64
10 notRepairedDamage 150000 non-null object
11 regionCode 150000 non-null int64
12 seller 150000 non-null int64
13 offerType 150000 non-null int64
14 creatDate 150000 non-null int64
15 price 150000 non-null int64
16 v_0 150000 non-null float64
17 v_1 150000 non-null float64
18 v_2 150000 non-null float64
19 v_3 150000 non-null float64
20 v_4 150000 non-null float64
21 v_5 150000 non-null float64
22 v_6 150000 non-null float64
23 v_7 150000 non-null float64
24 v_8 150000 non-null float64
25 v_9 150000 non-null float64
26 v_10 150000 non-null float64
27 v_11 150000 non-null float64
28 v_12 150000 non-null float64
29 v_13 150000 non-null float64
30 v_14 150000 non-null float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
# 查看数据前五行
train.head()
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 736 | 20040402 | 30.0 | 6 | 1.0 | 0.0 | 0.0 | 60 | 12.5 | ... | 0.235676 | 0.101988 | 0.129549 | 0.022816 | 0.097462 | -2.881803 | 2.804097 | -2.420821 | 0.795292 | 0.914762 |
1 | 1 | 2262 | 20030301 | 40.0 | 1 | 2.0 | 0.0 | 0.0 | 0 | 15.0 | ... | 0.264777 | 0.121004 | 0.135731 | 0.026597 | 0.020582 | -4.900482 | 2.096338 | -1.030483 | -1.722674 | 0.245522 |
2 | 2 | 14874 | 20040403 | 115.0 | 15 | 1.0 | 0.0 | 0.0 | 163 | 12.5 | ... | 0.251410 | 0.114912 | 0.165147 | 0.062173 | 0.027075 | -4.846749 | 1.803559 | 1.565330 | -0.832687 | -0.229963 |
3 | 3 | 71865 | 19960908 | 109.0 | 10 | 0.0 | 0.0 | 1.0 | 193 | 15.0 | ... | 0.274293 | 0.110300 | 0.121964 | 0.033395 | 0.000000 | -4.509599 | 1.285940 | -0.501868 | -2.438353 | -0.478699 |
4 | 4 | 111080 | 20120103 | 110.0 | 5 | 1.0 | 0.0 | 0.0 | 68 | 5.0 | ... | 0.228036 | 0.073205 | 0.091880 | 0.078819 | 0.121534 | -1.896240 | 0.910783 | 0.931110 | 2.834518 | 1.923482 |
5 rows × 31 columns
# 查看测试集信息
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 SaleID 50000 non-null int64
1 name 50000 non-null int64
2 regDate 50000 non-null int64
3 model 50000 non-null float64
4 brand 50000 non-null int64
5 bodyType 48496 non-null float64
6 fuelType 47076 non-null float64
7 gearbox 48032 non-null float64
8 power 50000 non-null int64
9 kilometer 50000 non-null float64
10 notRepairedDamage 50000 non-null object
11 regionCode 50000 non-null int64
12 seller 50000 non-null int64
13 offerType 50000 non-null int64
14 creatDate 50000 non-null int64
15 v_0 50000 non-null float64
16 v_1 50000 non-null float64
17 v_2 50000 non-null float64
18 v_3 50000 non-null float64
19 v_4 50000 non-null float64
20 v_5 50000 non-null float64
21 v_6 50000 non-null float64
22 v_7 50000 non-null float64
23 v_8 50000 non-null float64
24 v_9 50000 non-null float64
25 v_10 50000 non-null float64
26 v_11 50000 non-null float64
27 v_12 50000 non-null float64
28 v_13 50000 non-null float64
29 v_14 50000 non-null float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
train['notRepairedDamage'].value_counts()
0.0 111361
- 24324
1.0 14315
Name: notRepairedDamage, dtype: int64
train['notRepairedDamage'].value_counts()
0.0 111361
- 24324
1.0 14315
Name: notRepairedDamage, dtype: int64
# 将‘-’转化成空值,并将notRepairedDamage特征转换成数值型
train['notRepairedDamage'] = train['notRepairedDamage'].replace('-', np.nan).astype('float')
train['notRepairedDamage'].value_counts()
0.0 111361
1.0 14315
Name: notRepairedDamage, dtype: int64
test['notRepairedDamage'] = test['notRepairedDamage'].replace('-', np.nan).astype('float')
test['notRepairedDamage'].value_counts()
0.0 37224
1.0 4707
Name: notRepairedDamage, dtype: int64
# 查看缺失值特征, 看到全是分类特征
print(train['model'].value_counts())
print(train['bodyType'].value_counts())
print(train['fuelType'].value_counts())
print(train['gearbox'].value_counts())
0.0 11762
19.0 9573
4.0 8445
1.0 6038
29.0 5186
...
240.0 2
209.0 2
245.0 2
242.0 2
247.0 1
Name: model, Length: 248, dtype: int64
0.0 41420
1.0 35272
2.0 30324
3.0 13491
4.0 9609
5.0 7607
6.0 6482
7.0 1289
Name: bodyType, dtype: int64
0.0 91656
1.0 46991
2.0 2212
3.0 262
4.0 118
5.0 45
6.0 36
Name: fuelType, dtype: int64
0.0 111623
1.0 32396
Name: gearbox, dtype: int64
# 获取有缺失值的特征
col_train_null = train.columns[train.isnull().any()].to_list()
col_test_null = test.columns[test.isnull().any()].to_list()
print(col_train_null)
print(col_test_null)
['model', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
# 使用SimpleImputer进行缺失值填充
imp = SimpleImputer(strategy='most_frequent')
train[col_train_null] = imp.fit_transform(train[col_train_null])
test[col_train_null] = imp.fit_transform(test[col_train_null])
# 检查特征
train.isnull().any().sum()
0
test.isnull().any().sum()
0
# 检查日期列的异常值
train['regDate'].astype('str').str[4:6].value_counts()
03 14949
06 13809
04 12798
05 12614
07 11937
10 11490
00 11347
11 10687
12 10637
09 10522
01 9943
08 9936
02 9331
Name: regDate, dtype: int64
# 定义函数,用于转换月份为零的值
def tran_date(x):
month = int(x[4:6])
if month == 0:
month = 1
return x[0:4] + '-' + str(month) + '-' + x[6:]
# 日期替换
train['regDate'] = pd.to_datetime(train['regDate'].astype('str').apply(tran_date))
test['regDate'] = pd.to_datetime(test['regDate'].astype('str').apply(tran_date))
train['creatDate'