阿里云天池竞赛——二手车价格预测项目(个人练习+源代码)

# 导入需要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.feature_selection import mutual_info_regression as MIC

import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, mean_absolute_error
# 读取文件
train = pd.read_csv('used_car_train_20200313.csv',sep=' ')
test = pd.read_csv('used_car_testB_20200421.csv',sep=' ')

print('train:{}'.format(train.shape))
print('test:{}'.format(test.shape))
train:(150000, 31)
test:(50000, 30)
# 查看数据信息
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   SaleID             150000 non-null  int64  
 1   name               150000 non-null  int64  
 2   regDate            150000 non-null  int64  
 3   model              149999 non-null  float64
 4   brand              150000 non-null  int64  
 5   bodyType           145494 non-null  float64
 6   fuelType           141320 non-null  float64
 7   gearbox            144019 non-null  float64
 8   power              150000 non-null  int64  
 9   kilometer          150000 non-null  float64
 10  notRepairedDamage  150000 non-null  object 
 11  regionCode         150000 non-null  int64  
 12  seller             150000 non-null  int64  
 13  offerType          150000 non-null  int64  
 14  creatDate          150000 non-null  int64  
 15  price              150000 non-null  int64  
 16  v_0                150000 non-null  float64
 17  v_1                150000 non-null  float64
 18  v_2                150000 non-null  float64
 19  v_3                150000 non-null  float64
 20  v_4                150000 non-null  float64
 21  v_5                150000 non-null  float64
 22  v_6                150000 non-null  float64
 23  v_7                150000 non-null  float64
 24  v_8                150000 non-null  float64
 25  v_9                150000 non-null  float64
 26  v_10               150000 non-null  float64
 27  v_11               150000 non-null  float64
 28  v_12               150000 non-null  float64
 29  v_13               150000 non-null  float64
 30  v_14               150000 non-null  float64
dtypes: float64(20), int64(10), object(1)
memory usage: 35.5+ MB
# 查看数据前五行
train.head()
SaleID name regDate model brand bodyType fuelType gearbox power kilometer ... v_5 v_6 v_7 v_8 v_9 v_10 v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 6 1.0 0.0 0.0 60 12.5 ... 0.235676 0.101988 0.129549 0.022816 0.097462 -2.881803 2.804097 -2.420821 0.795292 0.914762
1 1 2262 20030301 40.0 1 2.0 0.0 0.0 0 15.0 ... 0.264777 0.121004 0.135731 0.026597 0.020582 -4.900482 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 15 1.0 0.0 0.0 163 12.5 ... 0.251410 0.114912 0.165147 0.062173 0.027075 -4.846749 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 10 0.0 0.0 1.0 193 15.0 ... 0.274293 0.110300 0.121964 0.033395 0.000000 -4.509599 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 5 1.0 0.0 0.0 68 5.0 ... 0.228036 0.073205 0.091880 0.078819 0.121534 -1.896240 0.910783 0.931110 2.834518 1.923482

5 rows × 31 columns

# 查看测试集信息
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             50000 non-null  int64  
 1   name               50000 non-null  int64  
 2   regDate            50000 non-null  int64  
 3   model              50000 non-null  float64
 4   brand              50000 non-null  int64  
 5   bodyType           48496 non-null  float64
 6   fuelType           47076 non-null  float64
 7   gearbox            48032 non-null  float64
 8   power              50000 non-null  int64  
 9   kilometer          50000 non-null  float64
 10  notRepairedDamage  50000 non-null  object 
 11  regionCode         50000 non-null  int64  
 12  seller             50000 non-null  int64  
 13  offerType          50000 non-null  int64  
 14  creatDate          50000 non-null  int64  
 15  v_0                50000 non-null  float64
 16  v_1                50000 non-null  float64
 17  v_2                50000 non-null  float64
 18  v_3                50000 non-null  float64
 19  v_4                50000 non-null  float64
 20  v_5                50000 non-null  float64
 21  v_6                50000 non-null  float64
 22  v_7                50000 non-null  float64
 23  v_8                50000 non-null  float64
 24  v_9                50000 non-null  float64
 25  v_10               50000 non-null  float64
 26  v_11               50000 non-null  float64
 27  v_12               50000 non-null  float64
 28  v_13               50000 non-null  float64
 29  v_14               50000 non-null  float64
dtypes: float64(20), int64(9), object(1)
memory usage: 11.4+ MB
train['notRepairedDamage'].value_counts()
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
train['notRepairedDamage'].value_counts()
0.0    111361
-       24324
1.0     14315
Name: notRepairedDamage, dtype: int64
# 将‘-’转化成空值,并将notRepairedDamage特征转换成数值型
train['notRepairedDamage'] = train['notRepairedDamage'].replace('-', np.nan).astype('float')
train['notRepairedDamage'].value_counts()
0.0    111361
1.0     14315
Name: notRepairedDamage, dtype: int64
test['notRepairedDamage'] = test['notRepairedDamage'].replace('-', np.nan).astype('float')
test['notRepairedDamage'].value_counts()
0.0    37224
1.0     4707
Name: notRepairedDamage, dtype: int64
# 查看缺失值特征, 看到全是分类特征
print(train['model'].value_counts())
print(train['bodyType'].value_counts())
print(train['fuelType'].value_counts())
print(train['gearbox'].value_counts())
0.0      11762
19.0      9573
4.0       8445
1.0       6038
29.0      5186
         ...  
240.0        2
209.0        2
245.0        2
242.0        2
247.0        1
Name: model, Length: 248, dtype: int64
0.0    41420
1.0    35272
2.0    30324
3.0    13491
4.0     9609
5.0     7607
6.0     6482
7.0     1289
Name: bodyType, dtype: int64
0.0    91656
1.0    46991
2.0     2212
3.0      262
4.0      118
5.0       45
6.0       36
Name: fuelType, dtype: int64
0.0    111623
1.0     32396
Name: gearbox, dtype: int64
# 获取有缺失值的特征
col_train_null = train.columns[train.isnull().any()].to_list()
col_test_null = test.columns[test.isnull().any()].to_list()

print(col_train_null)
print(col_test_null)
['model', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
['bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
# 使用SimpleImputer进行缺失值填充
imp = SimpleImputer(strategy='most_frequent')

train[col_train_null] = imp.fit_transform(train[col_train_null])
test[col_train_null] = imp.fit_transform(test[col_train_null])
# 检查特征
train.isnull().any().sum()
0
test.isnull().any().sum()
0
# 检查日期列的异常值
train['regDate'].astype('str').str[4:6].value_counts()
03    14949
06    13809
04    12798
05    12614
07    11937
10    11490
00    11347
11    10687
12    10637
09    10522
01     9943
08     9936
02     9331
Name: regDate, dtype: int64
# 定义函数,用于转换月份为零的值
def tran_date(x):
    month = int(x[4:6])
    if month == 0:
        month = 1
    return x[0:4] + '-' + str(month) + '-' + x[6:]
# 日期替换
train['regDate'] = pd.to_datetime(train['regDate'].astype('str').apply(tran_date))
test['regDate'] = pd.to_datetime(test['regDate'].astype('str').apply(tran_date))
train['creatDate'
  • 5
    点赞
  • 46
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值