3.3 代码示例
3.3.0导入数据
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter
# If you are using JupyterLab, %matplotlib inline is unnecessary
import os

# Directory where results are saved
output_path = 'G:/newjourney/Datawhale/output'
# exist_ok=True creates the directory only if missing, avoiding the
# race between a separate exists() check and makedirs()
os.makedirs(output_path, exist_ok=True)
## 1) Load the training and test sets (both are space-separated CSV files)
path='G:/newjourney/Datawhale/'
Train_data=pd.read_csv(path+'used_car_train_20200313.csv',sep=' ')
Test_data=pd.read_csv(path+'used_car_testB_20200421.csv',sep=' ')
print(Train_data.shape)
print(Test_data.shape)
(150000, 31)
(50000, 30)
# Preview the first and last 5 rows. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0; pd.concat is the supported replacement.
pd.concat([Train_data.head(), Train_data.tail()])
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 736 | 20040402 | 30.0 | 6 | 1.0 | 0.0 | 0.0 | 60 | 12.5 | ... | 0.235676 | 0.101988 | 0.129549 | 0.022816 | 0.097462 | -2.881803 | 2.804097 | -2.420821 | 0.795292 | 0.914762 |
1 | 1 | 2262 | 20030301 | 40.0 | 1 | 2.0 | 0.0 | 0.0 | 0 | 15.0 | ... | 0.264777 | 0.121004 | 0.135731 | 0.026597 | 0.020582 | -4.900482 | 2.096338 | -1.030483 | -1.722674 | 0.245522 |
2 | 2 | 14874 | 20040403 | 115.0 | 15 | 1.0 | 0.0 | 0.0 | 163 | 12.5 | ... | 0.251410 | 0.114912 | 0.165147 | 0.062173 | 0.027075 | -4.846749 | 1.803559 | 1.565330 | -0.832687 | -0.229963 |
3 | 3 | 71865 | 19960908 | 109.0 | 10 | 0.0 | 0.0 | 1.0 | 193 | 15.0 | ... | 0.274293 | 0.110300 | 0.121964 | 0.033395 | 0.000000 | -4.509599 | 1.285940 | -0.501868 | -2.438353 | -0.478699 |
4 | 4 | 111080 | 20120103 | 110.0 | 5 | 1.0 | 0.0 | 0.0 | 68 | 5.0 | ... | 0.228036 | 0.073205 | 0.091880 | 0.078819 | 0.121534 | -1.896240 | 0.910783 | 0.931110 | 2.834518 | 1.923482 |
149995 | 149995 | 163978 | 20000607 | 121.0 | 10 | 4.0 | 0.0 | 1.0 | 163 | 15.0 | ... | 0.280264 | 0.000310 | 0.048441 | 0.071158 | 0.019174 | 1.988114 | -2.983973 | 0.589167 | -1.304370 | -0.302592 |
149996 | 149996 | 184535 | 20091102 | 116.0 | 11 | 0.0 | 0.0 | 0.0 | 125 | 10.0 | ... | 0.253217 | 0.000777 | 0.084079 | 0.099681 | 0.079371 | 1.839166 | -2.774615 | 2.553994 | 0.924196 | -0.272160 |
149997 | 149997 | 147587 | 20101003 | 60.0 | 11 | 1.0 | 1.0 | 0.0 | 90 | 6.0 | ... | 0.233353 | 0.000705 | 0.118872 | 0.100118 | 0.097914 | 2.439812 | -1.630677 | 2.290197 | 1.891922 | 0.414931 |
149998 | 149998 | 45907 | 20060312 | 34.0 | 10 | 3.0 | 1.0 | 0.0 | 156 | 15.0 | ... | 0.256369 | 0.000252 | 0.081479 | 0.083558 | 0.081498 | 2.075380 | -2.633719 | 1.414937 | 0.431981 | -1.659014 |
149999 | 149999 | 177672 | 19990204 | 19.0 | 28 | 6.0 | 0.0 | 1.0 | 193 | 12.5 | ... | 0.284475 | 0.000000 | 0.040072 | 0.062543 | 0.025819 | 1.978453 | -3.179913 | 0.031724 | -1.483350 | -0.342674 |
10 rows × 31 columns
Train_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14'],
dtype='object')
3.3.1 利用箱线图删除异常值
#这里是包装了一个异常值处理的代码
def outliers_proc(data, col_name, scale=3):
    """Remove outliers from one column using the box-plot (IQR) rule.

    :param data: pandas.DataFrame to clean (not modified in place)
    :param col_name: name of the column to filter on
    :param scale: multiple of the IQR used for the whisker bounds (default 3)
    :return: a copy of ``data`` with the out-of-bounds rows dropped and the
        index reset
    """
    def box_plot_outliers(data_ser, box_scale):
        """Return ((rule_low, rule_up), (val_low, val_up)) for a Series.

        rule_low / rule_up are boolean masks marking values below / above
        the whisker bounds; val_low / val_up are the bounds themselves.
        """
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        iqr = box_scale * (q3 - q1)          # interquartile range, scaled
        val_low = q1 - iqr                   # lower whisker bound
        val_up = q3 + iqr                    # upper whisker bound
        return (data_ser < val_low, data_ser > val_up), (val_low, val_up)

    data_n = data.copy()
    data_series = data_n[col_name]
    rule, value = box_plot_outliers(data_series, box_scale=scale)
    outlier_mask = rule[0] | rule[1]

    print("Delete number is:{}".format(int(outlier_mask.sum())))
    # Filter with the boolean mask instead of positional indices: the old
    # np.arange(...)[mask] + drop(labels) approach silently dropped the
    # wrong rows whenever the incoming index was not a default RangeIndex.
    data_n = data_n[~outlier_mask].reset_index(drop=True)
    print("Now column number is:{}".format(data_n.shape[0]))

    # Describe the outliers on each side of the whiskers
    print("Description of data less than the lower bound is :")
    print(data_series[rule[0]].describe())
    print("Description of data larger than the upper bound is:")
    print(data_series[rule[1]].describe())

    # Side-by-side boxplots: before (left) vs after (right) cleaning
    fig, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette='Set1', ax=ax[1])
    return data_n
# Drop some outlier rows, using 'power' as the example column.
# Whether to delete is a judgement call for each feature —
# but never delete rows from the test set.
Train_data=outliers_proc(Train_data,'power',scale=3)
Delete number is:963
Now column number is:149037
Description of data less than the lower bound is :
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: power, dtype: float64
Description of data larger than the upper bound is:
count 963.000000
mean 846.836968
std 1929.418081
min 376.000000
25% 400.000000
50% 436.000000
75% 514.000000
max 19312.000000
Name: power, dtype: float64
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bscj7Ew8-1588387457094)(output_8_1.png)]
3.3.2特征构造
# Stack train and test with concat so features are built on both at once.
# The 'train' flag column lets us split them apart again afterwards.
Train_data['train'] = 1
Test_data['train'] = 0
# sort=False keeps the original column order and silences the pandas
# FutureWarning about sorting the non-concatenation axis; downstream code
# accesses columns by name, so the order change is safe.
data = pd.concat([Train_data, Test_data], ignore_index=True, sort=False)
G:\baidudownload2\anaconda\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
after removing the cwd from sys.path.
Train_data['train']
0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
..
149007 1
149008 1
149009 1
149010 1
149011 1
149012 1
149013 1
149014 1
149015 1
149016 1
149017 1
149018 1
149019 1
149020 1
149021 1
149022 1
149023 1
149024 1
149025 1
149026 1
149027 1
149028 1
149029 1
149030 1
149031 1
149032 1
149033 1
149034 1
149035 1
149036 1
Name: train, Length: 149037, dtype: int64
# Usage time: creatDate - regDate, in days. Generally price is inversely
# related to how long the car has been in use.
# Some rows contain malformed dates, so errors='coerce' turns them into
# NaT instead of raising (the resulting used_time is NaN for those rows).
data['used_time'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') -\
pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
'''
看一下空数据,有15K个样本是有问题的,我们可以选择删除,也可以选择放着(因为XGBoost之类的决策树,其本身就能处理缺失值)。
但这里不建议删除,因为删除缺失数据占总样本量过大,7.5%
'''
data['used_time'].isnull().sum()
15054
# Extract the city from the region code — everything except the last three
# digits — injecting prior knowledge about the postcode structure.
# NOTE(review): codes with three or fewer digits yield '' here — confirm
# that is intended.
data['city'] = data['regionCode'].apply(lambda x: str(x)[:-3])
# (the original no-op `data = data` statement was removed)
data.columns
Index(['SaleID', 'bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
'kilometer', 'model', 'name', 'notRepairedDamage', 'offerType', 'power',
'price', 'regDate', 'regionCode', 'seller', 'train', 'v_0', 'v_1',
'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_2', 'v_3', 'v_4', 'v_5',
'v_6', 'v_7', 'v_8', 'v_9', 'used_time', 'city'],
dtype='object')
# Per-brand sales statistics, computed on the training set only
# (the same aggregation could be applied to other features too).
Train_gb = Train_data.groupby('brand')
all_info = {}
for kind, kind_data in Train_gb:
    # Only rows with a positive price contribute to the statistics
    kind_data = kind_data[kind_data['price'] > 0]
    price = kind_data.price
    all_info[kind] = {
        'brand_amount': len(kind_data),
        'brand_price_max': price.max(),
        'brand_price_median': price.median(),
        'brand_price_min': price.min(),
        'brand_price_sum': price.sum(),
        'brand_price_std': price.std(),
        # +1 in the denominator guards against empty groups
        'brand_price_average': round(price.sum() / (len(kind_data) + 1), 2),
    }
# One-liner worth learning: transpose -> reset index -> rename,
# turning the nested dict into one row per brand
brand_fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index': 'brand'})
data = data.merge(brand_fe, how='left', on='brand')
kind_data
SaleID | name | regDate | model | brand | bodyType | fuelType | gearbox | power | kilometer | ... | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | train | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5116 | 5144 | 38387 | 19950009 | 244.0 | 39 | NaN | 0.0 | 0.0 | 0 | 0.5 | ... | 0.127947 | 1.126137 | 0.069992 | 0.046578 | 1.717704 | 18.197699 | 5.452985 | 1.532134 | -3.620942 | 1 |
19963 | 20079 | 38387 | 20150402 | 244.0 | 39 | 6.0 | 0.0 | 0.0 | 26 | 4.0 | ... | 0.128117 | 0.000000 | 0.060331 | 0.184668 | -6.214428 | 0.697001 | -0.249955 | 3.577009 | -2.306780 | 1 |
49501 | 49803 | 181810 | 19961108 | 19.0 | 39 | 2.0 | 0.0 | 0.0 | 39 | 6.0 | ... | 0.000000 | 0.076805 | 0.048503 | 0.035279 | 3.142073 | -1.259462 | -1.508440 | -0.897785 | -0.659889 | 1 |
65124 | 65532 | 50778 | 19910707 | 19.0 | 39 | 4.0 | NaN | NaN | 0 | 2.0 | ... | 0.000499 | 0.037632 | 0.090940 | 0.036046 | 1.606401 | -3.436782 | 1.914129 | -0.374511 | 0.117196 | 1 |
75775 | 76258 | 65608 | 20081103 | 1.0 | 39 | 5.0 | NaN | NaN | 60 | 8.0 | ... | 0.000233 | 0.037664 | 0.074347 | 0.072366 | 2.118526 | -2.783746 | 0.392918 | 0.458408 | 1.005671 | 1 |
90325 | 90920 | 172758 | 20040710 | 1.0 | 39 | 1.0 | 1.0 | 0.0 | 60 | 15.0 | ... | 0.000000 | 0.156222 | 0.057622 | 0.073552 | 3.187338 | -0.808931 | -0.350379 | 0.075403 | 0.730990 | 1 |
98386 | 99049 | 22825 | 19981203 | 1.0 | 39 | 0.0 | 0.0 | 0.0 | 45 | 15.0 | ... | 0.000000 | 0.086492 | 0.028276 | 0.064595 | 3.399385 | -1.059497 | -2.862146 | -0.758236 | 0.000658 | 1 |
98765 | 99431 | 59082 | 19990002 | 1.0 | 39 | 0.0 | NaN | 0.0 | 0 | 15.0 | ... | 0.000000 | 0.071455 | 0.049227 | 0.045501 | 2.968157 | -1.548087 | -1.400021 | -0.674703 | 0.258403 | 1 |
126237 | 127071 | 38387 | 19950004 | 244.0 | 39 | NaN | 0.0 | 0.0 | 0 | 0.5 | ... | 0.127947 | 1.126137 | 0.069992 | 0.046578 | 1.717704 | 18.197699 | 5.452985 | 1.532134 | -3.620942 | 1 |
9 rows × 32 columns
all_info[kind]
{'brand_amount': 9,
'brand_price_max': 14500,
'brand_price_median': 1900.0,
'brand_price_min': 750,
'brand_price_sum': 39480,
'brand_price_std': 5520.867232600327,
'brand_price_average': 3948.0}
brand_fe
brand | brand_amount | brand_price_average | brand_price_max | brand_price_median | brand_price_min | brand_price_std | brand_price_sum | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
1 | 1 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 |
2 | 2 | 318.0 | 11806.40 | 55800.0 | 7500.0 | 35.0 | 10576.224444 | 3766241.0 |
3 | 3 | 2461.0 | 6480.19 | 37500.0 | 4990.0 | 65.0 | 5396.327503 | 15954226.0 |
4 | 4 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
5 | 5 | 4662.0 | 3305.67 | 31500.0 | 2300.0 | 20.0 | 3344.689763 | 15414322.0 |
6 | 6 | 10193.0 | 3576.37 | 35990.0 | 1800.0 | 13.0 | 4562.233331 | 36457518.0 |
7 | 7 | 2360.0 | 4195.64 | 38900.0 | 2600.0 | 60.0 | 4752.584154 | 9905909.0 |
8 | 8 | 2070.0 | 4836.88 | 99999.0 | 2270.0 | 30.0 | 6053.233424 | 10017173.0 |
9 | 9 | 7299.0 | 2439.08 | 68530.0 | 1400.0 | 50.0 | 2975.342884 | 17805271.0 |
10 | 10 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 |
11 | 11 | 2944.0 | 4549.41 | 34500.0 | 2900.0 | 30.0 | 4722.160492 | 13398006.0 |
12 | 12 | 1108.0 | 4052.57 | 27490.0 | 2625.0 | 50.0 | 4066.959950 | 4494303.0 |
13 | 13 | 3813.0 | 2799.11 | 35000.0 | 1600.0 | 20.0 | 3073.915196 | 10675790.0 |
14 | 14 | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 |
15 | 15 | 1458.0 | 9851.83 | 45000.0 | 8500.0 | 100.0 | 5425.058140 | 14373814.0 |
16 | 16 | 2219.0 | 3638.90 | 17900.0 | 2999.0 | 20.0 | 2450.906089 | 8078352.0 |
17 | 17 | 913.0 | 3641.88 | 55800.0 | 2200.0 | 15.0 | 3952.913330 | 3328679.0 |
18 | 18 | 315.0 | 4807.12 | 34599.0 | 1999.0 | 50.0 | 6358.409761 | 1519049.0 |
19 | 19 | 1386.0 | 5211.45 | 42350.0 | 2800.0 | 20.0 | 6186.538949 | 7228288.0 |
20 | 20 | 1235.0 | 3473.09 | 37800.0 | 1750.0 | 15.0 | 4400.529809 | 4292737.0 |
21 | 21 | 1546.0 | 5724.94 | 35999.0 | 4225.0 | 50.0 | 5257.235026 | 8856481.0 |
22 | 22 | 1085.0 | 6025.25 | 43900.0 | 3950.0 | 50.0 | 5877.140886 | 6543426.0 |
23 | 23 | 183.0 | 3245.28 | 64000.0 | 1200.0 | 99.0 | 7333.695140 | 597132.0 |
24 | 24 | 630.0 | 32365.73 | 99999.0 | 27450.0 | 15.0 | 19855.495201 | 20422776.0 |
25 | 25 | 2059.0 | 3648.32 | 22500.0 | 2500.0 | 25.0 | 3556.249839 | 7515546.0 |
26 | 26 | 878.0 | 8239.81 | 99999.0 | 5000.0 | 11.0 | 10282.987274 | 7242792.0 |
27 | 27 | 2049.0 | 5298.81 | 62900.0 | 4200.0 | 35.0 | 4853.289240 | 10862559.0 |
28 | 28 | 633.0 | 5321.70 | 39900.0 | 3790.0 | 80.0 | 4509.036301 | 3373957.0 |
29 | 29 | 406.0 | 6041.84 | 19990.0 | 5250.0 | 500.0 | 3639.737722 | 2459028.0 |
30 | 30 | 940.0 | 4186.13 | 23200.0 | 3295.0 | 50.0 | 3659.577291 | 3939145.0 |
31 | 31 | 318.0 | 1755.97 | 11000.0 | 1000.0 | 50.0 | 1829.079211 | 560155.0 |
32 | 32 | 588.0 | 4006.95 | 33500.0 | 2350.0 | 50.0 | 4394.596002 | 2360095.0 |
33 | 33 | 201.0 | 9107.93 | 65000.0 | 5600.0 | 980.0 | 9637.135323 | 1839801.0 |
34 | 34 | 227.0 | 1016.56 | 2900.0 | 999.0 | 60.0 | 554.118445 | 231776.0 |
35 | 35 | 180.0 | 1646.28 | 28900.0 | 950.0 | 50.0 | 3325.933365 | 297977.0 |
36 | 36 | 228.0 | 3563.32 | 20900.0 | 2250.0 | 150.0 | 3922.715389 | 816001.0 |
37 | 37 | 331.0 | 16180.25 | 86500.0 | 13250.0 | 550.0 | 13541.180315 | 5371844.0 |
38 | 38 | 65.0 | 3266.97 | 8999.0 | 2850.0 | 99.0 | 2140.083145 | 215620.0 |
39 | 39 | 9.0 | 3948.00 | 14500.0 | 1900.0 | 750.0 | 5520.867233 | 39480.0 |
data
SaleID | bodyType | brand | creatDate | fuelType | gearbox | kilometer | model | name | notRepairedDamage | ... | v_9 | used_time | city | brand_amount | brand_price_average | brand_price_max | brand_price_median | brand_price_min | brand_price_std | brand_price_sum | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.0 | 6 | 20160404 | 0.0 | 0.0 | 12.5 | 30.0 | 736 | 0.0 | ... | 0.097462 | 4385.0 | 1 | 10193.0 | 3576.37 | 35990.0 | 1800.0 | 13.0 | 4562.233331 | 36457518.0 |
1 | 1 | 2.0 | 1 | 20160309 | 0.0 | 0.0 | 15.0 | 40.0 | 2262 | - | ... | 0.020582 | 4757.0 | 4 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 |
2 | 2 | 1.0 | 15 | 20160402 | 0.0 | 0.0 | 12.5 | 115.0 | 14874 | 0.0 | ... | 0.027075 | 4382.0 | 2 | 1458.0 | 9851.83 | 45000.0 | 8500.0 | 100.0 | 5425.058140 | 14373814.0 |
3 | 3 | 0.0 | 10 | 20160312 | 0.0 | 1.0 | 15.0 | 109.0 | 71865 | 0.0 | ... | 0.000000 | 7125.0 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 | |
4 | 4 | 1.0 | 5 | 20160313 | 0.0 | 0.0 | 5.0 | 110.0 | 111080 | 0.0 | ... | 0.121534 | 1531.0 | 6 | 4662.0 | 3305.67 | 31500.0 | 2300.0 | 20.0 | 3344.689763 | 15414322.0 |
5 | 5 | 0.0 | 10 | 20160319 | 1.0 | 0.0 | 10.0 | 24.0 | 137642 | 0.0 | ... | 0.048769 | 2482.0 | 3 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 |
6 | 6 | 0.0 | 4 | 20160317 | 0.0 | 1.0 | 15.0 | 13.0 | 2402 | 0.0 | ... | 0.028174 | 6185.0 | 3 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
7 | 7 | 1.0 | 14 | 20160326 | 0.0 | 0.0 | 15.0 | 26.0 | 165346 | 0.0 | ... | 0.082413 | 6108.0 | 4 | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 |
8 | 8 | 2.0 | 1 | 20160326 | 1.0 | 1.0 | 15.0 | 19.0 | 2974 | 0.0 | ... | 0.024388 | 4798.0 | 4 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 |
9 | 9 | 5.0 | 7 | 20160402 | 0.0 | 0.0 | 15.0 | 7.0 | 82021 | 0.0 | ... | 0.098727 | 6666.0 | 2360.0 | 4195.64 | 38900.0 | 2600.0 | 60.0 | 4752.584154 | 9905909.0 | |
10 | 10 | 3.0 | 9 | 20160320 | 1.0 | 0.0 | 15.0 | 19.0 | 18961 | 0.0 | ... | 0.060794 | 3874.0 | 1 | 7299.0 | 2439.08 | 68530.0 | 1400.0 | 50.0 | 2975.342884 | 17805271.0 |
11 | 11 | 2.0 | 6 | 20160326 | 0.0 | 0.0 | 2.0 | 1.0 | 74495 | 0.0 | ... | 0.030019 | 2936.0 | 5 | 10193.0 | 3576.37 | 35990.0 | 1800.0 | 13.0 | 4562.233331 | 36457518.0 |
12 | 12 | 1.0 | 14 | 20160321 | 0.0 | 0.0 | 6.0 | 48.0 | 120103 | 0.0 | ... | 0.074250 | 5493.0 | 2 | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 |
13 | 13 | 0.0 | 1 | 20160326 | 0.0 | 0.0 | 15.0 | 65.0 | 8129 | 1.0 | ... | 0.051359 | 4154.0 | 3 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 |
14 | 14 | NaN | 0 | 20160402 | NaN | 0.0 | 15.0 | 1.0 | 1896 | - | ... | 0.020187 | NaN | 3 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
15 | 15 | 6.0 | 27 | 20160331 | 0.0 | 0.0 | 15.0 | 138.0 | 84546 | 0.0 | ... | 0.075108 | 6260.0 | 2 | 2049.0 | 5298.81 | 62900.0 | 4200.0 | 35.0 | 4853.289240 | 10862559.0 |
16 | 16 | 0.0 | 1 | 20160306 | 1.0 | 1.0 | 12.5 | 105.0 | 10036 | 0.0 | ... | 0.088695 | 1638.0 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 | |
17 | 17 | 6.0 | 21 | 20160404 | 0.0 | 0.0 | 15.0 | 114.0 | 29756 | 0.0 | ... | 0.064552 | 3896.0 | 1546.0 | 5724.94 | 35999.0 | 4225.0 | 50.0 | 5257.235026 | 8856481.0 | |
18 | 18 | 1.0 | 14 | 20160313 | 0.0 | 0.0 | 15.0 | 48.0 | 104088 | - | ... | 0.076203 | 6363.0 | 1 | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 |
19 | 19 | 0.0 | 0 | 20160328 | 0.0 | 0.0 | 15.0 | 0.0 | 15738 | 0.0 | ... | 0.035464 | 5866.0 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 | |
20 | 20 | 1.0 | 14 | 20160311 | 0.0 | 0.0 | 15.0 | 48.0 | 148669 | 0.0 | ... | 0.062781 | NaN | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 | |
21 | 21 | 0.0 | 0 | 20160403 | NaN | NaN | 15.0 | 8.0 | 12784 | - | ... | 0.050713 | 4925.0 | 1 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
22 | 22 | 1.0 | 14 | 20160326 | 0.0 | 0.0 | 15.0 | 26.0 | 131637 | - | ... | 0.093230 | NaN | 5 | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 |
23 | 23 | 5.0 | 7 | 20160317 | 0.0 | 0.0 | 15.0 | 78.0 | 8949 | 1.0 | ... | 0.094000 | 8021.0 | 1 | 2360.0 | 4195.64 | 38900.0 | 2600.0 | 60.0 | 4752.584154 | 9905909.0 |
24 | 24 | 1.0 | 16 | 20160327 | 1.0 | 1.0 | 5.0 | 21.0 | 24822 | 0.0 | ... | 0.072564 | 1603.0 | 2219.0 | 3638.90 | 17900.0 | 2999.0 | 20.0 | 2450.906089 | 8078352.0 | |
25 | 25 | 2.0 | 3 | 20160311 | 1.0 | 0.0 | 15.0 | 3.0 | 12877 | 0.0 | ... | 0.095350 | 3712.0 | 2461.0 | 6480.19 | 37500.0 | 4990.0 | 65.0 | 5396.327503 | 15954226.0 | |
26 | 26 | 0.0 | 4 | 20160401 | 1.0 | 0.0 | 15.0 | 4.0 | 1983 | 0.0 | ... | 0.016912 | 5112.0 | 1 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
27 | 27 | 2.0 | 10 | 20160331 | 1.0 | 0.0 | 15.0 | 31.0 | 4415 | 1.0 | ... | 0.027368 | 5226.0 | 1 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 |
28 | 28 | 5.0 | 10 | 20160314 | 0.0 | 1.0 | 10.0 | 121.0 | 129342 | 0.0 | ... | 0.023294 | 6159.0 | 5 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 |
29 | 29 | 0.0 | 7 | 20160311 | 0.0 | 0.0 | 12.5 | 7.0 | 4365 | 0.0 | ... | 0.106191 | 4538.0 | 2 | 2360.0 | 4195.64 | 38900.0 | 2600.0 | 60.0 | 4752.584154 | 9905909.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
199007 | 249970 | 2.0 | 22 | 20160309 | NaN | 0.0 | 3.0 | 95.0 | 20942 | - | ... | 0.089506 | 5054.0 | 4 | 1085.0 | 6025.25 | 43900.0 | 3950.0 | 50.0 | 5877.140886 | 6543426.0 |
199008 | 249971 | 0.0 | 10 | 20160320 | 0.0 | 1.0 | 15.0 | 17.0 | 81873 | 0.0 | ... | 0.008208 | 6799.0 | 3 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 |
199009 | 249972 | 2.0 | 6 | 20160331 | 0.0 | NaN | 12.5 | 46.0 | 32974 | 0.0 | ... | 0.047494 | 5928.0 | 2 | 10193.0 | 3576.37 | 35990.0 | 1800.0 | 13.0 | 4562.233331 | 36457518.0 |
199010 | 249973 | 6.0 | 26 | 20160403 | 0.0 | 1.0 | 12.5 | 1.0 | 46058 | 0.0 | ... | 0.012588 | 3012.0 | 3 | 878.0 | 8239.81 | 99999.0 | 5000.0 | 11.0 | 10282.987274 | 7242792.0 |
199011 | 249974 | 0.0 | 20 | 20160312 | 0.0 | 1.0 | 15.0 | 148.0 | 83094 | - | ... | 0.146453 | 5793.0 | 4 | 1235.0 | 3473.09 | 37800.0 | 1750.0 | 15.0 | 4400.529809 | 4292737.0 |
199012 | 249975 | 3.0 | 0 | 20160310 | 1.0 | 0.0 | 15.0 | 44.0 | 111949 | 0.0 | ... | 0.000000 | 5173.0 | 4 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199013 | 249976 | 0.0 | 4 | 20160319 | 1.0 | 1.0 | 15.0 | 13.0 | 2352 | 0.0 | ... | 0.020835 | 4004.0 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 | |
199014 | 249977 | 0.0 | 14 | 20160328 | 0.0 | 0.0 | 15.0 | 51.0 | 57466 | 1.0 | ... | 0.101748 | 7078.0 | 4 | 16073.0 | 3053.17 | 38990.0 | 1700.0 | 12.0 | 3605.595127 | 49076652.0 |
199015 | 249978 | 1.0 | 0 | 20160403 | 0.0 | 0.0 | 15.0 | 29.0 | 15921 | 0.0 | ... | 0.070700 | 7421.0 | 2 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199016 | 249979 | 1.0 | 0 | 20160316 | 0.0 | 0.0 | 7.0 | 91.0 | 59586 | - | ... | 0.112485 | 2812.0 | 2 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199017 | 249980 | 4.0 | 0 | 20160324 | 0.0 | 0.0 | 4.0 | 0.0 | 2715 | 0.0 | ... | 0.053608 | 1719.0 | 1 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199018 | 249981 | 4.0 | 0 | 20160330 | 0.0 | 1.0 | 5.0 | 0.0 | 164614 | 0.0 | ... | 0.059963 | 1692.0 | 4 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199019 | 249982 | 0.0 | 10 | 20160331 | 0.0 | 1.0 | 15.0 | 109.0 | 23707 | 0.0 | ... | 0.000701 | 3489.0 | 1 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 |
199020 | 249983 | 0.0 | 0 | 20160403 | 0.0 | 0.0 | 6.0 | 0.0 | 2121 | 0.0 | ... | 0.057707 | 2036.0 | 5 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199021 | 249984 | 1.0 | 9 | 20160402 | 0.0 | 0.0 | 15.0 | 10.0 | 154969 | 0.0 | ... | 0.115071 | 5260.0 | 1 | 7299.0 | 2439.08 | 68530.0 | 1400.0 | 50.0 | 2975.342884 | 17805271.0 |
199022 | 249985 | 1.0 | 20 | 20160323 | 0.0 | 0.0 | 8.0 | 71.0 | 90940 | 1.0 | ... | 0.098830 | 2881.0 | 1235.0 | 3473.09 | 37800.0 | 1750.0 | 15.0 | 4400.529809 | 4292737.0 | |
199023 | 249986 | 0.0 | 9 | 20160313 | 0.0 | 1.0 | 15.0 | 119.0 | 840 | 1.0 | ... | 0.093719 | 7092.0 | 7299.0 | 2439.08 | 68530.0 | 1400.0 | 50.0 | 2975.342884 | 17805271.0 | |
199024 | 249987 | 4.0 | 1 | 20160314 | 0.0 | 0.0 | 15.0 | 54.0 | 162062 | 0.0 | ... | 0.053211 | 7675.0 | 2 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 |
199025 | 249988 | 0.0 | 0 | 20160321 | 0.0 | 0.0 | 15.0 | 8.0 | 73332 | 0.0 | ... | 0.044568 | 3664.0 | 2 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 |
199026 | 249989 | 2.0 | 0 | 20160316 | 1.0 | 0.0 | 15.0 | 8.0 | 107648 | 0.0 | ... | 0.026760 | 4147.0 | 31429.0 | 5527.19 | 68500.0 | 3199.0 | 13.0 | 6261.371627 | 173719698.0 | |
199027 | 249990 | 3.0 | 5 | 20160331 | 0.0 | 0.0 | 15.0 | 19.0 | 61395 | - | ... | 0.051108 | 6051.0 | 3 | 4662.0 | 3305.67 | 31500.0 | 2300.0 | 20.0 | 3344.689763 | 15414322.0 |
199028 | 249991 | 2.0 | 10 | 20160403 | 0.0 | 1.0 | 15.0 | 17.0 | 72277 | 0.0 | ... | 0.002477 | 4532.0 | 13994.0 | 8076.76 | 92900.0 | 5200.0 | 15.0 | 8244.695287 | 113034210.0 | |
199029 | 249992 | 1.0 | 6 | 20160331 | 0.0 | 0.0 | 8.0 | 41.0 | 29738 | 0.0 | ... | 0.116640 | 3096.0 | 3 | 10193.0 | 3576.37 | 35990.0 | 1800.0 | 13.0 | 4562.233331 | 36457518.0 |
199030 | 249993 | 2.0 | 4 | 20160328 | 0.0 | 1.0 | 15.0 | 13.0 | 35 | 0.0 | ... | 0.018048 | 6834.0 | 3 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
199031 | 249994 | 0.0 | 4 | 20160330 | 0.0 | 0.0 | 15.0 | 4.0 | 41919 | 0.0 | ... | 0.024033 | 3888.0 | 5 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
199032 | 249995 | 0.0 | 4 | 20160309 | NaN | 1.0 | 15.0 | 4.0 | 111443 | - | ... | 0.039272 | 4173.0 | 5 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
199033 | 249996 | 0.0 | 1 | 20160323 | 0.0 | 0.0 | 4.0 | 65.0 | 152834 | 0.0 | ... | 0.067841 | 1079.0 | 5 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 |
199034 | 249997 | 0.0 | 4 | 20160316 | 0.0 | 1.0 | 12.5 | 4.0 | 132531 | 0.0 | ... | 0.042966 | 4113.0 | 3 | 16575.0 | 8342.13 | 99999.0 | 5999.0 | 12.0 | 8089.863295 | 138279069.0 |
199035 | 249998 | 4.0 | 1 | 20160327 | 0.0 | 1.0 | 15.0 | 40.0 | 143405 | 0.0 | ... | 0.009006 | 5017.0 | 13656.0 | 9082.86 | 84000.0 | 6399.0 | 15.0 | 8988.865406 | 124044603.0 | |
199036 | 249999 | 1.0 | 8 | 20160401 | 0.0 | 0.0 | 3.0 | 32.0 | 78202 | 0.0 | ... | 0.110180 | 2459.0 | 4 | 2070.0 | 4836.88 | 99999.0 | 2270.0 | 30.0 | 6053.233424 | 10017173.0 |
199037 rows × 41 columns
# --- Bucket 'power' into bins, as an example of binning ---
# Renamed from `bin`, which shadowed the Python builtin of the same name.
bin_edges = [i * 10 for i in range(31)]  # edges 0, 10, ..., 300
# labels=False returns the integer bin index. Values outside (0, 300] —
# including power == 0, which sits on the open left edge — become NaN,
# which explains the NaN seen in the preview below.
data['power_bin'] = pd.cut(data['power'], bin_edges, labels=False)
data[['power_bin', 'power']].head()
power_bin | power | |
---|---|---|
0 | 5.0 | 60 |
1 | NaN | 0 |
2 | 16.0 | 163 |
3 | 19.0 | 193 |
4 | 6.0 | 68 |
# Drop the raw date/region columns now that derived features exist
data = data.drop(columns=['creatDate', 'regDate', 'regionCode'])
print(data.shape)
data.columns
(199037, 39)
Index(['SaleID', 'bodyType', 'brand', 'fuelType', 'gearbox', 'kilometer',
'model', 'name', 'notRepairedDamage', 'offerType', 'power', 'price',
'seller', 'train', 'v_0', 'v_1', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14',
'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'used_time',
'city', 'brand_amount', 'brand_price_average', 'brand_price_max',
'brand_price_median', 'brand_price_min', 'brand_price_std',
'brand_price_sum', 'power_bin'],
dtype='object')
# This version of the data is ready for tree models; save it.
# index=False (clearer than the old index=0) prevents writing the
# DataFrame index as an extra column.
data.to_csv(os.path.join(output_path, 'data_for_tree.csv'), index=False)
'''
我们可以再构造一份特征给LR/ NN之类的模型用
之所以分开构造,是因为不同模型对数据集的要求不同
让我们先看下数据分布:
'''
data['power'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fdf011cc0>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GGmMGiib-1588387457101)(output_25_1.png)]
'''
我们刚刚已经对train进行异常值处理了,但是现在还有这么奇怪的分布是因为test中的power异常值,
也可以看出,对于train中power的异常值不删除为好,可以用长尾分布截断来代替。
'''
Train_data['power'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fdeeb96a0>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JDTWNOEz-1588387457103)(output_26_1.png)]
# Log-transform power, then min-max normalize to [0, 1].
# (The original `from sklearn import preprocessing` plus
# `min_max_scaler = preprocessing` was dead code — it aliased the *module*,
# not a scaler instance, and was never used — so both lines are removed.)
data['power'] = np.log(data['power'] + 1)  # log(1+x): safe for power == 0
data['power'] = ((data['power'] - np.min(data['power'])) /
                 (np.max(data['power']) - np.min(data['power'])))
data['power'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fce26c198>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-fQxQT8oC-1588387457106)(output_27_1.png)]
#再看KM,KM的比较正常,应该是已经做过分桶了
data['kilometer'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fce2fb748>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vk7A063m-1588387457109)(output_28_1.png)]
# kilometer already looks pre-binned, so min-max normalize it directly
km = data['kilometer']
data['kilometer'] = (km - np.min(km)) / (np.max(km) - np.min(km))
data['kilometer'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fcf607470>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VqZtsr6Q-1588387457110)(output_29_1.png)]
'''
除此之外,还有我们刚刚构造的统计量特征:
'brand_amount', 'brand_price_average', 'brand_price_max',
'brand_price_median', 'brand_price_min', 'brand_price_std',
'brand_price_sum'
这里不再做分析,直接做变换
'''
def max_min(x):
    """Min-max scale *x* into the [0, 1] range."""
    lo, hi = np.min(x), np.max(x)
    return (x - lo) / (hi - lo)
# Min-max normalize the brand statistics built earlier.
# BUG FIX: the original brand_amount line had misplaced parentheses —
# it computed (x - min)/max - min instead of (x - min)/(max - min).
# Routing all seven columns through max_min applies the formula once,
# correctly, and removes the copy-pasted duplication.
brand_stat_cols = ['brand_amount', 'brand_price_average', 'brand_price_max',
                   'brand_price_median', 'brand_price_min', 'brand_price_std',
                   'brand_price_sum']
for col in brand_stat_cols:
    data[col] = max_min(data[col])
# One-hot encode the categorical features.
# (Answering the original author's question: get_dummies produces one 0/1
# indicator column per category level, i.e. dummy variables; "one-hot"
# here keeps all levels rather than dropping one reference level.)
data=pd.get_dummies(data,columns=['model','brand','bodyType','fuelType',
'gearbox','notRepairedDamage','power_bin'])
print(data.shape)
data.columns
(199037, 370)
Index(['SaleID', 'kilometer', 'name', 'offerType', 'power', 'price', 'seller',
'train', 'v_0', 'v_1',
...
'power_bin_20.0', 'power_bin_21.0', 'power_bin_22.0', 'power_bin_23.0',
'power_bin_24.0', 'power_bin_25.0', 'power_bin_26.0', 'power_bin_27.0',
'power_bin_28.0', 'power_bin_29.0'],
dtype='object', length=370)
# This version of the data is suitable for LR / NN models.
# index=False (clearer than the old index=0) prevents writing the
# DataFrame index as an extra column.
data.to_csv(os.path.join(output_path, 'data_for_lr.csv'), index=False)
3.3.3特征筛选
1)过滤式
# Filter method: Spearman rank correlation of each engineered feature
# with the target price
print(data['power'].corr(data['price'],method='spearman'))
print(data['kilometer'].corr(data['price'],method='spearman'))
print(data['brand_amount'].corr(data['price'],method='spearman'))
print(data['brand_price_average'].corr(data['price'],method='spearman'))
print(data['brand_price_max'].corr(data['price'],method='spearman'))
print(data['brand_price_median'].corr(data['price'],method='spearman'))
0.5728285196051496
-0.4082569701616764
-0.058156610025581514
0.3834909576057687
0.259066833880992
0.38691042393409447
# The same idea as a picture.
# NOTE(review): despite the title, 'price' is not in data_numeric, so this
# heatmap shows feature-to-feature (Pearson, the .corr() default)
# correlations only — not correlation with price.
data_numeric=data[['power','kilometer','brand_amount',"brand_price_average",
'brand_price_max',"brand_price_median"]]
correlation=data_numeric.corr()
f,ax=plt.subplots(figsize=(7,7))
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
sns.heatmap(correlation,square=True,vmax=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x27fd02a2a58>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-SwIJrBw4-1588387457113)(output_36_1.png)]
2)包裹式
# Wrapper method: greedy forward feature selection with mlxtend's SFS.
# (mlxtend can be slow to download/install.)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

sfs = SFS(LinearRegression(),
          k_features=10,    # stop once 10 features are selected
          forward=True,     # forward selection (add features one by one)
          floating=False,   # no conditional exclusion step
          scoring='r2',
          cv=0)             # no cross-validation: fit/score on all data
x = data.drop(['price'], axis=1)
x = x.fillna(0)             # LinearRegression cannot handle NaN
y = data['price']
sfs.fit(x, y)
# BUG FIX: the fitted attribute is `k_feature_names_`;
# `f_feature_names_` does not exist on SequentialFeatureSelector.
sfs.k_feature_names_
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
<ipython-input-66-50db1217b888> in <module>
----> 1 from mlxtend.feature_selection import SequentialFeatureSelector as SFS
2 from sklearn.linear_model import LinearRegression
3 sfs=SFS(LinearRegression(),
4 k_features=10,
5 forward=True,
ModuleNotFoundError: No module named 'mlxtend'
# Plot selection performance (r2 with std-dev band) vs number of
# features chosen by the SFS run above
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.grid()
plt.show()
3)嵌入式
#下一章介绍,Lasso回归和决策树可以完成嵌入式特征选择
#大部分情况下都是用嵌入式做特征筛选