【Datawhale】[task3]3.3代码示例

3.3 代码示例

3.3.0导入数据

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter
# If you run this in JupyterLab there is no need for %matplotlib inline.
import os

# Directory where results are written; create it up front so later
# to_csv calls cannot fail on a missing path.
output_path = 'G:/newjourney/Datawhale/output'
# exist_ok=True replaces the race-prone "check then create" pattern.
os.makedirs(output_path, exist_ok=True)
## 1) Load the training and test sets (the raw files are space-separated).
path = 'G:/newjourney/Datawhale/'
Train_data = pd.read_csv(path + 'used_car_train_20200313.csv', sep=' ')
Test_data = pd.read_csv(path + 'used_car_testB_20200421.csv', sep=' ')
print(Train_data.shape)
print(Test_data.shape)
(150000, 31)
(50000, 30)
Train_data.head().append(Train_data.tail())
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_5v_6v_7v_8v_9v_10v_11v_12v_13v_14
007362004040230.061.00.00.06012.5...0.2356760.1019880.1295490.0228160.097462-2.8818032.804097-2.4208210.7952920.914762
1122622003030140.012.00.00.0015.0...0.2647770.1210040.1357310.0265970.020582-4.9004822.096338-1.030483-1.7226740.245522
221487420040403115.0151.00.00.016312.5...0.2514100.1149120.1651470.0621730.027075-4.8467491.8035591.565330-0.832687-0.229963
337186519960908109.0100.00.01.019315.0...0.2742930.1103000.1219640.0333950.000000-4.5095991.285940-0.501868-2.438353-0.478699
4411108020120103110.051.00.00.0685.0...0.2280360.0732050.0918800.0788190.121534-1.8962400.9107830.9311102.8345181.923482
14999514999516397820000607121.0104.00.01.016315.0...0.2802640.0003100.0484410.0711580.0191741.988114-2.9839730.589167-1.304370-0.302592
14999614999618453520091102116.0110.00.00.012510.0...0.2532170.0007770.0840790.0996810.0793711.839166-2.7746152.5539940.924196-0.272160
1499971499971475872010100360.0111.01.00.0906.0...0.2333530.0007050.1188720.1001180.0979142.439812-1.6306772.2901971.8919220.414931
149998149998459072006031234.0103.01.00.015615.0...0.2563690.0002520.0814790.0835580.0814982.075380-2.6337191.4149370.431981-1.659014
1499991499991776721999020419.0286.00.01.019312.5...0.2844750.0000000.0400720.0625430.0258191.978453-3.1799130.031724-1.483350-0.342674

10 rows × 31 columns

Train_data.columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
       'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
       'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
       'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14'],
      dtype='object')

3.3.1 利用箱线图删除异常值

# Wrapper around box-plot (IQR) based outlier removal.
def outliers_proc(data, col_name, scale=3, plot=True):
    """Remove outliers from one column using the box-plot (IQR) rule.

    :param data: pandas DataFrame to clean (not modified in place)
    :param col_name: name of the column to test for outliers
    :param scale: IQR multiplier; scale=3 keeps all but extreme outliers
    :param plot: when True, draw before/after boxplots (requires seaborn)
    :return: a copy of ``data`` with outlier rows dropped and the index reset
    """

    def box_plot_outliers(data_ser, box_scale):
        """Return boolean outlier masks and the (low, high) fence values.

        :param data_ser: pandas.Series of the column values
        :param box_scale: IQR multiplier for the fences
        """
        iqr = box_scale * (data_ser.quantile(0.75) - data_ser.quantile(0.25))  # scaled interquartile range
        val_low = data_ser.quantile(0.25) - iqr   # lower fence
        val_up = data_ser.quantile(0.75) + iqr    # upper fence
        rule_low = (data_ser < val_low)
        rule_up = (data_ser > val_up)
        return (rule_low, rule_up), (val_low, val_up)

    data_n = data.copy()
    data_series = data_n[col_name]  # the column to clean
    rule, value = box_plot_outliers(data_series, box_scale=scale)
    # BUG FIX: the original built *positional* indices with np.arange and then
    # passed them to drop(), which drops by *label*; that is only correct for a
    # default RangeIndex. Using the Series' own index labels works for any index.
    outlier_mask = rule[0] | rule[1]
    drop_labels = data_series.index[outlier_mask]
    print("Delete number is:{}".format(len(drop_labels)))  # rows removed
    data_n = data_n.drop(drop_labels)
    data_n.reset_index(drop=True, inplace=True)  # restore a clean 0..n index
    print("Now column number is:{}".format(data_n.shape[0]))  # rows remaining

    # Describe the values falling below the lower fence...
    print("Description of data less than the lower bound is :")
    print(pd.Series(data_series[rule[0]]).describe())
    # ...and the values above the upper fence.
    print("Description of data larger than the upper bound is:")
    print(pd.Series(data_series[rule[1]]).describe())

    if plot:
        # Side-by-side boxplots: original column vs. cleaned column.
        fig, ax = plt.subplots(1, 2, figsize=(10, 7))
        sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
        sns.boxplot(y=data_n[col_name], data=data_n, palette='Set1', ax=ax[1])
    return data_n

# Drop outlier rows from the training data, using 'power' as the example column.
# Whether a given column is safe to trim this way is a per-feature judgement call.
# NOTE: never delete rows from the test set — a prediction is required for every test row.
Train_data=outliers_proc(Train_data,'power',scale=3)
Delete number is:963
Now column number is:149037
Description of data less than the lower bound is :
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: power, dtype: float64
Description of data larger than the upper bound is:
count      963.000000
mean       846.836968
std       1929.418081
min        376.000000
25%        400.000000
50%        436.000000
75%        514.000000
max      19312.000000
Name: power, dtype: float64

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bscj7Ew8-1588387457094)(output_8_1.png)]

3.3.2特征构造

# Tag each split with a 'train' indicator column (1 = train, 0 = test),
# then stack them so feature engineering is applied to both consistently.
Train_data['train'] = 1
Test_data['train'] = 0
# sort=False keeps the existing column order and resolves the pandas
# FutureWarning about sorting on the non-concatenation axis (this is the
# future default); ignore_index rebuilds a clean 0..n row index.
data = pd.concat([Train_data, Test_data], ignore_index=True, sort=False)
G:\baidudownload2\anaconda\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  after removing the cwd from sys.path.
Train_data['train']
0         1
1         1
2         1
3         1
4         1
5         1
6         1
7         1
8         1
9         1
10        1
11        1
12        1
13        1
14        1
15        1
16        1
17        1
18        1
19        1
20        1
21        1
22        1
23        1
24        1
25        1
26        1
27        1
28        1
29        1
         ..
149007    1
149008    1
149009    1
149010    1
149011    1
149012    1
149013    1
149014    1
149015    1
149016    1
149017    1
149018    1
149019    1
149020    1
149021    1
149022    1
149023    1
149024    1
149025    1
149026    1
149027    1
149028    1
149029    1
149030    1
149031    1
149032    1
149033    1
149034    1
149035    1
149036    1
Name: train, Length: 149037, dtype: int64
# Usage time: data['creatDate'] - data['regDate'] reflects how long the car has
# been in use; generally, price falls as usage time grows.
# Note: some rows carry malformed dates, so errors='coerce' turns them into NaT.
data['used_time'] = (pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce') -\
                     pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')).dt.days
'''
看一下空数据,有15K个样本是有问题的,我们可以选择删除,也可以选择放着(因为XGBoost之类的决策树,其本身就能处理缺失值)。
但这里不建议删除,因为删除缺失数据占总样本量过大,7.5%
'''
# Count the rows where the date arithmetic produced NaT (about 15K, ~7.5%);
# tree models such as XGBoost can handle the missing values, so keep them.
data['used_time'].isnull().sum()
15054
# Derive a coarse 'city' feature from the region code: dropping the last three
# digits keeps the higher-order prefix, injecting postal-code-style prior
# knowledge about location. (The original's trailing no-op `data=data` removed.)
data['city'] = data['regionCode'].apply(lambda x: str(x)[:-3])
data.columns
Index(['SaleID', 'bodyType', 'brand', 'creatDate', 'fuelType', 'gearbox',
       'kilometer', 'model', 'name', 'notRepairedDamage', 'offerType', 'power',
       'price', 'regDate', 'regionCode', 'seller', 'train', 'v_0', 'v_1',
       'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_2', 'v_3', 'v_4', 'v_5',
       'v_6', 'v_7', 'v_8', 'v_9', 'used_time', 'city'],
      dtype='object')
# Per-brand sales statistics (the same could be done for any other feature).
# Computed on the training split only, so no test-set information leaks into
# the engineered features.
Train_gb = Train_data.groupby('brand')
all_info = {}
for kind, kind_data in Train_gb:
    kind_data = kind_data[kind_data['price'] > 0]  # keep rows with a valid price
    prices = kind_data.price
    n = len(kind_data)
    all_info[kind] = {
        'brand_amount': n,
        'brand_price_max': prices.max(),
        'brand_price_median': prices.median(),
        'brand_price_min': prices.min(),
        'brand_price_sum': prices.sum(),
        'brand_price_std': prices.std(),
        # n + 1 in the denominator is a light smoothing term from the original.
        'brand_price_average': round(prices.sum() / (n + 1), 2),
    }
# One chained expression builds the lookup table: transpose -> reset index -> rename.
brand_fe = pd.DataFrame(all_info).T.reset_index().rename(columns={'index': 'brand'})
data = data.merge(brand_fe, how='left', on='brand')
    
    
    
kind_data
SaleIDnameregDatemodelbrandbodyTypefuelTypegearboxpowerkilometer...v_6v_7v_8v_9v_10v_11v_12v_13v_14train
511651443838719950009244.039NaN0.00.000.5...0.1279471.1261370.0699920.0465781.71770418.1976995.4529851.532134-3.6209421
19963200793838720150402244.0396.00.00.0264.0...0.1281170.0000000.0603310.184668-6.2144280.697001-0.2499553.577009-2.3067801
49501498031818101996110819.0392.00.00.0396.0...0.0000000.0768050.0485030.0352793.142073-1.259462-1.508440-0.897785-0.6598891
6512465532507781991070719.0394.0NaNNaN02.0...0.0004990.0376320.0909400.0360461.606401-3.4367821.914129-0.3745110.1171961
757757625865608200811031.0395.0NaNNaN608.0...0.0002330.0376640.0743470.0723662.118526-2.7837460.3929180.4584081.0056711
9032590920172758200407101.0391.01.00.06015.0...0.0000000.1562220.0576220.0735523.187338-0.808931-0.3503790.0754030.7309901
983869904922825199812031.0390.00.00.04515.0...0.0000000.0864920.0282760.0645953.399385-1.059497-2.862146-0.7582360.0006581
987659943159082199900021.0390.0NaN0.0015.0...0.0000000.0714550.0492270.0455012.968157-1.548087-1.400021-0.6747030.2584031
1262371270713838719950004244.039NaN0.00.000.5...0.1279471.1261370.0699920.0465781.71770418.1976995.4529851.532134-3.6209421

9 rows × 32 columns

all_info[kind]
{'brand_amount': 9,
 'brand_price_max': 14500,
 'brand_price_median': 1900.0,
 'brand_price_min': 750,
 'brand_price_sum': 39480,
 'brand_price_std': 5520.867232600327,
 'brand_price_average': 3948.0}
brand_fe
brandbrand_amountbrand_price_averagebrand_price_maxbrand_price_medianbrand_price_minbrand_price_stdbrand_price_sum
0031429.05527.1968500.03199.013.06261.371627173719698.0
1113656.09082.8684000.06399.015.08988.865406124044603.0
22318.011806.4055800.07500.035.010576.2244443766241.0
332461.06480.1937500.04990.065.05396.32750315954226.0
4416575.08342.1399999.05999.012.08089.863295138279069.0
554662.03305.6731500.02300.020.03344.68976315414322.0
6610193.03576.3735990.01800.013.04562.23333136457518.0
772360.04195.6438900.02600.060.04752.5841549905909.0
882070.04836.8899999.02270.030.06053.23342410017173.0
997299.02439.0868530.01400.050.02975.34288417805271.0
101013994.08076.7692900.05200.015.08244.695287113034210.0
11112944.04549.4134500.02900.030.04722.16049213398006.0
12121108.04052.5727490.02625.050.04066.9599504494303.0
13133813.02799.1135000.01600.020.03073.91519610675790.0
141416073.03053.1738990.01700.012.03605.59512749076652.0
15151458.09851.8345000.08500.0100.05425.05814014373814.0
16162219.03638.9017900.02999.020.02450.9060898078352.0
1717913.03641.8855800.02200.015.03952.9133303328679.0
1818315.04807.1234599.01999.050.06358.4097611519049.0
19191386.05211.4542350.02800.020.06186.5389497228288.0
20201235.03473.0937800.01750.015.04400.5298094292737.0
21211546.05724.9435999.04225.050.05257.2350268856481.0
22221085.06025.2543900.03950.050.05877.1408866543426.0
2323183.03245.2864000.01200.099.07333.695140597132.0
2424630.032365.7399999.027450.015.019855.49520120422776.0
25252059.03648.3222500.02500.025.03556.2498397515546.0
2626878.08239.8199999.05000.011.010282.9872747242792.0
27272049.05298.8162900.04200.035.04853.28924010862559.0
2828633.05321.7039900.03790.080.04509.0363013373957.0
2929406.06041.8419990.05250.0500.03639.7377222459028.0
3030940.04186.1323200.03295.050.03659.5772913939145.0
3131318.01755.9711000.01000.050.01829.079211560155.0
3232588.04006.9533500.02350.050.04394.5960022360095.0
3333201.09107.9365000.05600.0980.09637.1353231839801.0
3434227.01016.562900.0999.060.0554.118445231776.0
3535180.01646.2828900.0950.050.03325.933365297977.0
3636228.03563.3220900.02250.0150.03922.715389816001.0
3737331.016180.2586500.013250.0550.013541.1803155371844.0
383865.03266.978999.02850.099.02140.083145215620.0
39399.03948.0014500.01900.0750.05520.86723339480.0
data
SaleIDbodyTypebrandcreatDatefuelTypegearboxkilometermodelnamenotRepairedDamage...v_9used_timecitybrand_amountbrand_price_averagebrand_price_maxbrand_price_medianbrand_price_minbrand_price_stdbrand_price_sum
001.06201604040.00.012.530.07360.0...0.0974624385.0110193.03576.3735990.01800.013.04562.23333136457518.0
112.01201603090.00.015.040.02262-...0.0205824757.0413656.09082.8684000.06399.015.08988.865406124044603.0
221.015201604020.00.012.5115.0148740.0...0.0270754382.021458.09851.8345000.08500.0100.05425.05814014373814.0
330.010201603120.01.015.0109.0718650.0...0.0000007125.013994.08076.7692900.05200.015.08244.695287113034210.0
441.05201603130.00.05.0110.01110800.0...0.1215341531.064662.03305.6731500.02300.020.03344.68976315414322.0
550.010201603191.00.010.024.01376420.0...0.0487692482.0313994.08076.7692900.05200.015.08244.695287113034210.0
660.04201603170.01.015.013.024020.0...0.0281746185.0316575.08342.1399999.05999.012.08089.863295138279069.0
771.014201603260.00.015.026.01653460.0...0.0824136108.0416073.03053.1738990.01700.012.03605.59512749076652.0
882.01201603261.01.015.019.029740.0...0.0243884798.0413656.09082.8684000.06399.015.08988.865406124044603.0
995.07201604020.00.015.07.0820210.0...0.0987276666.02360.04195.6438900.02600.060.04752.5841549905909.0
10103.09201603201.00.015.019.0189610.0...0.0607943874.017299.02439.0868530.01400.050.02975.34288417805271.0
11112.06201603260.00.02.01.0744950.0...0.0300192936.0510193.03576.3735990.01800.013.04562.23333136457518.0
12121.014201603210.00.06.048.01201030.0...0.0742505493.0216073.03053.1738990.01700.012.03605.59512749076652.0
13130.01201603260.00.015.065.081291.0...0.0513594154.0313656.09082.8684000.06399.015.08988.865406124044603.0
1414NaN020160402NaN0.015.01.01896-...0.020187NaN331429.05527.1968500.03199.013.06261.371627173719698.0
15156.027201603310.00.015.0138.0845460.0...0.0751086260.022049.05298.8162900.04200.035.04853.28924010862559.0
16160.01201603061.01.012.5105.0100360.0...0.0886951638.013656.09082.8684000.06399.015.08988.865406124044603.0
17176.021201604040.00.015.0114.0297560.0...0.0645523896.01546.05724.9435999.04225.050.05257.2350268856481.0
18181.014201603130.00.015.048.0104088-...0.0762036363.0116073.03053.1738990.01700.012.03605.59512749076652.0
19190.00201603280.00.015.00.0157380.0...0.0354645866.031429.05527.1968500.03199.013.06261.371627173719698.0
20201.014201603110.00.015.048.01486690.0...0.062781NaN16073.03053.1738990.01700.012.03605.59512749076652.0
21210.0020160403NaNNaN15.08.012784-...0.0507134925.0131429.05527.1968500.03199.013.06261.371627173719698.0
22221.014201603260.00.015.026.0131637-...0.093230NaN516073.03053.1738990.01700.012.03605.59512749076652.0
23235.07201603170.00.015.078.089491.0...0.0940008021.012360.04195.6438900.02600.060.04752.5841549905909.0
24241.016201603271.01.05.021.0248220.0...0.0725641603.02219.03638.9017900.02999.020.02450.9060898078352.0
25252.03201603111.00.015.03.0128770.0...0.0953503712.02461.06480.1937500.04990.065.05396.32750315954226.0
26260.04201604011.00.015.04.019830.0...0.0169125112.0116575.08342.1399999.05999.012.08089.863295138279069.0
27272.010201603311.00.015.031.044151.0...0.0273685226.0113994.08076.7692900.05200.015.08244.695287113034210.0
28285.010201603140.01.010.0121.01293420.0...0.0232946159.0513994.08076.7692900.05200.015.08244.695287113034210.0
29290.07201603110.00.012.57.043650.0...0.1061914538.022360.04195.6438900.02600.060.04752.5841549905909.0
..................................................................
1990072499702.02220160309NaN0.03.095.020942-...0.0895065054.041085.06025.2543900.03950.050.05877.1408866543426.0
1990082499710.010201603200.01.015.017.0818730.0...0.0082086799.0313994.08076.7692900.05200.015.08244.695287113034210.0
1990092499722.06201603310.0NaN12.546.0329740.0...0.0474945928.0210193.03576.3735990.01800.013.04562.23333136457518.0
1990102499736.026201604030.01.012.51.0460580.0...0.0125883012.03878.08239.8199999.05000.011.010282.9872747242792.0
1990112499740.020201603120.01.015.0148.083094-...0.1464535793.041235.03473.0937800.01750.015.04400.5298094292737.0
1990122499753.00201603101.00.015.044.01119490.0...0.0000005173.0431429.05527.1968500.03199.013.06261.371627173719698.0
1990132499760.04201603191.01.015.013.023520.0...0.0208354004.016575.08342.1399999.05999.012.08089.863295138279069.0
1990142499770.014201603280.00.015.051.0574661.0...0.1017487078.0416073.03053.1738990.01700.012.03605.59512749076652.0
1990152499781.00201604030.00.015.029.0159210.0...0.0707007421.0231429.05527.1968500.03199.013.06261.371627173719698.0
1990162499791.00201603160.00.07.091.059586-...0.1124852812.0231429.05527.1968500.03199.013.06261.371627173719698.0
1990172499804.00201603240.00.04.00.027150.0...0.0536081719.0131429.05527.1968500.03199.013.06261.371627173719698.0
1990182499814.00201603300.01.05.00.01646140.0...0.0599631692.0431429.05527.1968500.03199.013.06261.371627173719698.0
1990192499820.010201603310.01.015.0109.0237070.0...0.0007013489.0113994.08076.7692900.05200.015.08244.695287113034210.0
1990202499830.00201604030.00.06.00.021210.0...0.0577072036.0531429.05527.1968500.03199.013.06261.371627173719698.0
1990212499841.09201604020.00.015.010.01549690.0...0.1150715260.017299.02439.0868530.01400.050.02975.34288417805271.0
1990222499851.020201603230.00.08.071.0909401.0...0.0988302881.01235.03473.0937800.01750.015.04400.5298094292737.0
1990232499860.09201603130.01.015.0119.08401.0...0.0937197092.07299.02439.0868530.01400.050.02975.34288417805271.0
1990242499874.01201603140.00.015.054.01620620.0...0.0532117675.0213656.09082.8684000.06399.015.08988.865406124044603.0
1990252499880.00201603210.00.015.08.0733320.0...0.0445683664.0231429.05527.1968500.03199.013.06261.371627173719698.0
1990262499892.00201603161.00.015.08.01076480.0...0.0267604147.031429.05527.1968500.03199.013.06261.371627173719698.0
1990272499903.05201603310.00.015.019.061395-...0.0511086051.034662.03305.6731500.02300.020.03344.68976315414322.0
1990282499912.010201604030.01.015.017.0722770.0...0.0024774532.013994.08076.7692900.05200.015.08244.695287113034210.0
1990292499921.06201603310.00.08.041.0297380.0...0.1166403096.0310193.03576.3735990.01800.013.04562.23333136457518.0
1990302499932.04201603280.01.015.013.0350.0...0.0180486834.0316575.08342.1399999.05999.012.08089.863295138279069.0
1990312499940.04201603300.00.015.04.0419190.0...0.0240333888.0516575.08342.1399999.05999.012.08089.863295138279069.0
1990322499950.0420160309NaN1.015.04.0111443-...0.0392724173.0516575.08342.1399999.05999.012.08089.863295138279069.0
1990332499960.01201603230.00.04.065.01528340.0...0.0678411079.0513656.09082.8684000.06399.015.08988.865406124044603.0
1990342499970.04201603160.01.012.54.01325310.0...0.0429664113.0316575.08342.1399999.05999.012.08089.863295138279069.0
1990352499984.01201603270.01.015.040.01434050.0...0.0090065017.013656.09082.8684000.06399.015.08988.865406124044603.0
1990362499991.08201604010.00.03.032.0782020.0...0.1101802459.042070.04836.8899999.02270.030.06053.23342410017173.0

199037 rows × 41 columns

'''
以'power'为例进行分桶

'''
# Renamed from `bin`, which shadowed the built-in of the same name.
bin_edges = [i * 10 for i in range(31)]  # edges 0, 10, ..., 300
# labels=False makes pd.cut return the 0-based bucket index instead of an
# Interval. Bins are left-open/right-closed, so power == 0 and power > 300
# fall outside every bin and become NaN — that explains the NaN seen below.
data['power_bin'] = pd.cut(data['power'], bin_edges, labels=False)
data[['power_bin', 'power']].head()
power_binpower
05.060
1NaN0
216.0163
319.0193
46.068
# The raw date and region columns have been fully distilled into
# used_time / city / power_bin, so the originals can go.
data = data.drop(columns=['creatDate', 'regDate', 'regionCode'])
print(data.shape)
data.columns
(199037, 39)





Index(['SaleID', 'bodyType', 'brand', 'fuelType', 'gearbox', 'kilometer',
       'model', 'name', 'notRepairedDamage', 'offerType', 'power', 'price',
       'seller', 'train', 'v_0', 'v_1', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14',
       'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'used_time',
       'city', 'brand_amount', 'brand_price_average', 'brand_price_max',
       'brand_price_median', 'brand_price_min', 'brand_price_std',
       'brand_price_sum', 'power_bin'],
      dtype='object')
# The data is now ready for tree models; export and save a copy.
data.to_csv(os.path.join(output_path,'data_for_tree.csv'),index=0)#index=0 (falsy): don't write the row index as an extra column
'''
我们可以再构造一份特征给LR/ NN之类的模型用
之所以分开构造,是因为不同模型对数据集的要求不同
让我们先看下数据分布:
'''
# Inspect the distribution of 'power' before deciding how to transform it
# for the LR/NN feature set (different model families need different prep).
data['power'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fdf011cc0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-GGmMGiib-1588387457101)(output_25_1.png)]

'''
我们刚刚已经对train进行异常值处理了,但是现在还有这么奇怪的分布时因为test中的Power异常值,
也可以看出,对于train中power的异常值不删除为好,可以用长尾分布阶段来代替。
'''
# NOTE(review): the skewed combined distribution above is driven by power
# outliers in the *test* set, which must be kept; truncating the long tail
# is the suggested alternative to deleting rows.
Train_data['power'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fdeeb96a0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JDTWNOEz-1588387457103)(output_26_1.png)]

# Log-transform 'power' to tame the long tail, then min-max scale to [0, 1].
# NOTE: the original imported sklearn.preprocessing and bound the *module*
# to `min_max_scaler` without ever using it (it was not even a scaler
# instance); the dead import/alias is removed and numpy does the scaling.
data['power'] = np.log(data['power'] + 1)
data['power'] = ((data['power'] - np.min(data['power'])) /
                 (np.max(data['power']) - np.min(data['power'])))
data['power'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fce26c198>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-fQxQT8oC-1588387457106)(output_27_1.png)]

#再看KM,KM的比较正常,应该是已经做过分桶了
data['kilometer'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fce2fb748>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vk7A063m-1588387457109)(output_28_1.png)]

# 'kilometer' already looks pre-binned (see the histogram above), so a plain
# min-max scaling to [0, 1] is enough — no log transform needed.
km = data['kilometer']
data['kilometer'] = (km - km.min()) / (km.max() - km.min())
data['kilometer'].plot.hist()
<matplotlib.axes._subplots.AxesSubplot at 0x27fcf607470>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VqZtsr6Q-1588387457110)(output_29_1.png)]

'''
除此之外,还有我们刚刚构造的统计量特征:
'brand_amount', 'brand_price_average', 'brand_price_max',
       'brand_price_median', 'brand_price_min', 'brand_price_std',
       'brand_price_sum'
这里不在做分析,直接做变换
'''
def max_min(x):
    """Min-max scale *x* into the [0, 1] range."""
    lo = np.min(x)
    hi = np.max(x)
    return (x - lo) / (hi - lo)
# Min-max scale the seven brand statistics built earlier.
# BUG FIX: the original normalized 'brand_amount' as
#     (x - min) / max - min        # parentheses missing around (max - min)
# which divides only by max and then subtracts min — not a [0, 1] scaling.
# All seven columns now go through the max_min helper defined above.
for col in ['brand_amount', 'brand_price_average', 'brand_price_max',
            'brand_price_median', 'brand_price_min', 'brand_price_std',
            'brand_price_sum']:
    data[col] = max_min(data[col])
# One-hot encode the categorical features. pd.get_dummies produces dummy /
# indicator variables — "dummy variables" and "one-hot encoding" name the
# same idea: one 0/1 column per category level.
data=pd.get_dummies(data,columns=['model','brand','bodyType','fuelType',
                                 'gearbox','notRepairedDamage','power_bin'])
print(data.shape)
data.columns
(199037, 370)





Index(['SaleID', 'kilometer', 'name', 'offerType', 'power', 'price', 'seller',
       'train', 'v_0', 'v_1',
       ...
       'power_bin_20.0', 'power_bin_21.0', 'power_bin_22.0', 'power_bin_23.0',
       'power_bin_24.0', 'power_bin_25.0', 'power_bin_26.0', 'power_bin_27.0',
       'power_bin_28.0', 'power_bin_29.0'],
      dtype='object', length=370)
# This version (log-scaled, normalized, one-hot encoded) suits LR / NN models.
data.to_csv(os.path.join(output_path,'data_for_lr.csv'),index=0)

3.3.3特征筛选

1)过滤式

# Filter-style feature screening: Spearman rank correlation of each
# engineered feature against the target price.
for col in ['power', 'kilometer', 'brand_amount', 'brand_price_average',
            'brand_price_max', 'brand_price_median']:
    print(data[col].corr(data['price'], method='spearman'))
0.5728285196051496
-0.4082569701616764
-0.058156610025581514
0.3834909576057687
0.259066833880992
0.38691042393409447
# The same relationships, visualised: pairwise correlation heatmap of the
# selected numeric features.
data_numeric=data[['power','kilometer','brand_amount',"brand_price_average",
                  'brand_price_max',"brand_price_median"]]
correlation=data_numeric.corr()
f,ax=plt.subplots(figsize=(7,7))
plt.title('Correlation of Numeric Features with Price',y=1,size=16)
sns.heatmap(correlation,square=True,vmax=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x27fd02a2a58>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-SwIJrBw4-1588387457113)(output_36_1.png)]

2)包裹式

# Wrapper-style feature selection with mlxtend's SequentialFeatureSelector.
# (mlxtend is a third-party package; downloads can be slow.)
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
sfs = SFS(LinearRegression(),
          k_features=10,       # stop once 10 features are selected
          forward=True,        # forward selection: add one feature at a time
          floating=False,
          scoring='r2',
          cv=0)                # cv=0: score on the training data itself
x = data.drop(['price'], axis=1)
x = x.fillna(0)                # LinearRegression cannot handle NaNs
y = data['price']
sfs.fit(x, y)
# BUG FIX: the fitted selector exposes `k_feature_names_`;
# `f_feature_names_` does not exist and raises AttributeError.
sfs.k_feature_names_
---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

<ipython-input-66-50db1217b888> in <module>
----> 1 from mlxtend.feature_selection import SequentialFeatureSelector as SFS
      2 from sklearn.linear_model import LinearRegression
      3 sfs=SFS(LinearRegression(),
      4        k_features=10,
      5        forward=True,


ModuleNotFoundError: No module named 'mlxtend'
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
# Plot the score (with std-dev band) at each step of the sequential selection.
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.grid()
plt.show()

3)嵌入式

#下一章介绍:嵌入式特征选择由Lasso回归和决策树在模型训练中自动完成
#(Embedded selection — Lasso / tree models — is introduced in the next chapter;
# in practice it is the most commonly used approach to feature screening.)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值