import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from operator import itemgetter
%matplotlib inline
# Load the training and test sets; both files are read as space-delimited.
train = pd.read_csv('train.csv', sep=' ')
test = pd.read_csv('testA.csv', sep=' ')
# Report (rows, columns) of each frame as a quick sanity check.
print(train.shape)
print(test.shape)
def outliers_proc(data, col_name, scale=3, plot=True):
    """Remove rows whose ``col_name`` value is a box-plot outlier.

    A value is treated as an outlier when it lies more than
    ``scale * IQR`` below the first quartile or above the third quartile
    of the column. The input frame is left untouched; a filtered copy with
    a fresh 0..n-1 index is returned. Summary statistics of the removed
    values are printed along the way.

    :param data: pandas DataFrame to clean.
    :param col_name: name of the column to inspect.
    :param scale: multiplier applied to the inter-quartile range
        (defaults to 3).
    :param plot: when true (the default), draw before/after box plots with
        seaborn; pass ``False`` to skip plotting entirely.
    :return: a new DataFrame without the outlier rows, index reset.
    """

    def box_plot_outliers(data_ser, box_scale):
        """Flag the box-plot outliers of a Series.

        :param data_ser: pandas Series to inspect.
        :param box_scale: multiplier applied to the inter-quartile range.
        :return: ``((rule_low, rule_up), (val_low, val_up))`` -- boolean
            masks for values below / above the bounds, and the bounds.
        """
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        iqr = box_scale * (q3 - q1)
        val_low = q1 - iqr
        val_up = q3 + iqr
        rule_low = data_ser < val_low
        rule_up = data_ser > val_up
        return (rule_low, rule_up), (val_low, val_up)

    data_series = data[col_name]
    (rule_low, rule_up), _bounds = box_plot_outliers(data_series, box_scale=scale)

    # Work with plain boolean arrays so that filtering is purely positional.
    # Dropping by position-as-label (the previous approach) removes the wrong
    # rows, or raises KeyError, whenever the frame has a non-default index.
    low_mask = rule_low.values
    up_mask = rule_up.values
    outlier_mask = low_mask | up_mask

    print("Delete number is: {}".format(int(outlier_mask.sum())))
    data_n = data.loc[~outlier_mask].reset_index(drop=True)
    # NOTE: the message says "column number" but the value is the row count.
    print("Now column number is: {}".format(data_n.shape[0]))

    print("Description of data less than the lower bound is:")
    print(data_series[low_mask].describe())
    print("Description of data larger than the upper bound is:")
    print(data_series[up_mask].describe())

    if plot:
        # Left: original distribution; right: distribution after removal.
        fig, ax = plt.subplots(1, 2, figsize=(10, 7))
        sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=ax[0])
        sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=ax[1])
    return data_n
# We can drop some outlier rows; 'power' is used as the example here.
# Whether to drop them is a judgement call -- but never drop rows from the
# test set, since that would only be fooling ourselves.
train = outliers_proc(data=train, col_name='power', scale=3)
Delete number is: 963
Now column number is: 149037
Description of data less than the lower bound is:
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: power, dtype: float64
Description of data larger than the upper bound is:
count 963.000000
mean 846.836968
std 1929.418081
min 376.000000
25% 400.000000
50% 436.000000
75% 514.000000
max 19312.000000
Name: power, dtype: float64
特征构造
# Stack the training and test sets so features can be built on both at once.
# The 'train' column records each row's origin: 1 = training set, 0 = test set.
frames = [train, test]
for origin_flag, frame in zip((1, 0), frames):
    frame['train'] = origin_flag
data = pd.concat(frames, sort=False, ignore_index=True)
# Alternatively, inspect the pairwise correlations visually as a heatmap.
numeric_columns = [
    'power',
    'kilometer',
    'brand_amount',
    'brand_price_average',
    'brand_price_max',
    'brand_price_median',
]
data_numeric = data[numeric_columns]
correlation = data_numeric.corr()
f, ax = plt.subplots(figsize=(7, 7))
# NOTE(review): the title mentions price, but 'price' is not among the
# selected columns -- confirm whether it should be included.
ax.set_title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0x129059470>
2) 包裹式
!pip install mlxtend
# A large k_features is very slow to run without a server, so the search
# below was interrupted by hand before it finished.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression

# Forward (non-floating) sequential selection of up to 10 features, scored
# by R^2 with cross-validation disabled (cv=0).
sfs = SFS(
    LinearRegression(),
    k_features=10,
    forward=True,
    floating=False,
    scoring='r2',
    cv=0,
)
# Every column except the target is a candidate feature; missing values are
# replaced with 0 before fitting.
x = data.drop(['price'], axis=1)
x = x.fillna(0)
y = data['price']
sfs.fit(x, y)
# Names of the features selected so far (displayed as the cell result).
sfs.k_feature_names_
STOPPING EARLY DUE TO KEYBOARD INTERRUPT...
('powerPS_ten',
'city',
'brand_price_std',
'vehicleType_andere',
'model_145',
'model_601',
'fuelType_andere',
'notRepairedDamage_ja')
# Plot the selection results to see the marginal gain of each added feature.
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# kind='std_dev' asks plot_sfs to draw standard-deviation error bands.
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.grid()
plt.show()
/Users/chenze/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:140: RuntimeWarning: Degrees of freedom <= 0 for slice
keepdims=keepdims)
/Users/chenze/anaconda3/lib/python3.7/site-packages/numpy/core/_methods.py:132: RuntimeWarning: invalid value encountered in double_scalars
ret = ret.dtype.type(ret / rcount)