开发环境jupyter notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings("ignore")
train = pd.read_csv("train.csv")
evaluate = pd.read_csv("test.csv")
evaluate.head()
evaluate.info()
1 数据类型 修改
clear_dtypes = train.drop('SalePrice', axis=1).dtypes
train['GarageQual'].value_counts()
train.dtypes['GarageQual']
train = train.replace({
"BsmtCond" : {"No": 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
"BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
"BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
"ALQ" : 5, "GLQ" : 6},
"BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
"ALQ" : 5, "GLQ" : 6},
"BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
"ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
"ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
"FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
"Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5,
"Min2" : 6, "Min1" : 7, "Typ" : 8},
"GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
"GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
"HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
"KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
"LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
"LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
"PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
"PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
"Street" : {"Grvl" : 1, "Pave" : 2},
"Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}
})
train['MSSubClass'].dtypes
train['YrSold'].dtypes
train['MoSold'].dtypes
train['MSSubClass'] = train['MSSubClass'].astype(str)
train['YearBuilt'] = train['YearBuilt'].astype(str)
train['YrSold'] = train['YrSold'].astype(str)
train['MoSold'] = train['MoSold'].astype(str)
2 重复样本处理— 删除重复样本
train.duplicated().sum()
train.drop_duplicates(inplace=True)
3 缺失值处理
3.1 行列–缺失值>40%删除处理
(train.isnull().sum(axis=1)/train.shape[1] > 0.4).sum()
nans_del_indexs = train.index[(train.isnull()\
.sum(axis=1)/train.shape[1] > 0.4)]
train.drop(labels=nans_del_indexs, axis=0, inplace=True)
def anova(train_cate_y, categorical, y):
anv = pd.DataFrame(index=categorical)
anv['feature'] = categorical
pvals = []
for c in categorical:
print(c)
samples = []
for cls in train_cate_y[c].dropna().unique():
s = train_cate_y[train_cate_y[c] == cls][y].values
samples.append(s)
pval = stats.f_oneway(*samples)[1]
pvals.append(pval)
anv['pval'] = pvals
return anv.sort_values('pval')
categorical = [column for column in train\
.columns if train.dtypes[column] == 'object']
y = 'SalePrice'
corr_cate = anova(train, categorical, y)
corr_cate['disparity'] = np.log(20*1./corr_cate['pval']\
.values)/np.log(20)
NAs = pd.concat([(100*train.isnull().sum(axis=0)/train.shape[0]),
100*evaluate.isnull().sum()/evaluate.shape[0]],
axis=1, keys=['Train', 'evaluate'])
NAs['type'] = train.dtypes[NAs.index]
NAs['corr_conti_y'] = train.corr()['SalePrice']
NAs['corr_cate_y'] = corr_cate['disparity']
NAs = NAs[NAs[['Train', 'evaluate']].sum(axis=1) > 0]\
.sort_values(by='Train', ascending=False)
NAs.head()
~
| Train | evaluate | type | corr_conti_y | corr_cate_y |
---|
PoolQC | 99.520548 | 99.794380 | float64 | 0.447936 | NaN |
MiscFeature | 96.301370 | 96.504455 | object | NaN | 1.753202 |
Alley | 93.767123 | 92.666210 | object | NaN | 6.618486 |
Fence | 80.753425 | 80.123372 | object | NaN | 3.026003 |
FireplaceQu | 47.260274 | 50.034270 | float64 | 0.295794 | NaN |
clear_nans_del_cols = NAs.index[((NAs[['Train', 'evaluate']]>40).sum(axis=1)>0).values]
clear_nans_del_cols
train.drop(labels=clear_nans_del_cols, axis=1, inplace=True)
3.2 统一填充 缺失率少于1%的列
clear_nans_less1_cols = NAs.index[((NAs[['Train', 'evaluate']]<1).sum(axis=1)==2).values].values
clear_nans_less1_cols_num = train[clear_nans_less1_cols].select_dtypes(include=[np.number]).columns
clear_nans_less1_cols_class = train[clear_nans_less1_cols].select_dtypes(include=[np.object]).columns
clear_nans_less1_cols_num_median = train[clear_nans_less1_cols_num].median()
clear_nans_less1_cols_class_mode = train[clear_nans_less1_cols_class].mode()
train[clear_nans_less1_cols_num] = train[clear_nans_less1_cols_num]\
.fillna(clear_nans_less1_cols_num_median)
train[clear_nans_less1_cols_class] = train[clear_nans_less1_cols_class]\
.fillna(clear_nans_less1_cols_class_mode)
3.3 相关性高的 连续型变量业务填充
"""
GarageYrBlt 车库建成日期 GarageYrBlt:0.055479,0.053425 float64 0.486362
车库的五个变量GarageType、GarageYrBlt、GarageFinish、GarageQual、GarageCond是由于房子没有车库而缺失。
GarageYrBlt 为车库的年份,我们用房子的建造年份来替代
MasVnrArea 外墙装饰材料的面积,Na即面积为0 MasVnrArea,0.005479,0.010274,float64,0.477493
LotFrontage 房子与街道的距离,Na即距离为0 LotFrontage,0.177397,0.155479,float64,0.351799
"""
train.loc[train['GarageYrBlt'].isnull(), 'GarageYrBlt'] = train.loc[train['GarageYrBlt']\
.isnull(), 'YearBuilt']
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train['LotFrontage'] = train['LotFrontage'].fillna(0)
3.4 区分度高的分类型变量业务填充
"""
BsmtQual 地下室质量 BsmtQual,0.025342,0.030137,object,NaN,143.208288
BsmtExposure、BsmtFinType2、BsmtQual、BsmtCond、BsmtFinType1地下室变量,房子没有地下室而缺失。
KitchenQual: String, //厨房质量, 这里用众数替换
"""
clear_KitchenQual_mode = train['KitchenQual'].mode()
train['BsmtQual'].fillna('None', inplace=True)
train['KitchenQual'].fillna(clear_KitchenQual_mode, inplace=True)
3.5 统一填充剩余变量
remain_col = train.columns[train.isnull().sum(axis=0)>0]
NAs.loc[remain_col, ]
~
| Train | evaluate | type | corr_conti_y | corr_cate_y |
---|
MasVnrType | 0.547945 | 1.096642 | object | NaN | 50.437263 |
BsmtCond | 2.534247 | 3.084304 | float64 | 0.160658 | NaN |
BsmtExposure | 2.602740 | 3.015764 | float64 | 0.352958 | NaN |
BsmtFinType1 | 2.534247 | 2.878684 | float64 | 0.277436 | NaN |
BsmtFinType2 | 2.602740 | 2.878684 | float64 | -0.040139 | NaN |
Electrical | 0.068493 | 0.000000 | object | NaN | 14.665360 |
GarageType | 5.547945 | 5.209047 | object | NaN | 51.655312 |
GarageFinish | 5.547945 | 5.346127 | object | NaN | 72.421212 |
GarageQual | 5.547945 | 5.346127 | float64 | 0.156693 | NaN |
GarageCond | 5.547945 | 5.346127 | float64 | 0.125013 | NaN |
clear_nans_remain_cols_num = train[remain_col].select_dtypes(include=[np.number]).columns
clear_nans_remain_cols_class = train[remain_col].select_dtypes(include=[np.object]).columns
train[clear_nans_remain_cols_num] = train[clear_nans_remain_cols_num].fillna(0)
train[clear_nans_remain_cols_class] = train[clear_nans_remain_cols_class].fillna('None')
train.isnull().sum(axis=0).sum()
4 连续型变量奇异值处理
train.SalePrice.plot(kind='box',sym='b*')
train_number = train.select_dtypes(include = [np.number]).drop(['Id'], axis=1)
corr = train.corr()
corr_threshvalue = 0.5
corr_cols = corr.loc[:, corr.loc['SalePrice',:]\
.abs() > corr_threshvalue].columns
train_number[corr_cols].plot(sym='b*', kind='box', subplots=True, figsize=(16, 4))
clear_number_q = train_number.quantile(q=[0, 0.05, 0.25, 0.5, 0.75, 0.95, 1], axis=0)
lower_outlier_expand = 1.5
upper_outlier_expand = 3
clear_number_q.loc['lower_outlier',:] = clear_number_q.loc[0.25, :]\
- (clear_number_q.loc[0.75, :]\
- clear_number_q.loc[0.25, :])*lower_outlier_expand
clear_number_q.loc['upper_outlier',:] = clear_number_q.loc[0.75, :]\
+ (clear_number_q.loc[0.75, :]\
- clear_number_q.loc[0.25, :])*upper_outlier_expand
uppper_cnt = (train_number > clear_number_q.loc['upper_outlier', :]).sum()
uppper_cnt
lower_cnt = (train_number < clear_number_q.loc['lower_outlier', :]).sum()
lower_cnt
pd.DataFrame({'uppper_cnt': uppper_cnt,
'lower_cnt': lower_cnt,
'all': uppper_cnt+lower_cnt,
'all_rate': 100*(uppper_cnt+lower_cnt)/train.shape[0],
'corr': corr.loc['SalePrice',:]
}).sort_values(by='corr', ascending=False)
4.1 y变量处理–重点变量处理
train.ix[train['SalePrice'] > clear_number_q.loc['upper_outlier', 'SalePrice'], 'SalePrice'] = clear_number_q.loc['upper_outlier', 'SalePrice']
train.ix[train['SalePrice'] < clear_number_q.loc['lower_outlier', 'SalePrice'], 'SalePrice'] = clear_number_q.loc['lower_outlier', 'SalePrice']
var = 'GrLivArea'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
train.sort_values(by = 'GrLivArea', ascending = False)[:2]
train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)
4.2 其余变量统一处理
clear_number_para = train_number.drop('SalePrice', axis=1).columns
train[clear_number_para] = train[clear_number_para]\
.where(train[clear_number_para] < clear_number_q.loc['upper_outlier', :],
clear_number_q.loc['upper_outlier', :], axis=1)
train[clear_number_para] = train[clear_number_para]\
.where(train[clear_number_para] > clear_number_q.loc['lower_outlier', :],
clear_number_q.loc['lower_outlier', :], axis=1)
train[corr_cols].plot(sym='b*', kind='box', subplots=True, figsize=(20, 8))