基于Kaggle的经典AI项目二—数据清洗

开发环境jupyter notebook
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"          

import warnings
warnings.filterwarnings("ignore")                        #忽略警告

train = pd.read_csv("train.csv")
evaluate = pd.read_csv("test.csv")
evaluate.head()                                          #查看
evaluate.info()                                          #查看数据信息 
1 数据类型 修改
clear_dtypes = train.drop('SalePrice', axis=1).dtypes    #修改数据类型
train['GarageQual'].value_counts()
train.dtypes['GarageQual'] 

# 有序型变量转换 
train = train.replace({    
                       "BsmtCond" : {"No": 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "BsmtExposure" : {"No" : 0, "Mn" : 1, "Av": 2, "Gd" : 3},
                       "BsmtFinType1" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtFinType2" : {"No" : 0, "Unf" : 1, "LwQ": 2, "Rec" : 3, "BLQ" : 4,
                                         "ALQ" : 5, "GLQ" : 6},
                       "BsmtQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA": 3, "Gd" : 4, "Ex" : 5},
                       "ExterCond" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "ExterQual" : {"Po" : 1, "Fa" : 2, "TA": 3, "Gd": 4, "Ex" : 5},
                       "FireplaceQu" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "Functional" : {"Sal" : 1, "Sev" : 2, "Maj2" : 3, "Maj1" : 4, "Mod": 5,
                                       "Min2" : 6, "Min1" : 7, "Typ" : 8},
                       "GarageCond" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "GarageQual" : {"No" : 0, "Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "HeatingQC" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
                       "LandSlope" : {"Sev" : 1, "Mod" : 2, "Gtl" : 3},
                       "LotShape" : {"IR3" : 1, "IR2" : 2, "IR1" : 3, "Reg" : 4},
                       "PavedDrive" : {"N" : 0, "P" : 1, "Y" : 2},
                       "PoolQC" : {"No" : 0, "Fa" : 1, "TA" : 2, "Gd" : 3, "Ex" : 4},
                       "Street" : {"Grvl" : 1, "Pave" : 2},
                       "Utilities" : {"ELO" : 1, "NoSeWa" : 2, "NoSewr" : 3, "AllPub" : 4}
                      })  

# 查看数据类型并修改
train['MSSubClass'].dtypes                              # MSSubClass: 出售的房屋类型:分类型
train['YrSold'].dtypes                                  # 房子的出售年月
train['MoSold'].dtypes 

train['MSSubClass'] = train['MSSubClass'].astype(str)   # 修改数据类型
train['YearBuilt'] = train['YearBuilt'].astype(str)
train['YrSold'] = train['YrSold'].astype(str)
train['MoSold'] = train['MoSold'].astype(str)
2 重复样本处理— 删除重复样本
train.duplicated().sum()                                # 统计重复样本个数 
train.drop_duplicates(inplace=True)                     # 删除重复样本 
3 缺失值处理
3.1 行列–缺失值>40%删除处理
# 行缺失处理--删除缺失率>40%的行

(train.isnull().sum(axis=1)/train.shape[1] > 0.4).sum() # 统计缺失率>40%的行数;train.shape[1]:所有列76列

nans_del_indexs = train.index[(train.isnull()\          # 删除40%的数据都缺失的行
                       .sum(axis=1)/train.shape[1] > 0.4)]
train.drop(labels=nans_del_indexs, axis=0, inplace=True)
# 列缺失处理 

def anova(train_cate_y, categorical, y):                        # 类别性变量区分度计算
    anv = pd.DataFrame(index=categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        print(c)
        samples = []
        for cls in train_cate_y[c].dropna().unique():           # 删除每一列数据 Na 值,去重
            s = train_cate_y[train_cate_y[c] == cls][y].values  # 获取处理好列的 SalePrice值
            samples.append(s)                                   # 某特征下不同取值对应的房价组合形成二维列表
        pval = stats.f_oneway(*samples)[1]                      # 一元方差分析得到F,P:P越小,对方差的影响越大。
        pvals.append(pval)
    anv['pval'] = pvals
    return anv.sort_values('pval')

categorical = [column for column in train\
               .columns if train.dtypes[column] == 'object']    # 类型变量集合
y = 'SalePrice'
corr_cate = anova(train, categorical, y)
corr_cate['disparity'] = np.log(20*1./corr_cate['pval']\
                                .values)/np.log(20)             # 区分度 

# 统计每个列缺失值比例:
NAs = pd.concat([(100*train.isnull().sum(axis=0)/train.shape[0]),
                 100*evaluate.isnull().sum()/evaluate.shape[0]], 
                 axis=1, keys=['Train', 'evaluate'])
NAs['type'] = train.dtypes[NAs.index]

NAs['corr_conti_y'] = train.corr()['SalePrice']                 # 加入连续性变量相关度
NAs['corr_cate_y'] = corr_cate['disparity']                     # 加入类别性变量区分度

NAs = NAs[NAs[['Train', 'evaluate']].sum(axis=1) > 0]\          # 只保留有缺失值的列
      .sort_values(by='Train', ascending=False)
NAs.head()
  ~  Trainevaluatetypecorr_conti_ycorr_cate_y
PoolQC99.52054899.794380float640.447936NaN
MiscFeature96.30137096.504455objectNaN1.753202
Alley93.76712392.666210objectNaN6.618486
Fence80.75342580.123372objectNaN3.026003
FireplaceQu47.26027450.034270float640.295794NaN
# 删除缺失率超过40%的列 

clear_nans_del_cols = NAs.index[((NAs[['Train', 'evaluate']]>40).sum(axis=1)>0).values]
clear_nans_del_cols 

train.drop(labels=clear_nans_del_cols, axis=1, inplace=True)
3.2 统一填充 缺失率少于1%的列
  • 连续型用中位数
  • 分类型用众数
# 查找 数值型 类别型列

clear_nans_less1_cols = NAs.index[((NAs[['Train', 'evaluate']]<1).sum(axis=1)==2).values].values
clear_nans_less1_cols_num = train[clear_nans_less1_cols].select_dtypes(include=[np.number]).columns
clear_nans_less1_cols_class = train[clear_nans_less1_cols].select_dtypes(include=[np.object]).columns 

# 生成中位数、众数待用
clear_nans_less1_cols_num_median = train[clear_nans_less1_cols_num].median()
clear_nans_less1_cols_class_mode = train[clear_nans_less1_cols_class].mode() 

#  连续型用 中位数填充
#  分类型用 众数填充
train[clear_nans_less1_cols_num] = train[clear_nans_less1_cols_num]\
                                   .fillna(clear_nans_less1_cols_num_median)
train[clear_nans_less1_cols_class] = train[clear_nans_less1_cols_class]\
                                   .fillna(clear_nans_less1_cols_class_mode)
3.3 相关性高的 连续型变量业务填充
""" 
	GarageYrBlt 车库建成日期 GarageYrBlt:0.055479,0.053425  float64	0.486362
	车库的五个变量GarageType、GarageYrBlt、GarageFinish、GarageQual、GarageCond是由于房子没有车库而缺失。
	GarageYrBlt 为车库的年份,我们用房子的建造年份来替代 

	MasVnrArea 外墙装饰材料的面积,Na即面积为0 MasVnrArea,0.005479,0.010274,float64,0.477493
	LotFrontage 房子与街道的距离,Na即距离为0 LotFrontage,0.177397,0.155479,float64,0.351799
"""
train.loc[train['GarageYrBlt'].isnull(), 'GarageYrBlt'] = train.loc[train['GarageYrBlt']\
                                                                    .isnull(), 'YearBuilt']

train['MasVnrArea'] = train['MasVnrArea'].fillna(0)
train['LotFrontage'] = train['LotFrontage'].fillna(0)
3.4 区分度高的分类型变量业务填充
"""
	BsmtQual 地下室质量  BsmtQual,0.025342,0.030137,object,NaN,143.208288
	BsmtExposure、BsmtFinType2、BsmtQual、BsmtCond、BsmtFinType1地下室变量,房子没有地下室而缺失。 
	KitchenQual: String,  //厨房质量, 这里用众数替换
""" 

clear_KitchenQual_mode = train['KitchenQual'].mode()               #获取众数

train['BsmtQual'].fillna('None', inplace=True)
train['KitchenQual'].fillna(clear_KitchenQual_mode, inplace=True)  #使用众数填充
3.5 统一填充剩余变量
  • 连续型用 0
  • 分类型用 None
# 查看剩余变量情况
remain_col = train.columns[train.isnull().sum(axis=0)>0]
NAs.loc[remain_col, ]
  ~  Trainevaluatetypecorr_conti_ycorr_cate_y
MasVnrType0.5479451.096642objectNaN50.437263
BsmtCond2.5342473.084304float640.160658NaN
BsmtExposure2.6027403.015764float640.352958NaN
BsmtFinType12.5342472.878684float640.277436NaN
BsmtFinType22.6027402.878684float64-0.040139NaN
Electrical0.0684930.000000objectNaN14.665360
GarageType5.5479455.209047objectNaN51.655312
GarageFinish5.5479455.346127objectNaN72.421212
GarageQual5.5479455.346127float640.156693NaN
GarageCond5.5479455.346127float640.125013NaN
# 查找:数值型列;类别型列对其进行 0、None 填充
clear_nans_remain_cols_num = train[remain_col].select_dtypes(include=[np.number]).columns
clear_nans_remain_cols_class = train[remain_col].select_dtypes(include=[np.object]).columns

train[clear_nans_remain_cols_num] = train[clear_nans_remain_cols_num].fillna(0)
train[clear_nans_remain_cols_class] = train[clear_nans_remain_cols_class].fillna('None') 

# 查看缺失值是否处理完毕
train.isnull().sum(axis=0).sum()
4 连续型变量奇异值处理
train.SalePrice.plot(kind='box',sym='b*')                # y变量箱线图 

# 查看与y变量 高相关性的变量的箱线图
train_number = train.select_dtypes(include = [np.number]).drop(['Id'], axis=1) 

corr = train.corr()
corr_threshvalue = 0.5                                   #number of variables for heatmap 
corr_cols = corr.loc[:, corr.loc['SalePrice',:]\
                     .abs() > corr_threshvalue].columns  #查找房价大于阈值的列 

train_number[corr_cols].plot(sym='b*', kind='box', subplots=True, figsize=(16, 4)) 

这里写图片描述

# 找到异常值上下临界点
clear_number_q = train_number.quantile(q=[0, 0.05, 0.25, 0.5, 0.75, 0.95, 1], axis=0) 

# outlier_expand(离群拓展)为异常值定义伸缩参数,标准为1.5
lower_outlier_expand = 1.5          
upper_outlier_expand = 3
clear_number_q.loc['lower_outlier',:] = clear_number_q.loc[0.25, :]\
                                        - (clear_number_q.loc[0.75, :]\
                                        - clear_number_q.loc[0.25, :])*lower_outlier_expand
clear_number_q.loc['upper_outlier',:] = clear_number_q.loc[0.75, :]\
                                        + (clear_number_q.loc[0.75, :]\
                                        - clear_number_q.loc[0.25, :])*upper_outlier_expand 

# 统计各个列 高于上临界点的数量
uppper_cnt = (train_number > clear_number_q.loc['upper_outlier', :]).sum()
uppper_cnt 

# 统计各个列 低于下临界点的数量
lower_cnt = (train_number < clear_number_q.loc['lower_outlier', :]).sum()
lower_cnt 

# 汇总各个列的奇异值数量
pd.DataFrame({'uppper_cnt': uppper_cnt, 
              'lower_cnt': lower_cnt, 
              'all': uppper_cnt+lower_cnt,  
              'all_rate': 100*(uppper_cnt+lower_cnt)/train.shape[0],
              'corr': corr.loc['SalePrice',:]
             }).sort_values(by='corr', ascending=False)
4.1 y变量处理–重点变量处理
# 获取超出阈值(上临界,下临界)的SalePrice
train.ix[train['SalePrice'] > clear_number_q.loc['upper_outlier', 'SalePrice'], 'SalePrice'] = clear_number_q.loc['upper_outlier', 'SalePrice']
train.ix[train['SalePrice'] < clear_number_q.loc['lower_outlier', 'SalePrice'], 'SalePrice'] = clear_number_q.loc['lower_outlier', 'SalePrice'] 

# 双变量分析SaleSt/ GraveStand——重点变量
var = 'GrLivArea'
data = pd.concat([train['SalePrice'], train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000)); 

#deleting points 删除点
train.sort_values(by = 'GrLivArea', ascending = False)[:2] 

#删除 ID=1299/524 的行索引
train = train.drop(train[train['Id'] == 1299].index)
train = train.drop(train[train['Id'] == 524].index)
4.2 其余变量统一处理
clear_number_para = train_number.drop('SalePrice', axis=1).columns 

train[clear_number_para] = train[clear_number_para]\
                           .where(train[clear_number_para] < clear_number_q.loc['upper_outlier', :], 
                                  clear_number_q.loc['upper_outlier', :], axis=1)

train[clear_number_para] = train[clear_number_para]\
                           .where(train[clear_number_para] > clear_number_q.loc['lower_outlier', :], 
                                  clear_number_q.loc['lower_outlier', :], axis=1) 

train[corr_cols].plot(sym='b*', kind='box', subplots=True, figsize=(20, 8))

这里写图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

SongpingWang

你的鼓励是我创作的最大动力!

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值