Datawhale竞赛学习-2_DataCleaning

导入相关的包

#载入数据
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as mp
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest

导入数据集

# Read the two competition splits, tag every row with the split it came
# from, and stack them into a single frame for joint inspection.
train_data = pd.read_csv("./train_data.csv")
test_data = pd.read_csv("./test_a.csv")
train_data["Type"] = "Train"
test_data["Type"] = "Test"
all_data = pd.concat([train_data, test_data], ignore_index=True)
# Bare expression: only displays (rows, columns) in an interactive session.
all_data.shape

输出:

缺失值、异常值、object类型分析及处理

def fillnaData(data):
    """Fill gaps, repair placeholder values and encode text columns in place.

    The steps follow the missing-value findings of the task-1 analysis:

    1. ``pv`` and ``uv`` contain nulls; being numeric, they are filled with
       the column mean and then cast to int.
    2. ``rentType`` contains the placeholder ``"--"``, which is replaced by
       ``"未知方式"``.
    3. ``buildYear`` contains the placeholder ``"暂无信息"``, which is replaced
       by the mode of the remaining values before the column is cast to int.
    4. ``tradeTime`` is split on ``/`` into the new ``tradeYear`` (first part)
       and ``tradeMonth`` (second part) columns.  ``tradeTime`` itself is not
       removed by this function.
    5. The remaining text columns are label-encoded with ``LabelEncoder``.

    Args:
        data: DataFrame holding the columns named above; modified in place.

    Returns:
        The same ``data`` object, so the call can be chained.
    """
    # 1. Mean-fill the page-view counters.  Assigning the result back (rather
    #    than calling fillna(inplace=True) on a column selection) is the form
    #    pandas guarantees to update the parent frame.
    for counter in ("pv", "uv"):
        data[counter] = data[counter].fillna(data[counter].mean()).astype("int")

    # 2. Replace the "--" placeholder.  `.loc` writes to the frame itself;
    #    chained `data[col][mask] = ...` may write to a temporary copy.
    data.loc[data["rentType"] == "--", "rentType"] = "未知方式"

    # 3. Replace the "暂无信息" placeholder with the most frequent known value,
    #    then convert the column from object to int.
    year_unknown = data["buildYear"] == "暂无信息"
    buildYearmean = data.loc[~year_unknown, "buildYear"].mode()
    print(buildYearmean)
    data.loc[year_unknown, "buildYear"] = buildYearmean[0]
    data["buildYear"] = data["buildYear"].astype("int")

    # 4. Derive year and month features from the "/"-separated trade date.
    data["tradeYear"] = data["tradeTime"].map(lambda stamp: int(stamp.split("/")[0]))
    data["tradeMonth"] = data["tradeTime"].map(lambda stamp: int(stamp.split("/")[1]))

    # 5. Label-encode the remaining object-typed columns, one fresh encoder
    #    per column.
    encode_features = ['rentType', 'houseType', 'houseFloor', 'houseToward',
                       'houseDecoration', 'communityName', 'region', 'plate']
    for feature in encode_features:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    return data

# Clean the training frame in place, then print the value distribution of
# every text-derived and newly created column.
fillnaData(train_data)
for feature in (
    'rentType', 'houseType', 'houseFloor', 'houseToward', 'houseDecoration',
    'communityName', 'region', 'plate', 'buildYear', 'tradeTime', 'Type',
    "tradeYear", "tradeMonth",
):
    print(feature,"特征分布如下:")
    counts = train_data[feature].value_counts()
    print(counts)

输出:

删除冗余或对目标函数无贡献的特征

# Columns treated as redundant / uninformative for the target.
remove_features = ["ID", "city", "tradeTime"]


def dropFeature(data, features=None):
    """Drop unwanted columns from ``data`` in place.

    Args:
        data: DataFrame to modify.
        features: Iterable of column names to remove.  Defaults to the
            module-level ``remove_features`` list.

    Returns:
        The same ``data`` object, with the columns removed.

    Raises:
        KeyError: If any requested column is absent.  All columns are removed
            in one call, so the frame is left untouched in that case.
    """
    columns = list(remove_features if features is None else features)
    data.drop(columns=columns, inplace=True)
    return data
# Remove the columns listed in remove_features from the training frame.
train_data=dropFeature(train_data)

使用IsolationForest检测并删除异常点

def exceptValueDetect(data, contamination=0.01, random_state=None):
    """Remove rows whose ``tradeMoney`` an IsolationForest flags as outliers.

    The forest is fitted on the single ``tradeMoney`` column; every row it
    predicts as ``-1`` is dropped from ``data`` in place.

    Args:
        data: DataFrame with a ``tradeMoney`` column; modified in place.
        contamination: Passed to ``IsolationForest`` as the expected share of
            outliers.  Defaults to the original hard-coded 0.01.
        random_state: Seed passed to ``IsolationForest``.  The default
            ``None`` keeps the original unseeded behaviour; pass an int to
            make the set of removed rows reproducible between runs.

    Returns:
        The same ``data`` object, with the flagged rows removed.
    """
    # The estimator expects a 2-D array: one row per sample, one feature.
    rents = data["tradeMoney"].values.reshape(-1, 1)
    detector = IsolationForest(contamination=contamination,
                               random_state=random_state)
    pred_y = detector.fit_predict(rents)

    outlier_index = data.index[pred_y == -1]
    print(outlier_index)
    print(data.shape)
    data.drop(outlier_index, inplace=True)
    return data
train_data=exceptValueDetect(train_data)
print(train_data.shape)
    

输出:

手动删除异常值

def dropExceptValues(data, max_area=200, min_money=700, max_money=16000):
    """Return a copy of ``data`` without manually identified outlier rows.

    A row is kept only when ``area <= max_area``,
    ``min_money <= tradeMoney <= max_money`` and ``totalFloor`` is not 0.
    The default limits are the ones originally read off the box plots.
    The input frame is not modified.

    Args:
        data: DataFrame with ``area``, ``tradeMoney`` and ``totalFloor``.
        max_area: Largest ``area`` to keep (inclusive).
        min_money: Smallest ``tradeMoney`` to keep (inclusive).
        max_money: Largest ``tradeMoney`` to keep (inclusive).

    Returns:
        A new DataFrame containing the surviving rows with their original
        index labels.
    """
    # Restrict area and rent to the plausible range seen in the box plots.
    in_range = (
        (data["area"] <= max_area)
        & (data["tradeMoney"] >= min_money)
        & (data["tradeMoney"] <= max_money)
    )
    kept = data[in_range]

    # Discard records that report zero floors in the building.
    zero_floor = (kept["totalFloor"] == 0).to_numpy()
    drop_index = kept.index[zero_floor]
    print(f"drop index list:{drop_index}")
    print(kept.shape)
    # Filter by mask rather than by label so rows sharing an index label with
    # a zero-floor row survive, and copy so later in-place edits by callers
    # never act on a slice of the input frame.
    kept = kept[~zero_floor].copy()
    print(kept.shape)
    return kept
# Apply the manual outlier filter to the training frame.
train_data=dropExceptValues(train_data)

输出:

画面积和租金的箱线图

# Box plots of floor area and rent for the training frame.
# The series is passed as `x=` explicitly: seaborn made these parameters
# keyword-only, and a bare positional series is interpreted differently
# depending on the installed version.
mp.figure("areabox",figsize=(15,5))
mp.title("areabox",fontsize=20)
print(train_data.area)
sns.boxplot(x=train_data["area"])
mp.show()
mp.figure(figsize=(15,5))
mp.title("tradeMoneybox",fontsize=20)
print(train_data.tradeMoney)
sns.boxplot(x=train_data["tradeMoney"])
mp.show()

输出:

深度清洗

 def cleanData(data):
    data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']<1000)&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']>25000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>250)&(data['tradeMoney']<20000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00002') & (data['area']<100)&(data['tradeMoney']>60000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['area']<300)&(data['tradeMoney']>30000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<500)&(data['area']<50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<1500)&(data['area']>100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<2000)&(data['area']>300)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']>5000)&(data['area']<20)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['area']>600)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['tradeMoney']<1000)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']<2000)&(data['area']>180)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>50000)&(data['area']<200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['area']>200)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00007') & (data['area']>100)&(data['tradeMoney']<2500)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['area']>200)&(data['tradeMoney']>25000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['area']>400)&(data['tradeMoney']<15000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']<3000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']>7000)&(data['area']<75)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']>12500)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['area']>400)&(data['tradeMoney']>20000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008') & (data['tradeMoney']<2000)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['area']>300)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00011') & (data['tradeMoney']<10000)&(data['area']>390)].index,inplace=True)
    data.drop(data[(data['region']=='RG00012') & (data['area']>120)&(data['tradeMoney']<5000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']<100)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']>80)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['area']>300)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<1300)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<8000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<1000)&(data['area']>20)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']>25000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<20000)&(data['area']>250)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>30000)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']<50000)&(data['area']>600)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>50000)&(data['area']>350)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']>4000)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<600)&(data['area']>100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['area']>165)].index,inplace=True)
    data.drop(data[(data['region']=='RG00012') & (data['tradeMoney']<800)&(data['area']<30)].index,inplace=True)
    data.drop(data[(data['region']=='RG00007') & (data['tradeMoney']<1100)&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['tradeMoney']>8000)&(data['area']<80)].index,inplace=True)
    data.loc[(data['region']=='RG00002')&(data['area']>50)&(data['rentType']=='合租'),'rentType']='整租'
    data.loc[(data['region']=='RG00014')&(data['rentType']=='合租')&(data['area']>60),'rentType']='整租'
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']>15000)&(data['area']<110)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']>20000)&(data['area']>110)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']<1500)&(data['area']<50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['rentType']=='合租')&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00015') ].index,inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

# Apply the region-specific cleaning rules to the training frame.
# NOTE(review): cleanData compares `region` / `rentType` against raw string
# labels, but fillnaData label-encodes both columns earlier in this file, so
# on that encoded frame the rules would match no rows — confirm the intended
# order of the two steps.
train_data = cleanData(train_data)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值