房租赛-数据处理

import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
def parseData(df):
    """
    Preprocess the rental-listing DataFrame in place and return it.

    Steps performed:
      * replace the '--' placeholder in ``rentType`` with '未知方式';
      * cast the retained text columns to the pandas ``category`` dtype;
      * fill missing ``pv`` / ``uv`` values with the column mean, then cast
        both columns to int;
      * replace the '暂无信息' placeholder in ``buildYear`` with the most
        frequent known year, then cast the column to int;
      * drop the ``city``, ``houseToward``, ``houseDecoration`` and ``ID``
        columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns touched above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is modified in place).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``buildYear`` contains the placeholder but no known year from
        which a replacement could be derived.
    """
    # Use .loc instead of chained indexing so the assignment is guaranteed
    # to reach ``df`` itself (chained assignment may write to a temporary).
    df.loc[df['rentType'] == '--', 'rentType'] = '未知方式'

    # Convert object-typed columns to category. ``houseToward`` and
    # ``houseDecoration`` are dropped below, so they are not converted.
    category_columns = ['rentType', 'houseFloor', 'communityName', 'region', 'plate']
    for col in category_columns:
        df[col] = df[col].astype('category')

    # Fill missing pv / uv with the column mean. The result is assigned back
    # rather than using ``inplace=True`` on a column selection, which does
    # not reliably modify the parent frame.
    for col in ('pv', 'uv'):
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # Convert buildYear to integers, substituting the most frequent known
    # year for the '暂无信息' placeholder.
    unknown_year = df['buildYear'] == '暂无信息'
    build_year = df['buildYear'].astype(object)  # independent, writable copy
    if unknown_year.any():
        known_years = build_year[~unknown_year].astype('int')
        if known_years.empty:
            raise ValueError(
                "buildYear contains only the '暂无信息' placeholder; "
                "cannot derive a replacement year"
            )
        build_year[unknown_year] = known_years.mode().iloc[0]
    df['buildYear'] = build_year.astype('int')

    # Remove features that are not used for modelling (house orientation is
    # discarded outright). The original ``df.drop[...]`` subscript form was a
    # syntax error; ``drop`` has to be called.
    df.drop(columns=['city', 'houseToward', 'houseDecoration', 'ID'], inplace=True)

    return df
def washData(df_train, df_test):
    """
    清洗数据
    """
    #测试集里面面积只存在200以下,为了训练集与测试集相符只选区面积200以下的进行训练
    df_train = df_train[(df_train['area']<=200)&(df_train['area']>6)]
    df_train = df_train[df_train['tradeMoney']<=100000]
    df_train = df_train.drop(df_train[(df_train.tradeMoney/df_train.area>300)]
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值