import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
def parseData(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the raw rental DataFrame in place and return it.

    Steps performed:
      * replace the ``'--'`` placeholder in ``rentType`` with ``'未知方式'``;
      * cast the retained text columns to the ``category`` dtype;
      * fill missing ``pv`` / ``uv`` values with the column mean and cast
        both columns to int;
      * replace the ``'暂无信息'`` placeholder in ``buildYear`` with the most
        frequent known year and cast the column to int;
      * drop the ``city``, ``houseToward``, ``houseDecoration`` and ``ID``
        columns.

    Args:
        df: DataFrame containing at least the columns named above plus
            ``houseFloor``, ``communityName``, ``region`` and ``plate``.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: if a required column is missing.
        IndexError: if ``buildYear`` contains placeholders but no known year
            from which a replacement could be derived.
    """
    # Use a single .loc assignment: chained indexing
    # (df['rentType'][mask] = ...) may write to a temporary copy and is a
    # no-op under pandas copy-on-write.
    df.loc[df['rentType'] == '--', 'rentType'] = '未知方式'

    # Convert the object columns that are kept to categoricals.
    # 'houseToward' and 'houseDecoration' are dropped below, so converting
    # them would be wasted work.
    category_columns = ['rentType', 'houseFloor', 'communityName', 'region', 'plate']
    for col in category_columns:
        df[col] = df[col].astype('category')

    # Fill missing pv/uv with the column mean, then truncate to int.
    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which is another form of chained assignment.
    for col in ('pv', 'uv'):
        df[col] = df[col].fillna(df[col].mean()).astype('int')

    # Convert buildYear to int, substituting the most frequent known year
    # for the '暂无信息' placeholder.
    build_year = df['buildYear']
    unknown = build_year == '暂无信息'
    if unknown.any():
        known_years = build_year[~unknown].astype('int')
        most_common = known_years.mode().iloc[0]
        # Substitute the year as text so the column keeps a single value
        # type until the final integer cast.
        build_year = build_year.where(~unknown, str(most_common))
    df['buildYear'] = build_year.astype('int')

    # Drop features that are not used for modelling; house orientation and
    # decoration are removed outright.
    df.drop(columns=['city', 'houseToward', 'houseDecoration', 'ID'], inplace=True)
    return df
def washData(df_train, df_test):
"""
清洗数据
"""
#测试集里面面积只存在200以下,为了训练集与测试集相符只选区面积200以下的进行训练
df_train = df_train[(df_train['area']<=200)&(df_train['area']>6)]
df_train = df_train[df_train['tradeMoney']<=100000]
df_train = df_train.drop(df_train[(df_train.tradeMoney/df_train.area>300)]
房租赛-数据处理
最新推荐文章于 2022-10-24 09:27:16 发布