机器学习实战02:Kaggle - House Price Prediction Top 4%

一、模块导入

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')
%matplotlib inline
plt.style.use('ggplot')
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

二、数据读取

# Widen pandas' display limits so wide frames are printed without truncation.
display_options = {
    'max_colwidth': 300,         # max characters shown per cell
    'display.width': 300,        # max characters per output line
    'display.max_columns': 500,  # max number of columns shown
    'display.max_rows': 1000,    # max number of rows shown
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Load the data: switch into the data directory, then read both CSV files
# by their relative names.
os.getcwd()
os.chdir('C:/Users/Anqi/00 Mechine learning/Kaggle 2_House Price Prediction/House_Data')
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

三、简单可视化

# From experience, the year a house was built and its above-ground living
# area are closely related to the sale price; both are visualized below.
plt.figure(figsize=(15, 8))
# Pass x/y as keywords: seaborn >= 0.12 accepts only `data` positionally, so
# the two-positional-argument form raises a TypeError there. The keyword form
# also works on older seaborn releases.
sns.boxplot(x=train.YearBuilt, y=train.SalePrice)

在这里插入图片描述

# Scatter plot of above-ground living area against sale price, used to spot
# outliers visually before they are removed.
plt.figure(figsize=(12, 6))
plt.scatter(train["GrLivArea"], train["SalePrice"])
plt.xlabel("GrLivArea", fontsize=15)
plt.ylabel("SalePrice", fontsize=15)
# Fix the y-axis range to [0, 800000].
plt.ylim(bottom=0, top=800000)

在这里插入图片描述

# Remove outliers: rows with a very large living area (> 4000) but a low
# sale price (< 300000). `train` is modified in place.
outlier_mask = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
train.drop(index=train.index[outlier_mask], inplace=True)

# Stack train and test into one frame (fresh 0..n-1 index) so that cleaning
# and feature engineering are applied to both at once; the Id column is
# dropped from the combined frame.
combined_train_test = pd.concat([train, test], ignore_index=True)
combined_train_test.drop(columns=['Id'], inplace=True)
combined_train_test.shape

[Out] (2917, 80)

四、数据清洗、缺失值填充

# Count missing values per column and list the affected columns, starting
# with the one that has the most missing values.
missing_data = combined_train_test.isna().sum()
missing_data.loc[missing_data.gt(0)].sort_values(ascending=False)

在这里插入图片描述
缺失值过多的特征可能会影响最终模型的准确性,可以考虑将缺失值超过500的特征舍弃(注意:下文 4.2 中可见的代码并未删除这些特征,而是对 PoolQC、MiscFeature、Alley 等特征以 "None" 填充)

4.1 LotFrontage 缺失值填充

# LotFrontage (street frontage) is imputed from comparable houses:
#   1. LotArea is cut into 10 quantile bins (LotAreaCut).
#   2. Missing LotFrontage values are filled with the median of the
#      (LotAreaCut, Neighborhood) group.
#   3. Some (LotAreaCut, Neighborhood) groups have no LotFrontage value at
#      all, so their median is also missing; those rows are filled in a
#      second pass with the median of the LotAreaCut bin alone.
combined_train_test['LotAreaCut'] = pd.qcut(combined_train_test['LotArea'], q=10)


def _fill_with_group_median(values):
    """Return the group's values with NaNs replaced by the group median."""
    return values.fillna(values.median())


# Finest grouping first, then the coarser fallback.
for group_keys in (['LotAreaCut', 'Neighborhood'], ['LotAreaCut']):
    combined_train_test['LotFrontage'] = (
        combined_train_test
        .groupby(group_keys)['LotFrontage']
        .transform(_fill_with_group_median)
    )

4.2 其他缺失值填充

# NOTE: every fill below assigns the result back to the column instead of
# calling `fillna(..., inplace=True)` on `combined_train_test[col]`. The
# in-place form mutates a column selection (chained assignment); it is
# deprecated in pandas 2.x and leaves the frame unchanged under Copy-on-Write.

# Features that can be filled with 0:
cols = ["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    combined_train_test[col] = combined_train_test[col].fillna(0)

# Features that can be filled with the string "None":
# (GarageYrBlt is in this list as well and is cast to str in the next
# section, so it ends up holding year strings plus "None".)
cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt", "GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
for col in cols1:
    combined_train_test[col] = combined_train_test[col].fillna("None")

# Features that can be filled with the mode (most frequent value):
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    combined_train_test[col] = combined_train_test[col].fillna(combined_train_test[col].mode()[0])

# Check which columns still contain missing values (the per-column count is
# computed once and then filtered).
remaining_missing = combined_train_test.isnull().sum()
remaining_missing[remaining_missing > 0]

五、特征工程处理

5.1 分类型特征处理

# Step 1: convert numerically coded features to strings so they are handled
# as categorical features from here on.
NumStr = [
    "MSSubClass", "BsmtFullBath", "BsmtHalfBath", "HalfBath",
    "BedroomAbvGr", "KitchenAbvGr", "MoSold", "YrSold",
    "YearBuilt", "YearRemodAdd", "LowQualFinSF", "GarageYrBlt",
]
combined_train_test[NumStr] = combined_train_test[NumStr].astype(str)

# Step 2: go through the categorical features one by one. For each feature,
# plot the mean, median and (scaled) count of SalePrice per category, so the
# categories can be regrouped based on what the plots show.
def _plot_price_stats(position, feature, count_scale):
    """Draw mean/median/scaled-count SalePrice lines per category of `feature`.

    position    -- matplotlib subplot code (e.g. 421) selecting the grid cell
    feature     -- column of combined_train_test to group by; also the title
    count_scale -- factor applied to the per-category count before plotting

    Returns the aggregated frame (columns SalePrice -> mean/median/count).
    """
    plt.subplot(position)
    stats = combined_train_test.groupby([feature])[['SalePrice']].agg(['mean', 'median', 'count'])
    price = stats['SalePrice']
    plt.plot(stats.index, price['mean'], label='mean', color='red')
    plt.plot(stats.index, price['median'], label='median', color='blue')
    plt.plot(stats.index, price['count'] * count_scale, label=f'count*{count_scale}', color='green')
    plt.title(feature)
    plt.legend(loc='upper right')
    return stats


fig = plt.figure(figsize=(15, 18))
v_MSSC = _plot_price_stats(421, 'MSSubClass', 1000)
v_BFB = _plot_price_stats(422, 'BsmtFullBath', 200)
v_BHB = _plot_price_stats(423, 'BsmtHalfBath', 200)
v_HB = _plot_price_stats(424, 'HalfBath', 200)

# BedroomAbvGr
plt.subplot(425)
v_BAG = combined_train_test.groupby(['BedroomAbvGr'])[['SalePrice']].agg(['mean','median','count'])
plt.plot(v_BAG.index,v_BAG['SalePrice']['mean'],lab
  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值