Kaggle Data Analysis Competition: House Price Prediction

This is a project that predicts house prices with machine-learning models; the goal is to walk through a typical data-mining workflow from end to end.

The final submission ranked in the top 8% of the leaderboard.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
train=pd.read_csv(r'D:\workplace\xiangmu\House Prices\train.csv')
test=pd.read_csv(r'D:\workplace\xiangmu\House Prices\test.csv')
ntrain=train.shape[0]
ntest=test.shape[0]
datas = pd.concat([train, test], ignore_index = True)
SalePrice=np.log(train.SalePrice)
train_ID = train['Id']
test_ID = test['Id']
# print(datas.info())  # check for missing values

#------------------------ 1. Exploratory data visualization --------------------
#~~~~~ First, look at the linear relationships between the variables
corrmat = train.corr(numeric_only=True)  # numeric columns only (required on pandas >= 1.5)
fig,ax=plt.subplots(figsize=(12,9))
sns.heatmap(corrmat,vmax=.8,square=True)
plt.show()


# Select the 10 features most strongly correlated with SalePrice
k=10
fig1,ax1=plt.subplots(figsize=(12,12))
cols=corrmat.nlargest(k,'SalePrice')['SalePrice'].index
# Correlation matrix of those 10 variables (computed on train, where SalePrice is defined)
cols_cor = train[cols].corr()
sns.set(font_scale=1.25)
hm=sns.heatmap(cols_cor,cbar=True,annot=True,square=True,fmt='.2f',lw=1,annot_kws={'size':10},yticklabels=cols,xticklabels=cols)
plt.show()
#~~~~~~ Is the target variable normally distributed?
import scipy.stats as stats
# Histogram with kernel-density and fitted-normal overlays
sns.distplot(a=train.SalePrice,bins=10,fit=stats.norm,norm_hist=None,
             kde_kws={'color': 'black', 'linestyle': '--', 'label': 'Kernel density'},
             fit_kws={'color': 'red', 'linestyle': ':', 'label': 'Fitted normal'})
# Show the legend
plt.legend()
plt.show()

# Taking the log makes the distribution approximately normal (a log transform is
# a standard remedy for a skewed target like prices)
sns.distplot(np.log(train.SalePrice),bins=10,fit=stats.norm,norm_hist=None,
             kde_kws={'color': 'black', 'linestyle': '--', 'label': 'Kernel density'},
             fit_kws={'color': 'red', 'linestyle': ':', 'label': 'Fitted normal'})
# Show the legend
plt.legend()
# Show the figure
plt.show()
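As a complement to the histograms, a Q-Q plot makes departures from normality easier to judge. A minimal sketch using scipy's probplot:

# Q-Q plot of the log-transformed target against the normal distribution;
# points lying on the diagonal indicate approximate normality
stats.probplot(np.log(train.SalePrice), plot=plt)
plt.show()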
#~~~~~~~ Outlier inspection
# 1. Scatter plots of each candidate feature against SalePrice; correlation is only
# meaningful for continuous variables
sns.pairplot(
    x_vars=['OverallQual','GrLivArea','GarageCars','GarageArea','TotRmsAbvGrd','YearBuilt','TotalBsmtSF','MasVnrArea'],
    y_vars='SalePrice',
    data=train  # data set to plot
)
plt.show()

# Make sure minus signs (and any non-ASCII labels) render correctly in plots
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False


#------------------------- 2. Data cleaning -----------------------------
# Strictly speaking, imputation statistics (modes, medians) should be learned from the
# training set only, otherwise information from the test set leaks into preprocessing;
# the combined frame is used below for simplicity. A minimal leakage-free sketch follows.
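The sketch below uses sklearn's SimpleImputer and the Electrical column purely for illustration:

# Learn the imputation statistic (the mode) from the training set only,
# then apply that same statistic to the test set
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='most_frequent')
elec_train = imp.fit_transform(train[['Electrical']])  # mode estimated on train
elec_test = imp.transform(test[['Electrical']])        # the training mode reused on test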
#~~~~~~~~~~ Outlier removal (based on the scatter plots above)
train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['SalePrice']<190000) & (train['GrLivArea']>4000)].index, inplace=True)
train.drop(train[(train['YearBuilt']<1900) & (train['SalePrice']>400000)].index, inplace=True)
train.drop(train[(train['SalePrice']<200000) & (train['TotalBsmtSF']>5000)].index, inplace=True)
train.drop(train[(train['SalePrice']<200000) & (train['GarageArea']>1000)].index, inplace=True)
# Rebuild the combined frame and the target after the drops, so the removed rows
# are also excluded from the modeling data later on
ntrain = train.shape[0]
SalePrice = np.log(train.SalePrice).reset_index(drop=True)
datas = pd.concat([train, test], ignore_index=True)
sns.pairplot(
    x_vars=['OverallQual','GrLivArea','GarageCars','GarageArea','TotRmsAbvGrd','YearBuilt','TotalBsmtSF'],
    y_vars='SalePrice',
    data=train  # re-check the scatter plots after the drops
)
plt.show()

#~~~~~~~~~~~~~~~~ Log-transform the target variable
# (already done above: SalePrice = np.log(train.SalePrice))

#~~~~~~~~~~~~~~~~~~~ Missing-value handling -- columns with too many missing values are
# dropped; every other column is filled with the mode, with 0, or with "None", depending
# on what the field actually means

print(datas.info())  # check which columns have missing values
# Drop variables with no predictive value
datas.drop('Id',axis=1, inplace=True)
datas.drop('Utilities',axis=1, inplace=True)
# Categorical columns where NA means the feature is absent: fill with the string "None"
str_cols= ['MiscFeature','Fence','PoolQC','Alley','FireplaceQu','GarageType','GarageFinish',"GarageQual", "GarageCond"]

for col in str_cols:
    datas[col].fillna("None", inplace=True)
del str_cols, col
# For the remaining categorical columns, fill missing values with the mode
mode_cols = ['Exterior1st','Exterior2nd','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1',
             'BsmtFinType2','Electrical','KitchenQual','Functional','SaleType']
for col in mode_cols:
    datas[col].fillna(datas[col].mode()[0], inplace=True)
# Fill missing continuous numeric values with 0
int_cols= ['MasVnrArea','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea']
for col in int_cols:
    datas[col].fillna(0, inplace=True)

#*** Neighboring houses on the same street tend to have similar lot frontage, so missing
# LotFrontage values are filled as follows
datas["LotFrontage"] =datas.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))# group by Neighborhood, fill with the group median
#** Dummy-variable handling
# MSZoning has discrete levels such as RL and RM; this step drops the MSZoning column and
# adds indicator columns such as MSZoning_RL and MSZoning_RM, each valued 0 or 1
# Cast MSZoning to a categorical type so it can be dummy-encoded
datas.MSZoning = datas.MSZoning.astype('category')
# Dummy encoding
dummy = pd.get_dummies(datas[['MSZoning']])
# Concatenate the indicator columns back onto the combined frame
datas = pd.concat([datas,dummy], axis = 1)
# Drop the original column
datas.drop(['MSZoning'], inplace=True, axis = 1)
print(datas.info())


#------------------------- 3. Feature engineering -----------------------------
"""
Categorical features cannot be fed to a model directly, so they have to be encoded as numbers.
Two approaches are used: levels with a natural order get a label (ordinal) encoding, while
unordered levels are one-hot encoded with get_dummies. A minimal sketch of both follows.
"""
# Cast numeric-coded categorical variables to strings, i.e. treat them as categories
datas['MSSubClass'] = datas['MSSubClass'].apply(str)
datas['YrSold'] = datas['YrSold'].astype(str)
datas['MoSold'] = datas['MoSold'].astype(str)
datas['OverallCond'] = datas['OverallCond'].astype(str)

#~~~~ Check the numeric features for skew and transform them toward normality
"""Normalizing skewed features weakens the influence of anomalous values and can improve model
performance. Common choices are the log transform and the Box-Cox transform. The target
SalePrice was log-transformed during data cleaning; here the numeric features are Box-Cox
transformed (a short boxcox1p illustration follows below), so the first step is to identify
the numeric features."""
datas.drop('SalePrice',axis=1, inplace=True)
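For intuition, boxcox1p(x, lmbda) applies the Box-Cox transform to 1 + x; a small illustration with arbitrary values:

# boxcox1p(x, lmbda) = ((1 + x)**lmbda - 1) / lmbda   for lmbda != 0
#                    = log(1 + x)                     for lmbda == 0
from scipy.special import boxcox1p
demo = np.array([0.0, 1.0, 10.0, 100.0])
print(boxcox1p(demo, 0.0))  # identical to np.log1p(demo)
print(boxcox1p(demo, 0.5))  # a milder correction than the log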
# Identify the numeric features
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric = []
for i in datas.columns:
    if datas[i].dtype in numeric_dtypes:
        numeric.append(i)
# Box plots of all numeric features reveal pronounced skew
sns.set_style('white')
f, ax = plt.subplots(figsize=(8, 7))
ax.set_xscale('log')
ax = sns.boxplot(data=datas[numeric] , orient='h', palette='Set1')
ax.xaxis.grid(False)
ax.set(ylabel='Feature names')
ax.set(xlabel='Numeric values')
ax.set(title='Numeric Distribution of Features')
sns.despine(trim=True, left=True)
plt.show()
# Find the skewed numeric features
skew_datas = datas[numeric].apply(lambda x: x.skew()).sort_values(ascending=False)
high_skew = skew_datas[skew_datas > 0.15]
skew_index = list(high_skew.index)
print('There are {} numeric features with skew>0.15'.format(high_skew.shape[0]))
skewness = pd.DataFrame({'Skew':high_skew})
# Apply the Box-Cox transform with scipy's boxcox1p (a log transform would also work);
# boxcox_normmax estimates the lambda that brings each feature closest to normal
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
for i in skew_index:
    datas[i] = boxcox1p(datas[i], boxcox_normmax(datas[i] + 1))

# Re-check the skew after the transform
sns.set_style('white')
f,ax = plt.subplots(figsize=(8,7))
ax.set_xscale('log')
ax = sns.boxplot(data=datas[skew_index],orient='h',palette='Set1')
ax.xaxis.grid(False)
ax.set(ylabel='Feature names')
ax.set(xlabel='Numeric values')
ax.set(title='Numeric Distribution of Features')
sns.despine(trim=True,left=True)
plt.show()  # the distributions now look roughly normal

#~~~~~~~ Label Encoding for the ordinal categorical features (encodings that preserve order)
# Define the order by hand and replace the string levels with integers
datas = datas.replace({'Street': {'Pave': 1, 'Grvl': 0 },
                             'FireplaceQu': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1,
                                            'None': 0
                                            },
                             'Fence': {'GdPrv': 2,
                                       'GdWo': 2,
                                       'MnPrv': 1,
                                       'MnWw': 1,
                                       'None': 0},
                             'ExterQual': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1
                                            },
                             'ExterCond': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1
                                            },
                             'BsmtQual': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1,
                                            'None': 0},
                             'BsmtExposure': {'Gd': 3,
                                            'Av': 2,
                                            'Mn': 1,
                                            'No': 0},
                             'BsmtCond': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1},
                             'GarageQual': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1,
                                            'None': 0},
                             'GarageCond': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1,
                                            'None': 0},
                             'KitchenQual': {'Ex': 5,
                                            'Gd': 4,
                                            'TA': 3,
                                            'Fa': 2,
                                            'Po': 1},
                             'Functional': {'Typ': 0,
                                            'Min1': 1,
                                            'Min2': 1,
                                            'Mod': 2,
                                            'Maj1': 3,
                                            'Maj2': 4,
                                            'Sev': 5,
                                            'Sal': 6},
                             'CentralAir': {'Y': 1,
                                            'N': 0},
                             'PavedDrive': {'Y': 1,
                                            'P': 0,
                                            'N': 0}
                            })

#~~~~~~ Feature creation (new features come from understanding the data, so a few
# aggregates are built here based on what the columns mean)
# Sums: e.g. a total-area feature that adds up the component areas
# Total basement plus floor area
datas['TotalBSF'] = (datas['TotalBsmtSF']+datas['1stFlrSF']+datas['2ndFlrSF']+datas['BsmtUnfSF'])
# Total bathrooms (half baths weighted 0.5)
datas['Total_Bathrooms'] = (datas['FullBath']+(0.5*datas['HalfBath'])+datas['BsmtFullBath']+(0.5*datas['BsmtHalfBath']))
# Total porch area
datas['Total_porch_sf'] = (datas['OpenPorchSF'] + datas['3SsnPorch'] + datas['EnclosedPorch'] + datas['ScreenPorch'] + datas['WoodDeckSF'])
# Garage size (area plus car capacity)
datas['Total_Garage'] = datas['GarageArea']+ datas['GarageCars']
# Total outside area (porches plus pool)
datas['Outside_Area'] = datas['Total_porch_sf'] + datas['PoolArea']
# Total finished square footage (all floors plus the basement total)
datas['Total_sqr'] = (datas['TotalBSF'] + datas['LowQualFinSF'] + datas['1stFlrSF'] +datas['2ndFlrSF'])
# Differences
# Years between construction and sale
datas['YearsSinceBuilt'] = datas['YrSold'].astype(int) - datas['YearBuilt'].astype(int)
# Years between remodeling and sale
datas['YearsSinceRemodel'] = datas['YrSold'].astype(int) - datas['YearRemodAdd'].astype(int)


# Total living area (basement + above-ground living area + garage)
datas['Total_Living'] =(datas['TotalBsmtSF']+datas['GrLivArea']+datas['GarageArea'])

#~~~~~~~ One-hot encode the remaining unordered categorical features with get_dummies
datas = pd.get_dummies(datas)
print(datas.info())


# Create more features from log and square transforms of the numeric features
def logs(res, ls):
    # adds a '<col>_log' column for each listed column; the 1.01 offset keeps the log finite at 0
    for l in ls:
        res[l + '_log'] = np.log(1.01 + res[l])
    return res

log_datas = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF',
                 'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',
                 'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr',
                 'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF',
                 'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd','TotalBSF']
datas = logs(datas,log_datas )
# Create more features from squares of selected columns
def squares(res, ls):
    # adds a '<col>_sq' column for each listed column
    for l in ls:
        res[l + '_sq'] = res[l] * res[l]
    return res
squared_datas = ['YearRemodAdd', 'LotFrontage_log',
                    'TotalBsmtSF_log', '1stFlrSF_log', '2ndFlrSF_log', 'GrLivArea_log',
                    'GarageCars_log', 'GarageArea_log']
datas = squares(datas,squared_datas)




#~~~~~~~~ Dimensionality reduction: with this many features, highly collinear ones should be
# pruned to avoid multicollinearity. Features whose absolute Pearson correlation with another
# feature exceeds 0.9 are identified and dropped below.
threshold = 0.9
# Correlation matrix
corr_matrix = datas.corr().abs()
# Keep only the upper triangle of the matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
print(upper.head())
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
print('There are %d columns to remove.' % (len(to_drop)))
# About 63 features are flagged for removal
datas = datas.drop(columns = to_drop)

# Preprocessing is done; split the combined frame back into train and test and
# reattach the log-transformed target to the training portion
clean_train = datas[:ntrain]
clean_train = pd.concat([clean_train, SalePrice], axis = 1)
clean_test = datas[ntrain:]

print(clean_train.info())
print(clean_test.info())


#--------------------------------- 4. Modeling ---------------------------
"""
Modeling means choosing algorithms that fit the problem and tuning their parameters against a
fixed evaluation metric until overall performance is as good as it can be.
Winning Kaggle solutions almost always use model ensembling: combining the predictions of
several different models usually beats any single one.
"""
# Imports
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
#~~~~~ Before modeling, split the data and define the cross-validation scheme and the metric
X = clean_train.drop(columns='SalePrice')
y = clean_train['SalePrice']

#~~~~~ Evaluation metric: cross-validated RMSE (on the log-price scale)
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10)

# Cross-validation scheme
kf = KFold(n_splits=10, random_state=50, shuffle=True)

#~~~~~ Baseline models first: see how each model performs without any tuned parameters
warnings.filterwarnings('ignore')
# Baseline versions (kept for reference)
# lgb = LGBMRegressor(objective='regression', random_state=50)
# xgb = XGBRegressor(objective='reg:squarederror',random_state=50)
# ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kf))  # ridge regression
# svr = make_pipeline(RobustScaler(), SVR())  # support vector regression
# gbr = GradientBoostingRegressor(random_state=50)  # gradient boosting
# rf = RandomForestRegressor(random_state=50)  # random forest

#~~~~~~~~~~~ Tuned models
lgb = LGBMRegressor(objective='regression'
                    ,n_estimators=1655
                    ,max_depth=11
                    ,num_leaves=8
                    ,min_data_in_leaf=3
                    ,max_bin=22
                    ,bagging_fraction=0.9
                    ,bagging_freq=10
                    ,feature_fraction=0.6
                    ,lambda_l1=0.004641588833612777
                    ,lambda_l2=6.4078445308786405
                    ,learning_rate=0.06
                    ,random_state=42
                    ,n_jobs=-1)
xgb = XGBRegressor(objective='reg:squarederror' #XGBoost
                   ,n_estimators=1000
                   ,learning_rate=0.02
                   ,max_depth=3
                   ,subsample=0.6
                   ,min_child_weight=3
                   ,colsample_bytree=0.5
                   ,random_state=50
                   ,n_jobs=-1)


ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kf))  # ridge regression
svr = make_pipeline(RobustScaler(), SVR(C=8,
                                        epsilon= 0.00005,
                                        gamma=0.0008
                                        ))  # support vector regression
gbr = GradientBoostingRegressor(n_estimators=6000  # gradient boosting regression
                                ,learning_rate=0.01
                                ,max_depth=4
                                ,max_features='sqrt'
                                ,min_samples_leaf=15
                                ,min_samples_split=10
                                ,loss='huber'
                                ,random_state=50)

rf = RandomForestRegressor(n_estimators=2000  # random forest
                           ,max_depth=15
                           ,random_state=50
                           ,n_jobs=-1)
# Model evaluation
models = [lgb, xgb, ridge, svr, gbr, rf]
model_names = ['lgb','xgb','ridge','svr','gbr','rf']
scores = {}
for i, model in enumerate(models):
    score = rmse_cv(model)
    print('{} rmse score: {:.4f}, rmse std: {:.4f}'.format(model_names[i], score.mean(), score.std()))
    scores[model_names[i]] = (score.mean(), score.std())
rmse_df = pd.DataFrame(scores, index=['rmse_score', 'rmse_std'])
rmse_df.sort_values('rmse_score', axis=1, inplace=True)
print(rmse_df)  # the tuned models score slightly better than the baselines

#--------------------------- Model ensembling --------------------------
"""
Two techniques are combined: stacking and blending. Stacking produces one set of predictions;
those are then blended with the six base models' predictions using fixed weights to obtain the
final prediction. A simplified sketch of the stacking mechanics follows.
"""

from mlxtend.regressor import StackingCVRegressor
stack_model = StackingCVRegressor(regressors=(lgb, ridge, svr, gbr, rf,xgb), meta_regressor=xgb, cv=10)


# Evaluate the stacking model on the held-out split
def rmse(y, y_pred):
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    return rmse
stack_model.fit(Xtrain, ytrain)
stacking_pred=stack_model.predict(Xtest)
stacking_score = rmse(ytest, stacking_pred)
print(stacking_score)

# Blending: fit every model on the training data and combine their predictions with fixed weights
def blending(X, y, test):
    lgb.fit(X, y)
    lgb_pred = lgb.predict(test)

    xgb.fit(X, y)
    xgb_pred = xgb.predict(test)

    ridge.fit(X, y)
    ridge_pred = ridge.predict(test)

    svr.fit(X, y)
    svr_pred = svr.predict(test)

    gbr.fit(X, y)
    gbr_pred = gbr.predict(test)

    rf.fit(X, y)
    rf_pred = rf.predict(test)

    stack_model.fit(X, y)
    stack_pred = stack_model.predict(test)

    # Weighted sum of the individual predictions
    blended_pred = (0.1 * lgb_pred +
                    0.2 * xgb_pred +
                    0.05 * ridge_pred +
                    0.25 * svr_pred +
                    0.15 * gbr_pred +
                    0.05 * rf_pred +
                    0.2 * stack_pred)
    return blended_pred
# Evaluate blending on the held-out split
blended_pred = blending(Xtrain, ytrain, Xtest)
blending_score = rmse(ytest, blended_pred)
print(blending_score)
#---------------------------- Final prediction and submission ---------------------------------
# Retrain on all training data, then invert the log transform with np.exp
# (the target was transformed with np.log, so no "- 1" correction is needed)
y_pred = np.exp(blending(X, y, clean_test))
sample = pd.DataFrame()
sample['Id'] = test_ID
sample['SalePrice'] = y_pred
print(sample.head(10))
sample.to_csv('saleprice_forecast.csv', index=False)
