这是一个通过机器学习模型预测房价的一个项目,希望可以从中了解大概的数据挖掘流程。
最终通过上传结果排名前8%
import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import warnings warnings.filterwarnings('ignore') train=pd.read_csv(r'D:\workplace\xiangmu\House Prices\train.csv') test=pd.read_csv(r'D:\workplace\xiangmu\House Prices\test.csv') ntrain=train.shape[0] ntest=test.shape[0] datas = pd.concat([train, test], ignore_index = True) SalePrice=np.log(train.SalePrice) train_ID = train['Id'] test_ID = test['Id'] # print(datas.info())#查看数据是否有缺失 #------------------------一、数据可视化探索-------------------- #~~~~~先看各变量之间的线性关系 corrmat=train.corr() fig,ax=plt.subplots(figsize=(12,9)) sns.heatmap(corrmat,vmax=.8,square=True) plt.show() #选出前10个相关性最强的 k=10 fig1,ax1=plt.subplots(figsize=(12,12)) cols=corrmat.nlargest(k,'SalePrice')['SalePrice'].index #查看10个变量的相关系数 cols_cor = datas[cols].corr() # cm=np.corrcoef(datas[cols].values.T) sns.set(font_scale=1.25) hm=sns.heatmap(cols_cor,cbar=True,annot=True,square=True,fmt='.2f',lw=1,annot_kws={'size':10},yticklabels=cols,xticklabels=cols) plt.show() #~~~~~~因变量是否服从正态分布 import scipy.stats as stats #绘制直方图 sns.distplot(a=train.SalePrice,bins=10,fit=stats.norm,norm_hist=None, kde_kws={'color': 'black', 'linestyle': '--', 'label': '核密度曲线'}, fit_kws={'color': 'red', 'linestyle': ':', 'label': '正态密度曲线'}) #显示图例 plt.legend() plt.show() #取对数符合正态分布(左偏右偏的话,取对数即可) sns.distplot(np.log(train.SalePrice),bins=10,fit=stats.norm,norm_hist=None, kde_kws={'color': 'black', 'linestyle': '--', 'label': '核密度曲线'}, fit_kws={'color': 'red', 'linestyle': ':', 'label': '正态密度曲线'}) #显示图例 plt.legend() #显示图形 plt.show() #~~~~~~~异常值查看 #1.用线性相关图,散点图来查看,连续的值才会有相关性 import seaborn as sns sns.pairplot( x_vars=['OverallQual','GrLivArea','GarageCars','GarageArea','TotRmsAbvGrd','YearBuilt','TotalBsmtSF','MasVnrArea'], y_vars='SalePrice', data=train#指定绘图数据集 ) plt.show() # 中文和负号的正常显示 plt.rcParams['font.sans-serif'] = ['Microsoft YaHei'] plt.rcParams['axes.unicode_minus'] = False #-------------------------二、数据清洗----------------------------- #一般数据处理要将训练集和测试集分开进行,否则可能在测试集的众数与训练集的不同。 #~~~~~~~~~~异常值处理 train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True) train.drop(train[(train['SalePrice']<190000) & (train['GrLivArea']>4000)].index, inplace=True) train.drop(train[(train['YearBuilt']<1900) & (train['SalePrice']>400000)].index, inplace=True) train.drop(train[(train['SalePrice']<200000) & (train['TotalBsmtSF']>5000)].index, inplace=True) train.drop(train[(train['SalePrice']<200000) & (train['GarageArea']>1000)].index, inplace=True) import seaborn as sns sns.pairplot( x_vars=['OverallQual','GrLivArea','GarageCars','GarageArea','TotRmsAbvGrd','YearBuilt','TotalBsmtSF'], y_vars='SalePrice', data=train#指定绘图数据集 ) plt.show() #~~~~~~~~~~~~~~~~因变量对数变换 # train['SalePrice']=np.log(train['SalePrice']) #~~~~~~~~~~~~~~~~~~~缺失值处理 --确实太多删掉,每一个都要处理,要不是众数替代要不就是0替代,眼部就是None替代。就是要考虑实际情况 print(datas.info())#查看数据是否有缺失 #删除无影响的变量 datas.drop('Id',axis=1, inplace=True) datas.drop('Utilities',axis=1, inplace=True) #对字符类型的数据的用none替代 str_cols= ['MiscFeature','Fence','PoolQC','Alley','FireplaceQu','GarageType','GarageFinish',"GarageQual", "GarageCond"] for col in str_cols: datas[col].fillna("None", inplace=True) del str_cols, col #对于字符类型用众数填取无值变量 other_color=['Exterior1st','Exterior2nd','MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinSF1', 'BsmtFinType2','Electrical','KitchenQual','Functional','SaleType'] for col in other_color: datas[col].fillna(datas[col].mode()[0], inplace=True) #对于连续型数值拿0替代 int_cols= ['MasVnrArea','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','BsmtFullBath','BsmtHalfBath','GarageYrBlt','GarageCars','GarageArea'] for col in int_cols: datas[col].fillna(0, inplace=True) #***对于变量,位于同一街道的相邻的房屋往往具有相同的街区面积属性,因此LotFrontage属性的缺失值按照以下方式进行填充。 datas["LotFrontage"] =datas.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))#以Neighborhood分组,fillna()以均值填充 #**哑变量处理 # 特征MSZoning里面有两个不同的离散值RL和RM,那么这一步将去掉MSZoning特征,并新加两个特征MSZoning_RL和MSZoning_RM,其值为0或者1 # 将数值型的Pclass转换为类别型,否则无法对其哑变量处理 datas.MSZoning = datas.MSZoning.astype('category') # 哑变量处理 dummy = pd.get_dummies(datas[['MSZoning']]) # 水平合并Titanic数据集和哑变量的数据集 datas = pd.concat([datas,dummy], axis = 1) # 删除原始的变量 datas.drop(['MSZoning'], inplace=True, axis = 1) print(datas.info()) #-------------------------三、特征工程----------------------------- """ 类别特征不能直接输入模型,因此要对其进行编码,把它转换成数字。编码有两种方式,对于各个类别中可能存在顺序关系的,用LabelEncoder编码,对于不存在顺序关系的,用get_dummies。 """ #转换为字符串,即类别型变量 datas['MSSubClass'] = datas['MSSubClass'].apply(str) datas['YrSold'] = datas['YrSold'].astype(str) datas['MoSold'] = datas['MoSold'].astype(str) datas['OverallCond'] = datas['OverallCond'].astype(str) #~~~~查看因变量是否倾斜以及转换为正态分布(数据的规范化处理有利于排除或者减弱数据异常的影响,从而可以提升模型效率。数据转换的方式有很多种,比较常用的有对数转换,box-cox转换等变换方式。) """数据的规范化处理有利于排除或者减弱数据异常的影响,从而可以提升模型效率。数据转换的方式有很多种,比较常用的有对数转换,box-cox转换等变换方式。在数据清洗-目标变量SalePrice环节, 采用了对数转换对数据进行规范化处理,这里,我们将采用box-cox转换。数据转换是针对数据变量进行的特征处理,先找出数值特征。""" # SalePrice=train['SalePrice']、 datas.drop('SalePrice',axis=1, inplace=True) #找到数值类特征 numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] numeric = [] for i in datas.columns: if datas[i].dtype in numeric_dtypes: numeric.append(i) #为所有特征做箱型图,发现数据倾斜较为严重 sns.set_style('white') f, ax = plt.subplots(figsize=(8, 7)) ax.set_xscale('log') ax = sns.boxplot(data=datas[numeric] , orient='h', palette='Set1') ax.xaxis.grid(False) ax.set(ylabel='Feature names') ax.set(xlabel='Numeric values') ax.set(title='Numeric Distribution of Features') sns.despine(trim=True, left=True) plt.show() #找到倾斜的数值特征 skew_datas = datas[numeric].apply(lambda x:x.skew()).sort_values(ascending=False) high_skew = skew_datas [skew_datas >0.15] skew_index = list(high_skew.index) print('There are {} numeric features with skew>0.15'.format(high_skew.shape[0])) skewness = pd.DataFrame({'Skew':high_skew}) #使用scipy函数boxcox1p来计算Box-Cox转换 或者对数变换也是可以的。 from scipy.special import boxcox1p from scipy.stats import boxcox_normmax for i in skew_index: # print(datas[i]) datas[i]= boxcox1p(datas[i],stats.boxcox_normmax(datas[i]+1)) #再次查看数据倾斜程度 sns.set_style('white') f,ax = plt.subplots(figsize=(8,7)) ax.set_xscale('log') ax = sns.boxplot(data=datas[skew_index],orient='h',palette='Set1') ax.xaxis.grid(False) ax.set(ylabel='Feature names') ax.set(xlabel='Numeric values') ax.set(title='Numeric Distribution of Features') sns.despine(trim=True,left=True) plt.show()#发现都已经呈现正态分布 #~~~~~~~采用Lable Enconding对离散型数据进行编码(有顺序的编码) #自定义顺序,并进行替换 datas = datas.replace({'Street': {'Pave': 1, 'Grvl': 0 }, 'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0 }, 'Fence': {'GdPrv': 2, 'GdWo': 2, 'MnPrv': 1, 'MnWw': 1, 'None': 0}, 'ExterQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1 }, 'ExterCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1 }, 'BsmtQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}, 'BsmtExposure': {'Gd': 3, 'Av': 2, 'Mn': 1, 'No': 0}, 'BsmtCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}, 'GarageQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}, 'GarageCond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}, 'KitchenQual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}, 'Functional': {'Typ': 0, 'Min1': 1, 'Min2': 1, 'Mod': 2, 'Maj1': 3, 'Maj2': 4, 'Sev': 5, 'Sal': 6}, 'CentralAir': {'Y': 1, 'N': 0}, 'PavedDrive': {'Y': 1, 'P': 0, 'N': 0} }) #~~~~~~增加特征(增加特征主要是凭借对数据的理解进行进一步的加工, 所以我们可以基于对数据集的理解创建一些特征来帮助我们的模型。) #基于业务理解增加特征,增加一些特征,比如总面积,就是一些其他面积的和。 #地下室面积总面积 datas['TotalBSF'] = (datas['TotalBsmtSF']+datas['1stFlrSF']+datas['2ndFlrSF']+datas['BsmtUnfSF']) #全屋浴室加总 datas['Total_Bathrooms'] = (datas['FullBath']+(0.5*datas['HalfBath'])+datas['BsmtFullBath']+(0.5*datas['BsmtHalfBath'])) #门廊加总 datas['Total_porch_sf'] = (datas['OpenPorchSF'] + datas['3SsnPorch'] + datas['EnclosedPorch'] + datas['ScreenPorch'] + datas['WoodDeckSF']) #车库面积加总 datas['Total_Garage'] = datas['GarageArea']+ datas['GarageCars'] #外部有关面积数据加总 datas['Outside_Area'] = datas['Total_porch_sf'] + datas['PoolArea'] #屋内全部楼层加地下室面积加总 datas['Total_sqr'] = (datas['TotalBSF'] + datas['LowQualFinSF'] + datas['1stFlrSF'] +datas['2ndFlrSF']) #减法 #建造,售卖时间间隔 datas['YearsSinceRemodel'] = datas['YrSold'].astype(int) - datas['YearBuilt'].astype(int) #改建,售卖时间间隔 datas['YearsSinceRemodel'] = datas['YrSold'].astype(int) - datas['YearRemodAdd'].astype(int) datas['Total_Living'] =(datas['TotalBsmtSF']+datas['GrLivArea']+datas['GarageArea']) print(datas.MasVnrType) #~~~~~~~目标编码。对于不存在顺序关系的,用get_dummies datas = pd.get_dummies(datas) print(datas.info()) #我们通过计算数值特征的对数和平方变换来创建更多的特征 def logs(res,ls): m = res.shape[1] for l in ls: res = res.assign(newcol=pd.Series(np.log(1.01+res[l])).values) res.columns.values[m] = l + '_log' m += 1 return res log_datas = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF', 'TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea', 'BsmtFullBath','BsmtHalfBath','FullBath','HalfBath','BedroomAbvGr','KitchenAbvGr', 'TotRmsAbvGrd','Fireplaces','GarageCars','GarageArea','WoodDeckSF','OpenPorchSF', 'EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal','YearRemodAdd','TotalBSF'] datas = logs(datas,log_datas ) #通过平方转换获得新的特征 def squares(res,ls): m = res.shape[1] for l in ls: res = res.assign(newcol=pd.Series(res[l]*res[l]).values) res.columns.values[m] = l + '_sq' m += 1 return res squared_datas = ['YearRemodAdd', 'LotFrontage_log', 'TotalBsmtSF_log', '1stFlrSF_log', '2ndFlrSF_log', 'GrLivArea_log', 'GarageCars_log', 'GarageArea_log'] datas = squares(datas,squared_datas) #~~~~~~~~特征降维,特征太多,需要特征筛选,为避免多重共线问题。下面我们会识别皮尔森相关性系数大于0.9的特征,并将这些特征删除。 #存在一些无意义的特征,需要进行降维 threshold = 0.9 #相关性矩阵 corr_matrix =datas.corr().abs() #只选择矩阵的上半部分 upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool)) print(upper.head()) to_drop = [column for column in upper.columns if any(upper[column] > threshold)] print('There are %d columns to remove.' % (len(to_drop))) #发现有63个特征需要删除 datas = datas.drop(columns = to_drop) #然后将因变量放回,水平合并SalePrice数据集 # SalePrice=np.log(train['SalePrice']) # datas = pd.concat([datas,SalePrice], axis = 1) # print(datas.info) #自此数据提前处理已完成,然后将数据拆分回训练集和测试集 clean_train = datas[:ntrain] # clean_train = pd.concat([clean_train,np.log(train.SalePrice)], axis = 1) clean_train = pd.concat([clean_train,SalePrice], axis = 1) clean_test = datas[ntrain:] # clean_test = pd.concat([clean_test,np.log(test.SalePrice)], axis = 1) print(clean_train.info) print(clean_test.info) #---------------------------------四、数据建模--------------------------- """ 建模也就是根据所研究的问题选择恰当的算法搭建学习模型,并且基于所设定的模型评价指标,在训练过程中调整模型参数以使得模型的整体性能达到最优。 在Kaggle的比赛中优胜方案通常都会做模型集成(ensemble),模型集成会整合不同模型的预测结果,生成最终预测,集成的模型越多,效果就越好。 """ #导包 from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV from sklearn.model_selection import cross_val_score from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor from lightgbm import LGBMRegressor from xgboost import XGBRegressor from sklearn.svm import SVR from sklearn.pipeline import make_pipeline from sklearn.preprocessing import RobustScaler from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, train_test_split from sklearn.metrics import mean_squared_error #~~~~~建模之前,先划分数据集,定义交叉验证模式以及衡量指标 X = clean_train.drop(columns='SalePrice') y = clean_train['SalePrice'] #~~~~~定义衡量指标RMSE def rmse_cv(model): rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = 5)) return(rmse) Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10) # 定义交叉验证模式 kf = KFold(n_splits=10, random_state=50, shuffle=True) #~~~~~接下来建立基线模型,在不给定具体参数的情况下,看看各个模型的表现 warnings.filterwarnings('ignore') # 建立基本模型尝试 # lgb = LGBMRegressor(objective='regression', random_state=50) # xgb = XGBRegressor(objective='reg:squarederror',random_state=50) # ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kf))#岭回归 # svr = make_pipeline(RobustScaler(), SVR())#支持向量机 # gbr = GradientBoostingRegressor(random_state=50)#梯度提升回归 # rf = RandomForestRegressor(random_state=50)#随机森林 #~~~~~~~~~~~调参后 lgb = LGBMRegressor(objeactive='regression' ,n_estimators=1655#1841#1200 # ,max_depth=8 ,max_depth=11#6 ,num_leaves=8#10#10 ,min_data_in_leaf=3 ,max_bin=22#25 ,bagging_fraction=0.9#0.8#0.6 ,bagging_freq=10#11 ,feature_fraction=0.6 ,lambda_l1=0.004641588833612777 ,lambda_l2=6.4078445308786405#4.641588833612782e-05 ,learning_rate=0.06#0.01 ,random_state= 42#30#50 ,n_jobs=-1) xgb = XGBRegressor(objective='reg:squarederror' #XGBoost ,n_estimators=1000 ,learning_rate=0.02 ,max_depth=3 ,subsample=0.6 ,min_child_weight=3 ,colsample_bytree=0.5 ,random_state=50 ,n_jobs=-1) ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kf))#岭回归 svr = make_pipeline(RobustScaler(), SVR(C=8, epsilon= 0.00005, gamma=0.0008 ))#支持向量回归 gbr = GradientBoostingRegressor(n_estimators=6000 #梯度提升回归 ,learning_rate=0.01 ,max_depth=4 ,max_features='sqrt' ,min_samples_leaf=15 ,min_samples_split=10 ,loss='huber' ,random_state=50) rf = RandomForestRegressor(n_estimators=2000 #随机森林 ,max_depth=15 ,random_state=50 ,n_jobs=-1) # 模型评估 models = [lgb, xgb, ridge, svr, gbr, rf] model_names = ['lgb','xgb','ridge','svr','gbr','rf'] scores = {} for i, model in enumerate(models): score = rmse_cv(model) print('{} rmse score: {:.4f}, rmse std: {:.4f}'.format(model_names[i], score.mean(), score.std())) scores[model_names[i]] = (score.mean(), score.std()) rmse_df = pd.DataFrame(scores, index=['rmse_score', 'rmse_std']) rmse_df.sort_values('rmse_score', axis=1, inplace=True) print(rmse_df)#发现确实有比之前好一点 #---------------------------模型融合-------------------------- """ 我使用了stacking和blending两种方法,首先通过stacking获得预测结果,然后加上上文6个模型的结果,通过blending赋予权重,得到最终预测结果。 """ from mlxtend.regressor import StackingCVRegressor stack_model = StackingCVRegressor(regressors=(lgb, ridge, svr, gbr, rf,xgb), meta_regressor=xgb, cv=10) # # 模型评估 # models = [lgb, xgb, ridge, svr, gbr, rf,stack_model] # model_names = ['lgb','xgb','ridge','svr','gbr','rf','Stacking_model'] # scores = {} # for i, model in enumerate(models): # score = rmse_cv(model) # print('{} rmse score: {:.4f}, rmse std: {:.4f}'.format(model_names[i], score.mean(), score.std())) # scores[model_names[i]] = (score.mean(), score.std()) # rmse_df = pd.DataFrame(scores, index=['rmse_score', 'rmse_std']) # rmse_df.sort_values('rmse_score', axis=1, inplace=True) # print(rmse_df) #评估stacking的效果 def rmse(y, y_pred): rmse = np.sqrt(mean_squared_error(y, y_pred)) return rmse stack_model.fit(Xtrain, ytrain) stacking_pred=stack_model.predict(Xtest) stacking_score = rmse(ytest, stacking_pred) print(stacking_score) #blending def blending(X, y, test): lgb.fit(X, y) lgb_pred = lgb.predict(test) xgb.fit(X, y) xgb_pred = xgb.predict(test) ridge.fit(X, y) ridge_pred = ridge.predict(test) svr.fit(X, y) svr_pred = svr.predict(test) gbr.fit(X, y) gbr_pred = gbr.predict(test) rf.fit(X, y) rf_pred = rf.predict(test) stack_model.fit(X, y) stack_pred = stack_model.predict(test) # 加权求和 blended_pred = (0.1 * lgb_pred +#0.05 0.2 * xgb_pred + #0.1 0.05 * ridge_pred + 0.25 * svr_pred + 0.15 * gbr_pred + 0.05 * rf_pred + 0.2 * stack_pred) return blended_pred #评估blending的效果 blended_pred = blending(Xtrain, ytrain, Xtest) blending_score = rmse(ytest, blended_pred) print(blending_score) #----------------------------提交最终预测--------------------------------- y_pred = np.exp(blending(X, y, clean_test)) - 1 sample = pd.DataFrame() sample['Id'] = test_ID sample['SalePrice'] = y_pred sample.head(10) sample.to_csv('saleprice_forecast.csv', index=False)