House Price Prediction 5 -- Regression -- Boosting (AdaBoost / XGBoost)

1. Data preprocessing

import numpy as np
import pandas as pd

train_data = pd.read_csv('E:/机器学习/my_code_kaggle/lesson2/input/train.csv',index_col = 0)
test_data = pd.read_csv('E:/机器学习/my_code_kaggle/lesson2/input/test.csv',index_col = 0)

y_train = train_data['SalePrice']
X_train = train_data.drop(['SalePrice'],axis = 1)

#Check the distribution of the target
#y_train.hist()
#The raw SalePrice is quite skewed; ideally the distribution should look roughly normal, so apply log1p
y_train_log = np.log1p(y_train)
#y_train_log.hist()
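#(Optional sketch, not in the original code: the skewness statistic tells the same story as the histograms)
#print(y_train.skew())       #strongly right-skewed before the transform
#print(y_train_log.skew())   #close to 0 after log1p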

#Concatenate the train and test features so they get identical preprocessing
data = pd.concat((X_train,test_data),axis = 0)

#The numeric codes of MSSubClass carry no ordinal meaning; treat it as a categorical label by casting to string
#MSSubClass is categorical, so represent it with One-Hot encoding
#pandas provides get_dummies, which does One-Hot encoding in one call
data['MSSubClass'] = data['MSSubClass'].astype(str)
#print(data['MSSubClass'].value_counts())


#Once MSSubClass is a string label it behaves like the other text categories, so just dummy-encode everything
dummy_data = pd.get_dummies(data)#the feature count grows from 79 to 302


#Count the missing values per column, then fill them with the column means
#print(dummy_data.isnull().sum().sort_values(ascending = False).head(10))
mean_cols = dummy_data.mean()
dummy_data = dummy_data.fillna(mean_cols)#fill the missing values with mean_cols
#print(dummy_data.isnull().sum().sum())#confirm no missing values remain

#For regression it is usually better to standardize first
#Standardize only the originally numerical columns, not the 0/1 columns produced by One-Hot encoding
numeric_cols = data.columns[data.dtypes != 'object']
#print(numeric_cols)#len(numeric_cols) = 36

numeric_col_means = dummy_data.loc[:,numeric_cols].mean()
numeric_col_std = dummy_data.loc[:,numeric_cols].std()
dummy_data.loc[:,numeric_cols] = (dummy_data.loc[:,numeric_cols]-numeric_col_means)/numeric_col_std

#After processing the features together, split back into the train and test sets
dummy_train = dummy_data.loc[train_data.index,:]
dummy_test = dummy_data.loc[test_data.index,:]
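As a quick sanity check (a minimal sketch, not part of the original pipeline), the split shapes and the missing-value count can be verified before modelling:

#Sanity check: shapes line up and no missing values remain after the mean fill
print(dummy_train.shape, dummy_test.shape)
print(dummy_data.isnull().sum().sum())#expect 0
assert dummy_train.shape[0] == y_train_log.shape[0]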

2. Weak learner: still Ridge

#Ridge as the weak learner
from sklearn import linear_model
ridge = linear_model.Ridge(alpha = 15)
ridge.fit(dummy_train,y_train_log)
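For reference, a minimal sketch (assuming the same 10-fold setup used below) that cross-validates this Ridge model on its own; the 0.135 figure in the summary table comes from this kind of check:

#Baseline: cross-validate the Ridge model by itself
from sklearn.model_selection import cross_val_score
ridge_score = np.sqrt(-cross_val_score(ridge, dummy_train, y_train_log, cv=10, scoring='neg_mean_squared_error'))
print(np.mean(ridge_score))#around 0.135 with this preprocessing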

3. AdaBoost

##Boosting
#AdaBoost
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import cross_val_score

params = [2,3,4,5,6,7,8,9,10]
test_scores = []
for param in params:
    clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)
    test_score = np.sqrt(-cross_val_score(clf, dummy_train, y_train_log, cv=10, scoring='neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))


#显示结果
import matplotlib.pyplot as plt
plt.plot(params,test_scores)
plt.title('n_estimators vs CV Error') 

[figure: CV error vs n_estimators, two separate runs]
The CV error (RMSE) keeps growing, and the result is different on every run; this is probably caused by the randomness in cross validation, and AdaBoost itself is not very stable either.
Expanding the number of weak learners, the RMSE still shows an increasing trend.
[figure: CV error with more estimators]
Looking at this, it seems the fewer estimators, the better?
Occasionally there really is a minimum around 35 estimators, but that is essentially down to luck, and the RMSE is not great either way.
[figure: CV error, occasional minimum around 35 estimators]
Brute-forcing every value from 2 to 50, the RMSE is still an increasing trend.
[figure: CV error for n_estimators from 2 to 50]
Perhaps it is better to keep the number of estimators under 10? Suggestions are welcome!
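Much of the run-to-run variation can be removed by fixing the random seeds. Below is a minimal sketch, not a definitive fix, that pins random_state on both the AdaBoost regressor and the CV splitter (note that scikit-learn 1.2+ renames base_estimator to estimator):

#Reproducible version of the AdaBoost experiment
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import KFold, cross_val_score

cv = KFold(n_splits=10, shuffle=True, random_state=42)#fixed folds across runs
stable_scores = []
for n in [2,3,4,5,6,7,8,9,10]:
    clf = AdaBoostRegressor(n_estimators=n, base_estimator=ridge, random_state=42)#use estimator=ridge on scikit-learn >= 1.2
    rmse = np.sqrt(-cross_val_score(clf, dummy_train, y_train_log, cv=cv, scoring='neg_mean_squared_error'))
    stable_scores.append(np.mean(rmse))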

4. XGBoost

##Boosting
#XGBoost
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth = param)
    test_score = np.sqrt(-cross_val_score(clf,dummy_train,y_train_log,cv = 10,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
    

#显示结果
import matplotlib.pyplot as plt
plt.plot(params,test_scores)
plt.title('max_depth vs CV Error')

[figure: CV error vs max_depth]
With max_depth = 5 the RMSE is already below 0.13; evidently XGBoost is the best model so far.
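As a follow-up, a minimal sketch of fitting the chosen model and producing test-set predictions; the predictions must be mapped back from log space with np.expm1 (the inverse of the log1p applied to SalePrice), and the 'Id'/'SalePrice' submission columns are assumed to follow the usual Kaggle House Prices format:

#Fit the chosen model on the full training set and predict the test set
xgb = XGBRegressor(max_depth = 5)
xgb.fit(dummy_train, y_train_log)
pred = np.expm1(xgb.predict(dummy_test))#undo the log1p transform on SalePrice

submission = pd.DataFrame({'Id': dummy_test.index, 'SalePrice': pred})
submission.to_csv('submission.csv', index=False)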

5. Full code

import numpy as np
import pandas as pd

train_data = pd.read_csv('E:/机器学习/my_code_kaggle/lesson2/input/train.csv',index_col = 0)
test_data = pd.read_csv('E:/机器学习/my_code_kaggle/lesson2/input/test.csv',index_col = 0)

y_train = train_data['SalePrice']
X_train = train_data.drop(['SalePrice'],axis = 1)

#Check the distribution of the target
#y_train.hist()
#The raw SalePrice is quite skewed; ideally the distribution should look roughly normal, so apply log1p
y_train_log = np.log1p(y_train)
#y_train_log.hist()

#Concatenate the train and test features so they get identical preprocessing
data = pd.concat((X_train,test_data),axis = 0)

#The numeric codes of MSSubClass carry no ordinal meaning; treat it as a categorical label by casting to string
#MSSubClass is categorical, so represent it with One-Hot encoding
#pandas provides get_dummies, which does One-Hot encoding in one call
data['MSSubClass'] = data['MSSubClass'].astype(str)
#print(data['MSSubClass'].value_counts())


#Once MSSubClass is a string label it behaves like the other text categories, so just dummy-encode everything
dummy_data = pd.get_dummies(data)#the feature count grows from 79 to 302


#Count the missing values per column, then fill them with the column means
#print(dummy_data.isnull().sum().sort_values(ascending = False).head(10))
mean_cols = dummy_data.mean()
dummy_data = dummy_data.fillna(mean_cols)#fill the missing values with mean_cols
#print(dummy_data.isnull().sum().sum())#confirm no missing values remain

#For regression it is usually better to standardize first
#Standardize only the originally numerical columns, not the 0/1 columns produced by One-Hot encoding
numeric_cols = data.columns[data.dtypes != 'object']
#print(numeric_cols)#len(numeric_cols) = 36

numeric_col_means = dummy_data.loc[:,numeric_cols].mean()
numeric_col_std = dummy_data.loc[:,numeric_cols].std()
dummy_data.loc[:,numeric_cols] = (dummy_data.loc[:,numeric_cols]-numeric_col_means)/numeric_col_std

#After processing the features together, split back into the train and test sets
dummy_train = dummy_data.loc[train_data.index,:]
dummy_test = dummy_data.loc[test_data.index,:]

#Ridge as the weak learner
from sklearn import linear_model
ridge = linear_model.Ridge(alpha = 15)
ridge.fit(dummy_train,y_train_log)

##Boosting
#AdaBoost (commented out; see section 3)

#from sklearn.ensemble import AdaBoostRegressor
#from sklearn.model_selection import cross_val_score
#
#params = [10,15,20,25,30,35,40]
#test_scores = []
#for param in params:
#    clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)
#    test_score = np.sqrt(-cross_val_score(clf, dummy_train, y_train_log, cv=10, scoring='neg_mean_squared_error'))
#    test_scores.append(np.mean(test_score))


##Boosting
#XGBoost
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

params = [1,2,3,4,5,6]
test_scores = []
for param in params:
    clf = XGBRegressor(max_depth = param)
    test_score = np.sqrt(-cross_val_score(clf,dummy_train,y_train_log,cv = 10,scoring = 'neg_mean_squared_error'))
    test_scores.append(np.mean(test_score))
    

#显示结果
import matplotlib.pyplot as plt
plt.plot(params,test_scores)
plt.title('max_depth vs CV Error')

In summary:

Model                                     CV error (RMSE)
Ridge (alpha = 15)                        0.135
Random Forest (max_features = 0.3)        0.137
AdaBoost (parameters still a mystery!)    0.14
XGBoost (max_depth = 5)                   0.125