分别使用回归树与XGBoost回归,预测实验三中给出的Advertising.csv数据集,并与传统线性回归预测方法进行比较。
具体要求:
- 首先进行数据标准化。
- 测试集和训练集比例分别为30%和70%。
- 使用均方误差来评价预测的好坏程度。
- 对于XGBoost,请尝试使用交叉验证找到n_estimators的最优参数值,n_estimators的取值范围为[100, 1000]。
回归树:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
# Location of the advertising dataset (a relative path).
train_path = r'Advertising.csv'
def read_train_file(path) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument
            (a file path or an open text buffer).

    Returns:
        The parsed contents, with the CSV header row as column names.
    """
    return pd.read_csv(path)
# Regression tree
def RegressionTree(data):
    """Fit a depth-limited regression tree and print its test-set metrics.

    The rows are split 70/30 with ``random_state=1``, the three feature
    columns are standardized, a ``DecisionTreeRegressor(max_depth=5)`` is
    fitted on the training part, and the R² score and the mean squared error
    on the held-out part are printed.

    Args:
        data: DataFrame with the feature columns ``TV``, ``Radio`` and
            ``Newspaper`` and the target column ``Sales``.
    """
    # Imported here because this script's import header does not provide it.
    from sklearn.preprocessing import StandardScaler

    X = data[['TV', 'Radio', 'Newspaper']]
    Y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

    # Standardize the features. The scaler is fitted on the training rows only
    # so that no statistics of the test rows leak into the model input.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    dt_reg = DecisionTreeRegressor(max_depth=5)
    dt_reg.fit(x_train, y_train)

    # For a regressor, score() is the coefficient of determination (R²),
    # not an accuracy percentage, so it is reported under that name.
    score = dt_reg.score(x_test, y_test)
    print("回归树R²得分: ", score)

    y_pred = dt_reg.predict(x_test)
    print("回归树均方误差:", mean_squared_error(y_test, y_pred))
# Script entry point: load the dataset and evaluate the regression tree on it.
if __name__ == '__main__':
    print("read train file.....")
    data = read_train_file(train_path)
    RegressionTree(data)
XGBoost回归:
通过交叉验证找到n_estimators的最优参数值(调参):
import pandas as pd
from pylab import *
from sklearn.model_selection import train_test_split
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
# Load the advertising dataset and separate the three feature columns
# from the Sales target.
train_path = r'Advertising.csv'
data = pd.read_csv(train_path)
X = data[['TV', 'Radio', 'Newspaper']]
Y = data['Sales']
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
def modelfit(alg, dtrain, dlable, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Choose ``n_estimators`` for *alg* by cross-validation, then fit it.

    When ``useTrainCV`` is true, ``xgb.cv`` is run with the estimator's own
    booster parameters for at most its current ``n_estimators`` rounds, scored
    by RMSE with early stopping. The number of rows in the returned history is
    printed and stored back into *alg* as ``n_estimators``. In either case the
    estimator is finally fitted on ``(dtrain, dlable)``.

    Args:
        alg: An XGBoost scikit-learn style estimator; it is modified in place.
        dtrain: Feature matrix used for both cross-validation and the final fit.
        dlable: Target values matching the rows of ``dtrain``.
        useTrainCV: Whether to run the cross-validated search before fitting.
        cv_folds: Number of cross-validation folds.
        early_stopping_rounds: Stop the search once the CV metric has not
            improved for this many consecutive rounds.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=dlable)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='rmse',
            early_stopping_rounds=early_stopping_rounds,
            # Per-round progress (with fold std-dev) is requested through the
            # built-in switches instead of the legacy callback factories
            # (print_evaluation / early_stop), which newer XGBoost releases no
            # longer provide and which duplicated the early stopping above.
            verbose_eval=True,
            show_stdv=True,
        )
        # With early stopping the history ends at the best round, so its
        # length is the tuned number of boosting rounds.
        print("n_estimators:", cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the estimator on the data. No eval_set is supplied, so an evaluation
    # metric would never be computed here and is therefore not passed.
    alg.fit(dtrain, dlable)
# Base estimator for the n_estimators search. n_estimators=1000 is only the
# upper bound on boosting rounds; the cross-validation inside modelfit decides
# how many of them are actually kept.
xgb1 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    # Plain squared-error regression; the former 'gpu:reg:linear' objective
    # needs a GPU build and is not available in later XGBoost releases.
    objective='reg:squarederror',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
# Tune on the training split only, so the held-out test rows do not influence
# the choice of n_estimators.
modelfit(xgb1, x_train, y_train)
XGBoost模型:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# Location of the advertising dataset (a relative path).
train_path = r'Advertising.csv'
def read_train_file(path) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument
            (a file path or an open text buffer).

    Returns:
        The parsed contents, with the CSV header row as column names.
    """
    return pd.read_csv(path)
def xgboost(data):
    """Fit an XGBoost regressor on the advertising data and print test metrics.

    The rows are split 70/30 with ``random_state=1``, the three feature
    columns are standardized, an ``XGBRegressor`` with 187 trees is fitted on
    the training part, and the R² score and the mean squared error on the
    held-out part are printed.

    Args:
        data: DataFrame with the feature columns ``TV``, ``Radio`` and
            ``Newspaper`` and the target column ``Sales``.
    """
    X = data[['TV', 'Radio', 'Newspaper']]
    Y = data['Sales']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

    # Standardize the feature values. Splitting first and fitting the scaler on
    # the training rows only keeps test-set statistics out of the model input.
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)

    # n_estimators is fixed at 187, the value found by the earlier tuning run.
    model = xgb.XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=187, silent=False, objective='reg:gamma')
    model.fit(x_train, y_train)

    # Evaluate on the test set. For a regressor, score() is the coefficient of
    # determination (R²), not an accuracy percentage.
    score = model.score(x_test, y_test)
    print("xgboost R²得分: ", score)

    y_pred = model.predict(x_test)
    print("xgboost均方误差:", mean_squared_error(y_test, y_pred))
# Script entry point: load the dataset and evaluate the XGBoost model on it.
if __name__ == '__main__':
    print("read train file.....")
    data = read_train_file(train_path)
    xgboost(data)