XGBoost实践
xgboost的官方网站上提供了它的详细简介,可供参考。
下面给出利用xgboost来预测泰坦尼克号存活的实例来了解xgboost
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the dataset
data = pd.read_csv('Titanic.train.csv')

# --- Data cleaning ---

# Encode the 'Sex' column as an integer (female -> 0, male -> 1).
data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Fill missing fares with the MEDIAN fare of the passenger's class.
# (Pclass takes the values 1, 2, 3, hence f + 1 below.)
if data.Fare.isnull().any():
    fare = np.zeros(3)
    for f in range(0, 3):
        fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
    for f in range(0, 3):
        data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]

# Age: replace missing values with the mean age.
# (A regression model, e.g. a random forest, could predict them instead.)
mean_age = data['Age'].dropna().mean()
data.loc[(data.Age.isnull()), 'Age'] = mean_age

# Embarked: fill missing ports with 'S' (the most common port of embarkation),
# then one-hot encode the column into Embarked_C / Embarked_Q / Embarked_S.
data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
embarked_data = pd.get_dummies(data.Embarked)
embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
# Append the one-hot columns to the original frame.
data = pd.concat([data, embarked_data], axis=1)
# --- Train an XGBoost classifier ---

# Feature matrix: class, sex, age, family counts, fare, and the one-hot
# encoded port of embarkation; target: survival (0/1).
x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
y = data['Survived']

# Hold out 25% of the rows for evaluation (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

# Convert to xgboost's optimized DMatrix format.
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
# Evaluation sets reported during training.
watch_list = [(data_test, 'eval'), (data_train, 'train')]

# NOTE: the old 'silent' parameter was deprecated in xgboost 1.0 and later
# removed; 'verbosity' (0 = silent) is the supported replacement.
param = {'max_depth': 6, 'eta': 0.8, 'verbosity': 0, 'objective': 'binary:logistic'}

# Train for 100 boosting rounds, reporting metrics on watch_list.
bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)

# binary:logistic predicts survival probabilities; threshold at 0.5 to get
# hard class labels.
y_hat = (bst.predict(data_test) > 0.5).astype(int)

# Accuracy of the hard predictions on the held-out set.
xgb_acc = accuracy_score(y_test, y_hat)