代码来源
关于xgboost的调参可参考这篇文章
或官网
流程如下:
1. 读取测试集,训练集
2. 根据相关性手动选取特征
3. 选取测试集
4. 填充缺失值
5. 处理非数字型数据
6. 训练算法
7. 存储数据
# 引入
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    """Impute missing values in a DataFrame, one fill value per column.

    Columns holding labels (object, string or categorical dtype) are filled
    with their most frequent value; every other column is filled with its
    median.
    """

    def fit(self, X, y=None):
        """Compute the per-column fill values from ``X``.

        Args:
            X: DataFrame the fill values are derived from.
            y: Unused; accepted so the signature matches the transformer API.

        Returns:
            self, with ``self.fill`` set to a Series indexed by ``X.columns``.
        """
        self.fill = pd.Series(
            [self._fill_value(X[c]) for c in X],
            index=X.columns,
        )
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing values replaced column by column.

        Each column is filled with the value stored for it in ``self.fill``
        by ``fit``.
        """
        return X.fillna(self.fill)

    @staticmethod
    def _fill_value(column):
        """Return the fill value for one column (most frequent value or median)."""
        dtype = column.dtype
        # Besides plain object columns, also treat string and categorical
        # dtypes as labels: calling median() on them raises TypeError.
        is_label_column = (
            dtype == np.dtype('O')
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not is_label_column:
            return column.median()

        counts = column.value_counts()
        # value_counts() ignores NaN, so an all-missing column produces an
        # empty result (or, for categoricals, only zero counts). There is no
        # meaningful fill value then; return NaN instead of raising IndexError.
        if len(counts) and counts.iloc[0] > 0:
            return counts.index[0]
        return np.nan
# Load the data
train_df = pd.read_csv('train.csv', header=0)
test_df = pd.read_csv('test.csv', header=0)

# Hand-picked features
feature_columns_to_use = ['Pclass', 'Sex', 'Age', 'Fare', 'Parch']
# The 'Sex' column of the csv files holds non-numeric values
nonnumeric_columns = ['Sex']

# Stack the selected train and test features into a single frame
# (the two sets are distributed slightly differently).
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
big_X = pd.concat(
    [train_df[feature_columns_to_use], test_df[feature_columns_to_use]]
)

# fit_transform runs fit and then transform, filling in the missing values
big_X_imputed = DataFrameImputer().fit_transform(big_X)

# XGBoost cannot handle non-numeric data, so encode the strings as integers
le = LabelEncoder()
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])
# Prepare the model inputs: training matrix, test matrix and training labels.
# The combined frame holds the training rows first, followed by the test rows,
# so split it back by position.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
n_train = train_df.shape[0]
train_X = big_X_imputed.iloc[:n_train].to_numpy()
test_X = big_X_imputed.iloc[n_train:].to_numpy()
train_y = train_df['Survived']
# Train the gradient-boosted classifier
gbm = xgb.XGBClassifier(
    max_depth=3,
    n_estimators=300,
    learning_rate=0.05,
)
gbm = gbm.fit(train_X, train_y)

# Predict labels for the test set
predictions = gbm.predict(test_X)

# Write the predictions out in csv format
submission = pd.DataFrame(
    {
        'PassengerId': test_df['PassengerId'],
        'Survived': predictions,
    }
)
submission.to_csv("submission.csv", index=False)