1、train.csv
这里采用的数据集来自于 Kaggle | Allstate Claims Severity 比赛。训练集如下所示：共有 116 个离散特征(cat1-cat116)和 14 个连续特征(cont1-cont14)。
2、xgboost代码分析
import numpy as np
import pandas as pd
import xgboost as xgb
import operator
import matplotlib.pyplot as plt
# Build the XGBoost feature-map file so Booster.get_fscore(fmap=...) can
# report importances by feature NAME instead of the default f0/f1/... labels.
# Each line has the form "<index>\t<name>\tq" ('q' marks a quantitative feature).
def ceate_feature_map(features, fmap_path='xgb.fmap'):
    """Write *features* to an XGBoost feature-map file.

    Parameters
    ----------
    features : iterable of str
        Feature names, in the same column order as the training DMatrix.
    fmap_path : str, optional
        Output file path. Defaults to 'xgb.fmap' to stay compatible with
        the existing call in ``__main__``.
    """
    # NOTE(review): the function name keeps the original typo ("ceate")
    # so existing callers continue to work.
    # 'with' guarantees the file is closed even if a write fails;
    # enumerate replaces the original hand-rolled i counter.
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
if __name__ == '__main__':
    # Allstate Claims Severity training data: 116 categorical (cat*) and
    # 14 continuous (cont*) features, target column 'loss'.
    train = pd.read_csv("../input/train.csv")

    # Categorical features are strings; pd.factorize maps them to integer
    # codes, e.g. pd.factorize(values, sort=True) -> (array([0, 0, ..., 1]),
    # array(['A', 'B'], dtype=object)). The +1 shifts codes to start at 1,
    # preserving the original script's encoding.
    cat_sel = [n for n in train.columns if n.startswith('cat')]
    for column in cat_sel:
        train[column] = pd.factorize(train[column].values, sort=True)[0] + 1

    # NOTE(review): 'verbose_eval' is an xgb.train() keyword argument, not a
    # booster parameter; it is kept here unchanged to preserve behavior.
    params = {
        'min_child_weight': 100,
        'eta': 0.02,
        'colsample_bytree': 0.7,
        'max_depth': 12,
        'subsample': 0.7,
        'alpha': 1,
        'gamma': 1,
        'silent': 1,
        'verbose_eval': True,
        'seed': 12
    }
    rounds = 10

    # Regression target.
    y = train['loss']
    # Drop the target and the row identifier from the design matrix.
    # BUG FIX: pass axis as a keyword — the positional form
    # train.drop([...], 1) was deprecated and removed in pandas 2.0.
    X = train.drop(['loss', 'id'], axis=1)

    xgtrain = xgb.DMatrix(X, label=y)
    bst = xgb.train(params, xgtrain, num_boost_round=rounds)

    # Derive the feature list from X itself so the feature map is guaranteed
    # to match the DMatrix column order (the original recomputed it with a
    # second comprehension over train.columns).
    features = list(X.columns)
    ceate_feature_map(features)

    # get_fscore returns {feature_name: split_count}; sort ascending by count
    # so the most important features end up at the top of the barh plot.
    importance = bst.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    # Normalize raw split counts to relative importances summing to 1.
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv("./feat_importance.csv", index=False)

    # FIX: the original bare plt.figure() opened an extra empty window,
    # because DataFrame.plot creates its own figure; use the axes it returns.
    ax = df.plot(kind='barh', x='feature', y='fscore',
                 legend=False, figsize=(6, 10))
    ax.set_title('XGBoost Feature Importance')
    ax.set_xlabel('relative importance')
    plt.show()