XGBoost & RandomForest Feature Selection

Feature selection: a parameter comparison between XGBoost and RandomForest.

| XGBoost | Meaning | RandomForest |
| --- | --- | --- |
| max_depth | maximum tree depth | max_depth |
| n_estimators | number of trees (boosting rounds for XGBoost) | n_estimators |
| learning_rate | learning rate (shrinkage applied per boosting round) | — |
| subsample | fraction of training samples randomly drawn to build each tree | — |
| colsample_bytree | fraction of features (columns) randomly drawn for each tree | — |
| scale_pos_weight | addresses class imbalance; set to the negative:positive sample ratio | class_weight |
| reg_lambda | L2 regularization coefficient (default 1) | — |
| gamma | minimum loss reduction required for a further split; the coefficient on the number of leaves in the regularization term | — |
| — | whether to evaluate the model on out-of-bag samples (default False) | oob_score |
| — | maximum number of features considered per split | max_features |
| — | minimum number of samples required at a leaf node | min_samples_leaf |
| random_state | random seed | random_state |
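As a quick orientation, here is a minimal sketch (not from the original) instantiating both classifiers with roughly corresponding settings. The 'label' column name follows the training code below, and deriving the imbalance ratio from the data is one common convention for scale_pos_weight / class_weight.

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Negative:positive ratio, reused for scale_pos_weight / class_weight.
neg = (train_set['label'] == 0).sum()
pos = (train_set['label'] == 1).sum()
ratio = neg / pos

xgb_clf = XGBClassifier(
    max_depth=7, n_estimators=140, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=ratio,          # negative:positive ratio
    reg_lambda=1.0, random_state=7)

rf_clf = RandomForestClassifier(
    max_depth=7, n_estimators=140,
    max_features='sqrt', min_samples_leaf=20,
    class_weight={0: 1, 1: ratio},   # up-weight the positive (minority) class
    oob_score=True, random_state=7)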

XGBoost parameter tuning and feature selection

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

random_state = 7  # fixed seed for reproducibility

param_test = {
    # Earlier tuning stages (commented out once their best values were found):
    # 'max_depth': range(5, 15, 1),
    # 'min_child_weight': range(1, 6, 2),
    # 'scale_pos_weight': [i for i in range(10, int(scale_pos_weight), 10)],
    # 'gamma': [i / 10.0 for i in range(0, 11, 1)],
    'reg_lambda': [i / 10.0 for i in range(10, 200, 20)]
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=7,
        min_child_weight=1,
        gamma=0.7,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=4,
        scale_pos_weight=20,
        seed=random_state),
    param_grid=param_test,
    scoring='roc_auc',
    cv=5)
gsearch.fit(X, Y)  # X, Y: training features and labels

gsearch.best_params_
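The commented-out grids above hint at staged tuning: one group of parameters is searched at a time, and each stage's winners are frozen into the estimator before the next grid runs. A minimal sketch of that loop (the stage list here is illustrative, not from the original):

stages = [
    {'max_depth': range(5, 15, 1), 'min_child_weight': range(1, 6, 2)},
    {'gamma': [i / 10.0 for i in range(0, 11)]},
    {'reg_lambda': [i / 10.0 for i in range(10, 200, 20)]},
]
best = dict(learning_rate=0.1, n_estimators=140, nthread=4, seed=random_state)
for grid in stages:
    gs = GridSearchCV(XGBClassifier(**best), param_grid=grid,
                      scoring='roc_auc', cv=5)
    gs.fit(X, Y)
    best.update(gs.best_params_)   # freeze this stage's winners
print(best)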

# Refit a model with the tuned parameters to obtain feature importances.
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=7,
    min_child_weight=1,
    gamma=0.7,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=20,
    seed=random_state)
model.fit(X, Y)  # must be fitted before feature_importances_ is available
# Print the importance scores
print(model.feature_importances_)

# Plot feature importance using xgboost's built-in function
from xgboost import plot_importance
from matplotlib import pyplot
plot_importance(model)
pyplot.show()
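Note that plot_importance ranks by 'weight' (the number of times a feature is used to split) by default, while feature_importances_ in recent xgboost versions is gain-based, so the two rankings can differ. To align them and cap the plot size:

plot_importance(model, importance_type='gain', max_num_features=20)
pyplot.show()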

# Sort features by feature_importances_
import pandas as pd
pyplot.bar(train_set.columns[1:], model.feature_importances_)  # columns[0] is assumed to be the label
pyplot.show()
imp_score = pd.DataFrame(data=model.feature_importances_,
                         index=train_set.columns[1:],
                         columns=['score']).sort_values(by='score', ascending=False)
print(imp_score)
imp_score['score'][:6]  # peek at the top-6 scores


# Plot the top-N features
import matplotlib.pyplot as plt
import seaborn as sns

sns_colors = sns.color_palette('colorblind')  # palette for the bar color
# excluded_features (non-feature columns) and keys (already-selected features)
# are assumed to be defined elsewhere.
feature_length = len([k for k in train_set if k not in excluded_features])
not_used_features = [k for k in train_set if k not in excluded_features and k not in keys]
N = 80
ax2 = (imp_score.iloc[0:N][::-1]
       .plot(kind='barh',
             color=sns_colors[0],
             title='Gain feature importances',
             figsize=(20, 15)))
ax2.grid(False, axis='y')
plt.show()


# Select features by thresholding feature_importances_
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

accuracy_max = 0
n_min = 0
threshold_min = 0
# Fit a model using each importance value as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # Select the features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(train_set[[k for k in train_set if k not in excluded_features]])
    # Train a model on the reduced feature set
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, train_set['label'])
    # Evaluate on the test set
    select_X_test = selection.transform(test_set[[k for k in test_set if k not in excluded_features]])
    y_pred = selection_model.predict(select_X_test)

    y_test = test_set['label']
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    # Note: AUC from hard 0/1 predictions is pessimistic;
    # selection_model.predict_proba(select_X_test)[:, 1] gives a proper ROC.
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    auc = metrics.auc(fpr, tpr)
    cm = metrics.confusion_matrix(y_test, predictions)

    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]
    fpr = FP / (FP + TN)  # false positive rate
    print('AUC:{0},fp:{1},tp:{2},recall:{3},fpr:{4}'.format(
        round(auc, 5), FP, TP, round(recall, 5), round(fpr, 5)))
    # Track the smallest feature set with the best accuracy:
    # if accuracy > accuracy_max:
    #     accuracy_max = accuracy
    #     n_min = select_X_train.shape[1]
    #     threshold_min = thresh
    print("Thresh=%.3f, n=%d, AUC=%.3f, Accuracy: %.2f%%" % (
        thresh, select_X_train.shape[1], auc, accuracy * 100.0))
    
# Inspect the metrics on the validation set to decide how many top features to keep:
# the goal is to reach comparably good metrics with as few features as possible.

N = 46  # keep the top 46 features
feature_columns = imp_score['score'].index[:N].tolist()
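A natural follow-up (not in the original) is to retrain on just the selected columns and confirm the test-set metric, here sketched with the tuned parameters from above:

final_model = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=7,
                            min_child_weight=1, gamma=0.7, subsample=0.8,
                            colsample_bytree=0.8, scale_pos_weight=20,
                            seed=random_state)
final_model.fit(train_set[feature_columns], train_set['label'])
# Probability-based AUC on the held-out set
proba = final_model.predict_proba(test_set[feature_columns])[:, 1]
print('test AUC:', metrics.roc_auc_score(test_set['label'], proba))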

RandomForestClassifier parameter tuning and feature selection

# Parameter tuning (the XGBoost-only grids from the previous section do not apply here)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_test = {
    # Earlier tuning stages (commented out once their best values were found):
    # 'max_depth': range(5, 15, 1),
    # 'n_estimators': range(150, 300, 10),
    'class_weight': [{0: 30, 1: 1}, {0: 40, 1: 1}]  # weights for the imbalanced classes
}
gsearch = GridSearchCV(
    estimator=RandomForestClassifier(
        # max_depth=7,           # tree depth
        # n_estimators=300,      # number of trees
        # oob_score=True,        # evaluate with out-of-bag samples (default False)
        # max_features=50,       # max number of features considered per split
        # min_samples_split=20,  # a node with fewer samples than this will not be split
        # min_samples_leaf=20,   # minimum samples required at a leaf
        # random_state=7         # RandomForestClassifier uses random_state, not seed
    ),
    param_grid=param_test,
    scoring='roc_auc',
    cv=5)
gsearch.fit(train_set[[k for k in train_set if k not in excluded_features]], train_set['label'])

gsearch.best_params_

# Refit a model with the tuned parameters to obtain feature importances.

model = RandomForestClassifier(
    # max_depth=7,           # tree depth
    # n_estimators=300,      # number of trees
    # oob_score=True,        # evaluate with out-of-bag samples (default False)
    # max_features=50,       # max number of features considered per split
    # min_samples_split=20,  # a node with fewer samples than this will not be split
    # min_samples_leaf=20,   # minimum samples required at a leaf
    # class_weight={0: 30, 1: 1},  # weights for the imbalanced classes
    random_state=random_state)  # RandomForestClassifier uses random_state, not seed
model.fit(X, Y)
# Print the importance scores
print(model.feature_importances_)

# Plot and sort the importances
from matplotlib import pyplot
pyplot.bar(train_set.columns[1:], model.feature_importances_)
pyplot.show()
imp_score = pd.DataFrame(data=model.feature_importances_,
                         index=train_set.columns[1:],
                         columns=['score']).sort_values(by='score', ascending=False)
print(imp_score)
imp_score['score'][:6]  # peek at the top-6 scores

# Rebuild the score table (imp_scores is equivalent to imp_score here)
keys = list(imp_score.index)
values = list(imp_score['score'].values)

imp_scores = pd.DataFrame(data=values, index=keys, columns=['score']).sort_values(by='score', ascending=False)
# Plot the top-N features
N = 80
ax2 = (imp_scores.iloc[0:N][::-1]
       .plot(kind='barh',
             color=sns_colors[0],  # sns_colors as defined above
             title='RF feature importances',
             figsize=(20, 15)))
ax2.grid(False, axis='y')
plt.show()


# Select features by thresholding feature_importances_
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

accuracy_max = 0
n_min = 0
threshold_min = 0
# Fit a model using each importance value as a threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # Select the features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(train_set[[k for k in train_set if k not in excluded_features]])
    # Train a model on the reduced feature set
    selection_model = RandomForestClassifier()
    selection_model.fit(select_X_train, train_set['label'])
    # Evaluate on the test set
    select_X_test = selection.transform(test_set[[k for k in test_set if k not in excluded_features]])
    y_pred = selection_model.predict(select_X_test)

    y_test = test_set['label']
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    auc = metrics.auc(fpr, tpr)
    cm = metrics.confusion_matrix(y_test, predictions)

    TN, FP = cm[0][0], cm[0][1]
    FN, TP = cm[1][0], cm[1][1]
    fpr = FP / (FP + TN)  # false positive rate
    print('AUC:{0},fp:{1},tp:{2},recall:{3},fpr:{4}'.format(
        round(auc, 5), FP, TP, round(recall, 5), round(fpr, 5)))
    # Track the smallest feature set with the best accuracy:
    # if accuracy > accuracy_max:
    #     accuracy_max = accuracy
    #     n_min = select_X_train.shape[1]
    #     threshold_min = thresh
    print("Thresh=%.3f, n=%d, AUC=%.3f, Accuracy: %.2f%%" % (
        thresh, select_X_train.shape[1], auc, accuracy * 100.0))
# Inspect the metrics on the validation set to decide how many top features to keep:
# the goal is to reach comparably good metrics with as few features as possible.

N = 46  # keep the top 46 features
feature_columns = imp_score['score'].index[:N].tolist()
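Since both models now yield a ranked list, one option (an addition, not from the original) is to compare the two selections and keep the features they agree on; feature_columns_xgb and feature_columns_rf below are hypothetical names for the two lists produced above:

# Hypothetical names: the feature_columns lists from the XGBoost and RF runs.
agreed = [f for f in feature_columns_xgb if f in feature_columns_rf]    # picked by both, XGB order
either = list(dict.fromkeys(feature_columns_xgb + feature_columns_rf))  # union, order preserved
print(len(agreed), 'features selected by both models')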