Feature selection: XGBoost vs. RF parameter comparison
| XGBoost | Meaning | RF |
| --- | --- | --- |
| max_depth | maximum tree depth | max_depth |
| n_estimators | number of trees (boosting rounds for XGBoost) | n_estimators |
| learning_rate | learning rate | |
| subsample | fraction of training samples randomly drawn to build each tree | |
| colsample_bytree | fraction of feature columns randomly drawn for each tree | |
| scale_pos_weight | handles class imbalance; the negative-to-positive sample ratio | class_weight |
| reg_lambda | L2 regularization coefficient, default 1 | |
| gamma | coefficient on the penalty for the number of leaf nodes, i.e. the minimum loss reduction required to make a split | |
| | whether to evaluate the model on out-of-bag samples; default False | oob_score |
| | maximum number of features considered | max_features |
| | minimum number of samples required at a leaf node | min_samples_leaf |
| random_state | random seed | random_state |
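The two imbalance knobs map onto each other: XGBoost's scale_pos_weight is typically set to (negatives / positives), while RF's class_weight takes an explicit per-class dict. A minimal sketch of that mapping, assuming a binary label array y (the counts here are made-up):

import numpy as np

y = np.array([0] * 950 + [1] * 50)   # toy imbalanced labels (assumption)
neg, pos = int(np.sum(y == 0)), int(np.sum(y == 1))

xgb_kwargs = {'scale_pos_weight': neg / pos}        # XGBoost: upweight the positive class
rf_kwargs = {'class_weight': {0: 1, 1: neg / pos}}  # RF: the equivalent per-class dict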
XGBoost parameter tuning and feature selection
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
param_test = {
    # 'max_depth': range(5, 15, 1),
    # 'min_child_weight': range(1, 6, 2),
    # 'scale_pos_weight': [i for i in range(10, int(scale_pos_weight), 10)],
    # 'gamma': [i / 10.0 for i in range(0, 11, 1)],
    'reg_lambda': [i / 10.0 for i in range(10, 200, 20)]  # i.e. 1.0, 3.0, ..., 19.0
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=7,
        min_child_weight=1,
        gamma=0.7,
        subsample=0.8,
        colsample_bytree=0.8,
        nthread=4,
        scale_pos_weight=20,
        random_state=random_state),
    param_grid=param_test,
    scoring='roc_auc',
    cv=5)
gsearch.fit(X, Y)
print(gsearch.best_params_)
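# A hedged aside: GridSearchCV also exposes best_score_ and cv_results_
# (standard sklearn attributes), so the winning entry of each pass can be
# folded back into the fixed kwargs before tuning the next parameter group.
print(gsearch.best_score_)                # mean CV roc_auc of the best entry
base_params = dict(learning_rate=0.1, n_estimators=140, max_depth=7)
base_params.update(gsearch.best_params_)  # merge the tuned values over the fixed ones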
# Retrain with the best hyperparameter combination found by the search
model = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=7,
    min_child_weight=1,
    gamma=0.7,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=20,
    random_state=random_state)
model.fit(X, Y)  # the model must be fitted before feature_importances_ is available
# Print the feature importances
print(model.feature_importances_)
# Plot feature importance using XGBoost's built-in function
import pandas as pd
from matplotlib import pyplot
from xgboost import plot_importance
plot_importance(model)
pyplot.show()
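# A brief aside, not in the original pipeline: plot_importance defaults to
# importance_type='weight' (number of splits per feature); 'gain' and 'cover'
# are the other built-in views and often rank features differently.
plot_importance(model, importance_type='gain', max_num_features=20)
pyplot.show()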
# Sort feature_importances_ (assumes column 0 of train_set is the label)
pyplot.bar(train_set.columns[1:], model.feature_importances_)
pyplot.show()
imp_score = pd.DataFrame(data=model.feature_importances_,
                         index=train_set.columns[1:],
                         columns=['score']).sort_values(by='score', ascending=False)
print(imp_score)
print(imp_score['score'][:6])  # top-6 scores
# Plot the top-N features
import matplotlib.pyplot as plt
import seaborn as sns
sns_colors = sns.color_palette('colorblind')  # palette for the bar chart
keys = list(imp_score.index)
feature_length = len([k for k in train_set if k not in excluded_features])
not_used_features = [k for k in train_set if k not in excluded_features and k not in keys]
N = 80
ax2 = (imp_score.iloc[0:N][::-1]
       .plot(kind='barh',
             color=sns_colors[0],
             title='Gain feature importances',
             figsize=(20, 15)))
ax2.grid(False, axis='y')
plt.show()
# Select features via feature_importances_
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

accuracy_max = 0
n_min = 0
threshold_min = 0
# Fit one model per importance value, using it as the selection threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(train_set[[k for k in train_set if k not in excluded_features]])
    # train a fresh model on the reduced feature set
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, train_set['label'])
    # evaluate on the test set
    select_X_test = selection.transform(test_set[[k for k in test_set if k not in excluded_features]])
    y_pred = selection_model.predict(select_X_test)
    y_test = test_set['label']
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    auc = metrics.auc(fpr, tpr)
    # print('fpr:{0}_tpr:{1}'.format(fpr, tpr))
    cm = metrics.confusion_matrix(y_test, predictions)
    TN = cm[0][0]
    FN = cm[1][0]
    TP = cm[1][1]
    FP = cm[0][1]
    # tpr = TP / (TP + FN)
    fpr = FP / (FP + TN)  # overwrite the curve output with the scalar false-positive rate
    print('AUC:{0},fp:{1},tp:{2},recall:{3},fpr:{4}'.format(round(auc, 5), FP, TP, round(recall, 5), round(fpr, 5)))
    # if accuracy_max > accuracy:
    #     continue
    # else:
    #     accuracy_max = accuracy
    #     n_min = select_X_train.shape[1]
    #     threshold_min = thresh
    print("Thresh=%.3f, n=%d, AUC=%.3f, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], auc, accuracy * 100.0))
# Inspect the metrics on the validation set to decide how many top features to keep:
# the goal is to reach comparable performance with as few features as possible
# (see the sketch below for one way to automate this choice).
N = 46  # keep the top 46 features
feature_columns = imp_score['score'].index[:N].tolist()
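One way to automate that choice, as a hedged sketch: assume the loop above is extended to append (thresh, n, auc) tuples to a list named results, then keep the smallest feature count whose AUC stays within a tolerance of the best.

# results = [(thresh, n, auc), ...] collected inside the loop above (assumption)
tolerance = 0.005
best_auc = max(r[2] for r in results)
# among entries close to the best AUC, keep the one with the fewest features
thresh_sel, n_sel, auc_sel = min(
    (r for r in results if best_auc - r[2] <= tolerance),
    key=lambda r: r[1])
print('keep n=%d features (thresh=%.3f, AUC=%.3f)' % (n_sel, thresh_sel, auc_sel))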
RandomForestClassifier parameter tuning and feature selection
# Parameter tuning (the XGBoost-only options from the grid above, such as
# min_child_weight, gamma and reg_lambda, do not apply to RandomForestClassifier)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_test = {
    # 'max_depth': range(5, 15, 1),
    # 'n_estimators': range(150, 300, 10),
    'class_weight': [{0: 30, 1: 1}, {0: 40, 1: 1}]  # weights for the imbalanced classes
}
gsearch = GridSearchCV(
    estimator=RandomForestClassifier(
        # max_depth=7,           # maximum tree depth
        # n_estimators=300,      # number of trees
        # oob_score=True,        # evaluate on out-of-bag samples; default False
        # max_features=50,       # max features considered when looking for a split
        # min_samples_split=20,  # do not split a node holding fewer samples than this
        # min_samples_leaf=20,   # minimum number of samples required at a leaf
        # class_weight={0: 30, 1: 1},  # weights for the imbalanced classes
        # random_state=7
    ),
    param_grid=param_test,
    scoring='roc_auc',
    cv=5)
gsearch.fit(train_set[[k for k in train_set if k not in excluded_features]], train_set["label"])
print(gsearch.best_params_)
# Retrain a model with the best hyperparameter combination found by the search
model = RandomForestClassifier(
    # max_depth=7,           # maximum tree depth
    # n_estimators=300,      # number of trees
    # oob_score=True,        # evaluate on out-of-bag samples; default False
    # max_features=50,       # max features considered when looking for a split
    # min_samples_split=20,  # do not split a node holding fewer samples than this
    # min_samples_leaf=20,   # minimum number of samples required at a leaf
    # class_weight={0: 30, 1: 1},  # weights for the imbalanced classes
    random_state=random_state)
model.fit(X,Y)
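# A hedged aside: with oob_score=True, a fitted RandomForestClassifier exposes
# oob_score_, an out-of-bag accuracy estimate that serves as a validation-style
# score without a held-out set. model_oob below is illustrative only.
model_oob = RandomForestClassifier(n_estimators=300, oob_score=True, random_state=random_state)
model_oob.fit(X, Y)
print(model_oob.oob_score_)  # mean accuracy on the out-of-bag samples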
# Print the feature importances
print(model.feature_importances_)
# Plot the importances (assumes column 0 of train_set is the label)
from matplotlib import pyplot
pyplot.bar(train_set.columns[1:], model.feature_importances_)
pyplot.show()
imp_score = pd.DataFrame(data=model.feature_importances_,
                         index=train_set.columns[1:],
                         columns=['score']).sort_values(by='score', ascending=False)
print(imp_score)
print(imp_score['score'][:6])  # top-6 scores
# Build a sorted score table
keys = list(imp_score.index)
values = list(imp_score['score'].values)
imp_scores = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by="score", ascending=False)
# Plot the top-N features
N = 80
ax2 = (imp_scores.iloc[0:N][::-1]
       .plot(kind='barh',
             color=sns_colors[0],
             title='RF feature importances',
             figsize=(20, 15)))
ax2.grid(False, axis='y')
plt.show()
# Select features via feature_importances_
from numpy import sort
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn import metrics

accuracy_max = 0
n_min = 0
threshold_min = 0
# Fit one model per importance value, using it as the selection threshold
thresholds = sort(model.feature_importances_)
for thresh in thresholds:
    # select features whose importance is >= thresh
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_X_train = selection.transform(train_set[[k for k in train_set if k not in excluded_features]])
    # train a fresh model on the reduced feature set
    selection_model = RandomForestClassifier()
    selection_model.fit(select_X_train, train_set['label'])
    # evaluate on the test set
    select_X_test = selection.transform(test_set[[k for k in test_set if k not in excluded_features]])
    y_pred = selection_model.predict(select_X_test)
    y_test = test_set['label']
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    fpr, tpr, _ = metrics.roc_curve(y_test, predictions)
    recall = metrics.recall_score(y_test, predictions)
    auc = metrics.auc(fpr, tpr)
    # print('fpr:{0}_tpr:{1}'.format(fpr, tpr))
    cm = metrics.confusion_matrix(y_test, predictions)
    TN = cm[0][0]
    FN = cm[1][0]
    TP = cm[1][1]
    FP = cm[0][1]
    # tpr = TP / (TP + FN)
    fpr = FP / (FP + TN)  # overwrite the curve output with the scalar false-positive rate
    print('AUC:{0},fp:{1},tp:{2},recall:{3},fpr:{4}'.format(round(auc, 5), FP, TP, round(recall, 5), round(fpr, 5)))
    # if accuracy_max > accuracy:
    #     continue
    # else:
    #     accuracy_max = accuracy
    #     n_min = select_X_train.shape[1]
    #     threshold_min = thresh
    print("Thresh=%.3f, n=%d, AUC=%.3f, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1], auc, accuracy * 100.0))
# Inspect the metrics on the validation set to decide how many top features to keep:
# the goal is to reach comparable performance with as few features as possible.
N = 46  # keep the top 46 features
feature_columns = imp_score['score'].index[:N].tolist()
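Once feature_columns is fixed, a minimal sketch of the final refit on just those columns (reusing train_set, test_set and the 'label' column from above):

final_model = RandomForestClassifier(random_state=random_state)
final_model.fit(train_set[feature_columns], train_set['label'])
proba = final_model.predict_proba(test_set[feature_columns])[:, 1]
print(metrics.roc_auc_score(test_set['label'], proba))  # AUC with the reduced feature set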