机器学习算法--python实现采用多数票机制的集成分类器

集成分类器的预测性能通常优于其中任何一个成员分类器。
多数票集成把不同的分类算法及其各自相应的权重组合起来,目标是建立一个更强大的超级分类器,以弥补单个分类器在特定数据集上的弱点。

import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
# from sklearn.externals import six
import six
import sys
sys.modules['sklearn.externals.six'] = six

from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import operator
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from itertools import product
from sklearn.model_selection import GridSearchCV

# Inheriting from BaseEstimator and ClassifierMixin provides basic estimator
# functionality for free: set_params (driven by the get_params defined below)
# and the score method that computes prediction accuracy.
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """ A majority vote ensemble classifier

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
      Different classifiers for the ensemble

    vote : str, {'classlabel', 'probability'} (default='classlabel')
      If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).

    weights : array-like, shape = [n_classifiers], optional (default=None)
      If a list of `int` or `float` values are provided, the classifiers
      are weighted by importance; Uses uniform weights if `weights=None`.

    Attributes
    ----------
    named_classifiers : dict
      Mapping from the auto-generated name of each classifier passed to
      the constructor to the classifier object itself.

    lablenc_ : LabelEncoder
      Encoder fitted on the training labels (set by `fit`).

    classes_ : array-like
      Class labels seen during `fit`, as stored by the label encoder.

    classifiers_ : list
      Fitted clones of `classifiers` (set by `fit`).

    """
    def __init__(self, classifiers, vote='classlabel', weights=None):

        self.classifiers = classifiers
        # _name_estimators yields (name, estimator) pairs; the names are the
        # prefixes used to address the parameters of each member classifier.
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """ Fit classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If `vote` is not 'probability' or 'classlabel', or if `weights`
            is given and its length differs from the number of classifiers.

        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        # Compare against None explicitly: the truth value of a NumPy array
        # is ambiguous, and an empty weights list must not skip this check.
        if (self.weights is not None
                and len(self.weights) != len(self.classifiers)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Use LabelEncoder to ensure class labels start with 0, which
        # is important for np.argmax call in self.predict
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_

        # Encode the labels once; every member is trained on the same target.
        y_encoded = self.lablenc_.transform(y)
        # Fit clones so the estimators passed by the caller stay unfitted.
        self.classifiers_ = [clone(clf).fit(X, y_encoded)
                             for clf in self.classifiers]
        return self

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of samples to classify.

        Returns
        ----------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.

        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            # Collect results from clf.predict calls; after the transpose
            # each row holds the votes of all classifiers for one sample.
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T
            # Per sample: (weighted) count of votes for each encoded label,
            # then pick the label with the largest count.
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map the encoded labels back to the original class labels.
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        ----------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.

        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        # Average over the classifier axis, optionally weighted.
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """ Get classifier parameter names for GridSearch

        Parameters
        ----------
        deep : bool (default=True)
            If False, return only the constructor parameters of this
            estimator. If True, additionally return every member classifier
            under its generated name and each of its parameters under
            '<name>__<parameter>'.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.

        """
        # Always include the estimator's own parameters so that set_params
        # (and therefore grid search) can also address `vote` and `weights`.
        out = super().get_params(deep=False)
        if not deep:
            return out

        # Derive the names from the current `classifiers` so the result stays
        # correct even if `classifiers` was replaced after construction.
        named = dict(_name_estimators(self.classifiers))
        out.update(named)
        for name, step in named.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out

# Load Iris, keep the samples from row 50 onward and only feature
# columns 1 and 2, then re-encode the remaining labels as 0-based integers.
iris = datasets.load_iris()
X = iris.data[50:, [1, 2]]
y = iris.target[50:]
le = LabelEncoder()
y = le.fit(y).transform(y)

# Stratified 50/50 split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=1,
    stratify=y,
)

clf1 = LogisticRegression(penalty='l2', C=0.001, random_state=1)

# The criterion parameter decides how impurity is computed.
# sklearn offers two choices:
#   1) "entropy": use information entropy
#   2) "gini":    use the Gini coefficient
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0)

# metric: the distance metric to use. The default 'minkowski' with p=2
# is the Euclidean metric.
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and KNN are wrapped in pipelines that standardize
# the features first; the decision tree is used without scaling.
pipe1 = Pipeline(steps=[['sc', StandardScaler()],
                        ['clf', clf1]])
pipe3 = Pipeline(steps=[['sc', StandardScaler()],
                        ['clf', clf3]])

clf_labels = ['Logistic regression', 'Decision tree', 'KNN']

# 在将它们构建成集成分类器之前,先在训练集上进行10折交叉验证,以评估每个分类器模型的性能
# print('10-fold cross validation:\n')
# for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
#     scores = cross_val_score(estimator=clf,
#                              X=X_train,
#                              y=y_train,
#                              cv=10,
#                              scoring='roc_auc')
#     print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

# Majority rule (hard) voting: build the ensemble from the three base
# classifiers, fit it on the training split and predict the test split.
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
mv_clf.fit(X_train, y_train)
y_pred_ = mv_clf.predict(X_test)
print("y_pred_:", y_pred_)

# Show both the deep and the shallow parameter views of the ensemble.
print("get_params():", mv_clf.get_params())
print("get_params(deep=False):", mv_clf.get_params(deep=False))

# Register the ensemble next to the base classifiers for the comparisons
# that follow.
clf_labels.append('Majority voting')
all_clf = [pipe1, clf2, pipe3, mv_clf]

# 可以看到,在10折交叉验证评估中,MajorityVoteClassifier的性能优于各个单独的分类器
# for clf, label in zip(all_clf, clf_labels):
#     scores = cross_val_score(estimator=clf,
#                              X=X_train,
#                              y=y_train,
#                              cv=10,
#                              scoring='roc_auc')
#     print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
#           % (scores.mean(), scores.std(), label))

# # Evaluating and tuning the ensemble classifier
# ROC plot: compute the ROC curves on the test data to check whether the
# MajorityVoteClassifier generalizes well to unseen data.
colors = ['black', 'orange', 'blue', 'green']
linestyles = [':', '--', '-.', '-']
for estimator, name, color, style in zip(all_clf, clf_labels,
                                         colors, linestyles):
    estimator.fit(X_train, y_train)
    # assuming the label of the positive class is 1
    positive_scores = estimator.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_true=y_test, y_score=positive_scores)
    area = auc(x=fpr, y=tpr)  # area under the curve
    plt.plot(fpr, tpr,
             color=color,
             linestyle=style,
             label='%s (auc = %0.2f)' % (name, area))

plt.legend(loc='lower right')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1],
         linestyle='--',
         color='gray',
         linewidth=2)

plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)


#plt.savefig('images/07_04', dpi=300)
plt.show()

# Look at the decision regions of the ensemble classifier.
# The ensemble's decision regions appear to be a blend of the decision
# regions of the individual classifiers.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)

all_clf = [pipe1, clf2, pipe3, mv_clf]

# Plot extent: the range of each standardized feature padded by 1.
x_min, x_max = X_train_std[:, 0].min() - 1, X_train_std[:, 0].max() + 1
y_min, y_max = X_train_std[:, 1].min() - 1, X_train_std[:, 1].max() + 1

# Grid with step 0.1 on which every classifier is evaluated.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

f, axarr = plt.subplots(nrows=2, ncols=2,
                        sharex='col',
                        sharey='row',
                        figsize=(7, 5))

# One subplot per classifier, filled in row-major order.
for (row, col), clf, tt in zip(product([0, 1], [0, 1]),
                               all_clf, clf_labels):
    panel = axarr[row, col]

    clf.fit(X_train_std, y_train)

    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(grid_points).reshape(xx.shape)
    panel.contourf(xx, yy, Z, alpha=0.3)

    # Overlay the training samples: class 0 first, then class 1.
    for class_value, colour, shape in ((0, 'blue', '^'),
                                       (1, 'green', 'o')):
        members = y_train == class_value
        panel.scatter(X_train_std[members, 0],
                      X_train_std[members, 1],
                      c=colour,
                      marker=shape,
                      s=50)

    panel.set_title(tt)


# plt.savefig('images/07_05', dpi=300)
plt.show()
# Access the attributes of the individual classifiers.
print(mv_clf.get_params())

# Tune the inverse regularization parameter C of the logistic regression
# and the depth of the decision tree with a grid search.
params = {'decisiontreeclassifier__max_depth': [1, 2],
          'pipeline-1__clf__C': [0.001, 0.1, 100.0]}

grid = GridSearchCV(estimator=mv_clf,
                    param_grid=params,
                    cv=10,
                    scoring='roc_auc')
grid.fit(X_train, y_train)

# After the search, list every hyperparameter combination together with
# the mean ROC AUC over the 10 cross-validation folds. The "+/-" column is
# the plain standard deviation of the fold scores (not half of it), which
# matches the other cross-validation reports in this file.
cv_results = grid.cv_results_
for mean_score, std_score, combo in zip(cv_results['mean_test_score'],
                                        cv_results['std_test_score'],
                                        cv_results['params']):
    print("%0.3f +/- %0.2f %r" % (mean_score, std_score, combo))

print('Best parameters: %s' % grid.best_params_)
# best_score_ is measured with the configured scoring ('roc_auc'), so it is
# reported as ROC AUC rather than accuracy.
print('ROC AUC: %.2f' % grid.best_score_)

运行结果:
y_pred_: [0 0 0 0 0 1 1 1 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 0
1 1 0 1 0 0 1 0 0 1 0 0 0]
get_params(): {‘pipeline-1’: Pipeline(steps=[[‘sc’, StandardScaler()],
[‘clf’, LogisticRegression(C=0.001, random_state=1)]]), ‘decisiontreeclassifier’: DecisionTreeClassifier(criterion=‘entropy’, max_depth=1, random_state=0), ‘pipeline-2’: Pipeline(steps=[[‘sc’, StandardScaler()],
[‘clf’, KNeighborsClassifier(n_neighbors=1)]]), ‘pipeline-1__memory’: None, ‘pipeline-1__steps’: [[‘sc’, StandardScaler()], [‘clf’, LogisticRegression(C=0.001, random_state=1)]], ‘pipeline-1__verbose’: False, ‘pipeline-1__sc’: StandardScaler(), ‘pipeline-1__clf’: LogisticRegression(C=0.001, random_state=1), ‘pipeline-1__sc__copy’: True, ‘pipeline-1__sc__with_mean’: True, ‘pipeline-1__sc__with_std’: True, ‘pipeline-1__clf__C’: 0.001, ‘pipeline-1__clf__class_weight’: None, ‘pipeline-1__clf__dual’: False, ‘pipeline-1__clf__fit_intercept’: True, ‘pipeline-1__clf__intercept_scaling’: 1, ‘pipeline-1__clf__l1_ratio’: None, ‘pipeline-1__clf__max_iter’: 100, ‘pipeline-1__clf__multi_class’: ‘auto’, ‘pipeline-1__clf__n_jobs’: None, ‘pipeline-1__clf__penalty’: ‘l2’, ‘pipeline-1__clf__random_state’: 1, ‘pipeline-1__clf__solver’: ‘lbfgs’, ‘pipeline-1__clf__tol’: 0.0001, ‘pipeline-1__clf__verbose’: 0, ‘pipeline-1__clf__warm_start’: False, ‘decisiontreeclassifier__ccp_alpha’: 0.0, ‘decisiontreeclassifier__class_weight’: None, ‘decisiontreeclassifier__criterion’: ‘entropy’, ‘decisiontreeclassifier__max_depth’: 1, ‘decisiontreeclassifier__max_features’: None, ‘decisiontreeclassifier__max_leaf_nodes’: None, ‘decisiontreeclassifier__min_impurity_decrease’: 0.0, ‘decisiontreeclassifier__min_impurity_split’: None, ‘decisiontreeclassifier__min_samples_leaf’: 1, ‘decisiontreeclassifier__min_samples_split’: 2, ‘decisiontreeclassifier__min_weight_fraction_leaf’: 0.0, ‘decisiontreeclassifier__presort’: ‘deprecated’, ‘decisiontreeclassifier__random_state’: 0, ‘decisiontreeclassifier__splitter’: ‘best’, ‘pipeline-2__memory’: None, ‘pipeline-2__steps’: [[‘sc’, StandardScaler()], [‘clf’, KNeighborsClassifier(n_neighbors=1)]], ‘pipeline-2__verbose’: False, ‘pipeline-2__sc’: StandardScaler(), ‘pipeline-2__clf’: KNeighborsClassifier(n_neighbors=1), ‘pipeline-2__sc__copy’: True, ‘pipeline-2__sc__with_mean’: True, ‘pipeline-2__sc__with_std’: True, ‘pipeline-2__clf__algorithm’: ‘auto’, 
‘pipeline-2__clf__leaf_size’: 30, ‘pipeline-2__clf__metric’: ‘minkowski’, ‘pipeline-2__clf__metric_params’: None, ‘pipeline-2__clf__n_jobs’: None, ‘pipeline-2__clf__n_neighbors’: 1, ‘pipeline-2__clf__p’: 2, ‘pipeline-2__clf__weights’: ‘uniform’}
get_params(deep=False): {‘classifiers’: [Pipeline(steps=[[‘sc’, StandardScaler()],
[‘clf’, LogisticRegression(C=0.001, random_state=1)]]), DecisionTreeClassifier(criterion=‘entropy’, max_depth=1, random_state=0), Pipeline(steps=[[‘sc’, StandardScaler()],
[‘clf’, KNeighborsClassifier(n_neighbors=1)]])], ‘vote’: ‘classlabel’, ‘weights’: None}
{‘pipeline-1’: Pipeline(steps=[(‘sc’, StandardScaler()),
[‘clf’, LogisticRegression(C=0.001, random_state=1)]]), ‘decisiontreeclassifier’: DecisionTreeClassifier(criterion=‘entropy’, max_depth=1, random_state=0), ‘pipeline-2’: Pipeline(steps=[(‘sc’, StandardScaler()),
[‘clf’, KNeighborsClassifier(n_neighbors=1)]]), ‘pipeline-1__memory’: None, ‘pipeline-1__steps’: [(‘sc’, StandardScaler()), [‘clf’, LogisticRegression(C=0.001, random_state=1)]], ‘pipeline-1__verbose’: False, ‘pipeline-1__sc’: StandardScaler(), ‘pipeline-1__clf’: LogisticRegression(C=0.001, random_state=1), ‘pipeline-1__sc__copy’: True, ‘pipeline-1__sc__with_mean’: True, ‘pipeline-1__sc__with_std’: True, ‘pipeline-1__clf__C’: 0.001, ‘pipeline-1__clf__class_weight’: None, ‘pipeline-1__clf__dual’: False, ‘pipeline-1__clf__fit_intercept’: True, ‘pipeline-1__clf__intercept_scaling’: 1, ‘pipeline-1__clf__l1_ratio’: None, ‘pipeline-1__clf__max_iter’: 100, ‘pipeline-1__clf__multi_class’: ‘auto’, ‘pipeline-1__clf__n_jobs’: None, ‘pipeline-1__clf__penalty’: ‘l2’, ‘pipeline-1__clf__random_state’: 1, ‘pipeline-1__clf__solver’: ‘lbfgs’, ‘pipeline-1__clf__tol’: 0.0001, ‘pipeline-1__clf__verbose’: 0, ‘pipeline-1__clf__warm_start’: False, ‘decisiontreeclassifier__ccp_alpha’: 0.0, ‘decisiontreeclassifier__class_weight’: None, ‘decisiontreeclassifier__criterion’: ‘entropy’, ‘decisiontreeclassifier__max_depth’: 1, ‘decisiontreeclassifier__max_features’: None, ‘decisiontreeclassifier__max_leaf_nodes’: None, ‘decisiontreeclassifier__min_impurity_decrease’: 0.0, ‘decisiontreeclassifier__min_impurity_split’: None, ‘decisiontreeclassifier__min_samples_leaf’: 1, ‘decisiontreeclassifier__min_samples_split’: 2, ‘decisiontreeclassifier__min_weight_fraction_leaf’: 0.0, ‘decisiontreeclassifier__presort’: ‘deprecated’, ‘decisiontreeclassifier__random_state’: 0, ‘decisiontreeclassifier__splitter’: ‘best’, ‘pipeline-2__memory’: None, ‘pipeline-2__steps’: [(‘sc’, StandardScaler()), [‘clf’, KNeighborsClassifier(n_neighbors=1)]], ‘pipeline-2__verbose’: False, ‘pipeline-2__sc’: StandardScaler(), ‘pipeline-2__clf’: KNeighborsClassifier(n_neighbors=1), ‘pipeline-2__sc__copy’: True, ‘pipeline-2__sc__with_mean’: True, ‘pipeline-2__sc__with_std’: True, ‘pipeline-2__clf__algorithm’: ‘auto’, 
‘pipeline-2__clf__leaf_size’: 30, ‘pipeline-2__clf__metric’: ‘minkowski’, ‘pipeline-2__clf__metric_params’: None, ‘pipeline-2__clf__n_jobs’: None, ‘pipeline-2__clf__n_neighbors’: 1, ‘pipeline-2__clf__p’: 2, ‘pipeline-2__clf__weights’: ‘uniform’}
0.983 +/- 0.02 {‘decisiontreeclassifier__max_depth’: 1, ‘pipeline-1__clf__C’: 0.001}
0.983 +/- 0.02 {‘decisiontreeclassifier__max_depth’: 1, ‘pipeline-1__clf__C’: 0.1}
0.967 +/- 0.05 {‘decisiontreeclassifier__max_depth’: 1, ‘pipeline-1__clf__C’: 100.0}
0.983 +/- 0.02 {‘decisiontreeclassifier__max_depth’: 2, ‘pipeline-1__clf__C’: 0.001}
0.983 +/- 0.02 {‘decisiontreeclassifier__max_depth’: 2, ‘pipeline-1__clf__C’: 0.1}
0.967 +/- 0.05 {‘decisiontreeclassifier__max_depth’: 2, ‘pipeline-1__clf__C’: 100.0}
Best parameters: {‘decisiontreeclassifier__max_depth’: 1, ‘pipeline-1__clf__C’: 0.001}
Accuracy: 0.98

备注:scikit-learn实现了更复杂的多数票分类器。可以从scikit-learn 0.17 或者更新版本中找到sklearn.ensemble.VotingClassifier。

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值