# Implementing a simple majority vote classifier

We start with a warm-up exercise and implement a simple ensemble classifier for majority voting in Python.

Although the following algorithm also generalizes to multi-class settings via plurality voting, we will use the term majority voting for simplicity, as is often done in the literature.

1. Averaged prediction

# Weighted class-label vote: labels 0, 0, 1 carry weights 0.2, 0.2, 0.6,
# so label 1 collects 0.6 against 0.4 for label 0.
np.bincount([0, 0, 1], weights=[0.2, 0.2, 0.6]).argmax()

# Three rows, one per weight below; two columns, one per class.
ex = np.array([[0.9, 0.1], [0.8, 0.2], [0.4, 0.6]])

# Weighted mean down the rows leaves one averaged value per column.
p = np.average(ex, weights=[0.2, 0.2, 0.6], axis=0)
print('Averaged prediction', p)
print('np.argmax(p): ', p.argmax())

('Averaged prediction', array([ 0.58,  0.42]))
('np.argmax(p): ', 0)

2. Implement a MajorityVoteClassifier

from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    """A majority vote ensemble classifier.

    Parameters
    ----------
    classifiers : array-like, shape = [n_classifiers]
        Different classifiers for the ensemble.

    vote : str, {'classlabel', 'probability'} (default='classlabel')
        If 'classlabel' the prediction is based on the argmax of
        class labels. Else if 'probability', the argmax of
        the sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).

    weights : array-like, shape = [n_classifiers], optional (default=None)
        If a list of int or float values are provided, the classifiers
        are weighted by importance; uses uniform weights if weights=None.

    """

    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        # Name -> estimator mapping; get_params uses these names as the
        # prefix of the nested '<name>__<param>' keys.
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """Fit a clone of every classifier on the label-encoded targets.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of training samples.

        y : array-like, shape = [n_samples]
            Vector of target class labels.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If `vote` is not 'probability' or 'classlabel', or if
            `weights` is given and its length differs from the number
            of classifiers.

        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        # Compare against None explicitly: the truth value of a NumPy
        # array is ambiguous, and an empty list must not skip the check.
        if (self.weights is not None
                and len(self.weights) != len(self.classifiers)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Encode labels as 0..n_classes-1 so that the indices produced by
        # np.argmax / np.bincount in predict can be mapped back to the
        # original labels with inverse_transform.
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_

        # The encoded targets are identical for every classifier.
        y_encoded = self.lablenc_.transform(y)
        self.classifiers_ = []
        for clf in self.classifiers:
            # Fit a clone so the estimators passed in stay unfitted.
            fitted_clf = clone(clf).fit(X, y_encoded)
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Matrix of samples to classify.

        Returns
        -------
        maj_vote : array-like, shape = [n_samples]
            Predicted class labels.

        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote
            # Collect results from clf.predict calls; after the transpose
            # each row holds the votes of all classifiers for one sample.
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            # Per sample: sum the weights per encoded label, keep the winner.
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Both branches yield encoded labels; map back to the originals.
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Sample vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        avg_proba : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.

        """
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        # Average over the classifier axis; weights=None means uniform.
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba

    def get_params(self, deep=True):
        """Get classifier parameter names for GridSearch.

        With deep=False the result of BaseEstimator.get_params is
        returned. With deep=True the result maps every classifier name to
        its estimator and every '<name>__<param>' key to that estimator's
        parameter value.
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep=False)

        out = self.named_classifiers.copy()
        # dict.items() works on Python 2 and 3, so six is not needed here.
        for name, step in self.named_classifiers.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out
3. Combining different algorithms for classification with majority vote
3.1 Split the Iris samples into 50 percent training and 50 percent test data

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
iris = datasets.load_iris()
X, y = iris.data[50:, [1, 2]], iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.5, random_state=1)

3.2 Now we will train three different classifiers—a logistic regression classifier, a decision tree classifier, and a k-nearest neighbors classifier—and look at their individual performances via 10-fold cross-validation on the training dataset before we combine them into an ensemble classifier.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# The three base learners that are evaluated individually below.
clf1 = LogisticRegression(penalty='l2',
                          C=0.001,
                          random_state=0)

clf2 = DecisionTreeClassifier(max_depth=1,
                              criterion='entropy',
                              random_state=0)

clf3 = KNeighborsClassifier(n_neighbors=1,
                            p=2,
                            metric='minkowski')

# Logistic regression and KNN are wrapped with a StandardScaler step;
# the decision tree is used without one.
pipe1 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()],
                  ['clf', clf3]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

# Report mean and standard deviation of the ROC AUC over the 10 folds.
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
10-fold cross validation:
ROC AUC: 0.92 (+/- 0.20) [Logistic Regression]
ROC AUC: 0.92 (+/- 0.15) [Decision Tree]
ROC AUC: 0.93 (+/- 0.10) [KNN]

3.3 Now we move on to the more exciting part and combine the individual classifiers for majority rule voting in our MajorityVoteClassifier.

# Ensemble of the three base learners; vote and weights keep their defaults.
mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])

# Extend the label list so the ensemble lines up with all_clf in zip.
clf_labels += ['Majority Voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

# Same 10-fold ROC AUC evaluation, now including the ensemble.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))

ROC AUC: 0.92 (+/- 0.20) [Logistic Regression]
ROC AUC: 0.92 (+/- 0.15) [Decision Tree]
ROC AUC: 0.93 (+/- 0.10) [KNN]
ROC AUC: 0.97 (+/- 0.10) [Majority Voting]

Conclusion: As we can see, the performance of the MajorityVoteClassifier has substantially improved over the individual classifiers in the 10-fold cross-validation evaluation.

Reference：《Python Machine Learning》

• 本文已收录于以下专栏：

举报原因： 您举报文章：Implementing a simple majority vote classifer 色情 政治 抄袭 广告 招聘 骂人 其他 (最多只允许输入30个字)