Ensemble performance graph
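Why an ensemble can beat its members: assume n independent base classifiers, each with the same error rate ε. A majority vote is wrong only when more than half of them are wrong, so (a standard binomial argument, which is exactly what the code below computes) the ensemble error is

\varepsilon_{\text{ensemble}} = \sum_{k=\lceil n/2 \rceil}^{n} \binom{n}{k}\,\varepsilon^{k}\,(1-\varepsilon)^{n-k}

which stays below ε as long as ε < 0.5 — the crossover visible in the plot.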
from scipy.special import comb
import math

def ensemble_error(n_classifier, error):
    # smallest number of wrong classifiers that still loses the majority vote
    k_start = int(math.ceil(n_classifier / 2.))
    probs = [comb(n_classifier, k) * error**k * (1 - error)**(n_classifier - k)
             for k in range(k_start, n_classifier + 1)]
    return sum(probs)

import numpy as np
error_range = np.arange(0.0, 1.01, 0.01)
ens_errors = [ensemble_error(n_classifier=100, error=error)
              for error in error_range]

import matplotlib.pyplot as plt
plt.plot(error_range, ens_errors, label='Ensemble error', linewidth=2)
plt.plot(error_range, error_range, linestyle='--', label='Base error', linewidth=2)
plt.xlabel('Base error')
plt.ylabel('Base/Ensemble error')
plt.legend(loc='upper left')
plt.grid(alpha=0.5)
plt.show()
MajorityVoteClassifier
Implementing it ourselves
clf is short for classifier.
Estimator: in sklearn, every machine learning model is an estimator, i.e. it inherits from BaseEstimator (possibly through multiple inheritance).
To write your own classifier, you need to follow this convention.
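As a rough illustration of that convention (a minimal sketch; the class name MostFrequentClassifier and its dummy_param are made up for this example), a custom classifier subclasses BaseEstimator and ClassifierMixin, stores its hyperparameters unchanged in __init__, and exposes fit/predict. The full MajorityVoteClassifier below follows the same pattern.

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class MostFrequentClassifier(BaseEstimator, ClassifierMixin):
    """Toy classifier that always predicts the most frequent training label."""
    def __init__(self, dummy_param=None):
        # __init__ only stores hyperparameters; no computation happens here
        self.dummy_param = dummy_param

    def fit(self, X, y):
        # attributes learned from data get a trailing underscore
        labels, counts = np.unique(y, return_counts=True)
        self.classes_ = labels
        self.majority_ = labels[np.argmax(counts)]
        return self

    def predict(self, X):
        return np.full(len(X), self.majority_)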
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler


class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))
        # use LabelEncoder so class labels start at 0, as np.argmax expects
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1, arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba


iris = datasets.load_iris()
X = iris.data[50:, [1, 2]]
y = iris.target[50:]
le = LabelEncoder()
y = le.fit_transform(y)  # y was 1 and 2; after encoding it becomes 0 and 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

clf1 = LogisticRegression(penalty='l2', solver='lbfgs', C=0.001, random_state=0)
clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])
clf_labels = ['Logistic regression', 'Decision tree', 'KNN']

mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
clf_labels += ['Majority voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

print('10-fold cross validation:')
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf, X=X_train, y=y_train,
                             cv=10, scoring='roc_auc')
    print("ROC AUC: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
VotingClassifier
from sklearn.datasets import make_moons
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X, y = make_moons(n_samples=300, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)  # soft voting needs predict_proba

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
Bagging and Pasting
The VotingClassifier above builds an ensemble out of different kinds of classifiers.
Bagging and pasting work somewhat differently: every predictor in the ensemble is the same kind of model, but each predictor is trained on a different training subset drawn from the sample set. The way those subsets are drawn is exactly the difference between bagging and pasting:
- bagging samples with replacement
- pasting samples without replacement
In other words, both bagging and pasting allow training instances to be sampled several times across different predictors, but only bagging (because it samples with replacement) allows the same instance to appear several times in the subset used by a single predictor; a minimal sketch of the two sampling schemes follows.
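The sketch below (illustrative only; the index arrays are made up) shows the two sampling schemes, which is what BaggingClassifier's bootstrap parameter switches between.

import numpy as np

rng = np.random.default_rng(0)
indices = np.arange(10)  # pretend these are the row indices of the training set

# bagging: sample WITH replacement -> the same index can appear several times
bagging_idx = rng.choice(indices, size=8, replace=True)

# pasting: sample WITHOUT replacement -> each index appears at most once per subset
pasting_idx = rng.choice(indices, size=8, replace=False)

print('bagging subset:', np.sort(bagging_idx))
print('pasting subset:', np.sort(pasting_idx))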
We can control the subset size and the fraction of features used for training through the parameters max_samples and max_features:
- max_samples: int or float, optional (default=1.0). The number (or fraction) of samples drawn from X to train each base estimator.
- max_features: int or float, optional (default=1.0). The number (or fraction) of features drawn from X to train each base estimator.
- bootstrap: bool, optional (default=True). Whether samples are drawn with replacement; if False, sampling without replacement (pasting) is performed.

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt

df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
                      header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=None, random_state=1)
bag = BaggingClassifier(base_estimator=tree, n_estimators=500,
                        max_samples=1.0, max_features=1.0,
                        bootstrap=True, bootstrap_features=False,
                        n_jobs=1, random_state=1)

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print('Bagging train/test accuracies %.3f/%.3f' % (bag_train, bag_test))

# everything below is visualization
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(nrows=1, ncols=2, sharex='col', sharey='row', figsize=(8, 3))
# at this point f and axarr are just an empty canvas; plt.show() here would be blank
for idx, clf, tt in zip([0, 1], [tree, bag], ['Decision tree', 'Bagging']):
    clf.fit(X_train, y_train)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1],
                       c='blue', marker='^')
    axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1],
                       c='green', marker='o')
    axarr[idx].set_title(tt)
axarr[0].set_ylabel('Alcohol', fontsize=12)
plt.text(10.2, -0.5, s='OD280/OD315', ha='center', va='center', fontsize=12)
plt.tight_layout()
plt.show()
Boosting
Boosting can be used for both classification and regression.
Boosting is an iterative procedure: weak learners are trained one after another, each one compensating for the "shortcomings" of its predecessor, so that together they form a strong learner.
Common models include AdaBoost, Gradient Boosting (GBT/GBDT/GBRT), XGBoost, and LightGBM.
The most commonly used are AdaBoost and Gradient Boosting.
AdaBoost
When training an AdaBoost classifier, the algorithm first trains a base classifier (for example a decision tree) and uses it to make predictions on the training set. It then increases the relative weight of the misclassified training instances, trains a second classifier with the updated weights, makes predictions on the training set again, updates the instance weights, and so on.
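The reweighting idea can be sketched in a few lines (a simplified illustration for a binary problem with labels ±1, not the exact SAMME variant that scikit-learn implements; the function name and toy arrays are made up):

import numpy as np

def adaboost_weight_update(sample_weights, y_true, y_pred):
    """One AdaBoost round: compute the learner's vote weight and reweight the samples.

    y_true and y_pred are arrays of {-1, +1}; sample_weights sums to 1.
    """
    miss = (y_pred != y_true)
    err = np.sum(sample_weights[miss])            # weighted error of this learner
    alpha = 0.5 * np.log((1 - err) / err)         # weight of this learner in the final vote
    # misclassified samples get heavier, correctly classified ones get lighter
    new_weights = sample_weights * np.exp(-alpha * y_true * y_pred)
    return alpha, new_weights / new_weights.sum()  # renormalize

# toy usage: 5 samples, the learner gets the last two wrong
w = np.full(5, 0.2)
y = np.array([1, 1, -1, -1, 1])
pred = np.array([1, 1, -1, 1, -1])
alpha, w = adaboost_weight_update(w, y, pred)
print(alpha, w)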
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
                      header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=1)
ada = AdaBoostClassifier(base_estimator=tree, n_estimators=500,
                         learning_rate=0.1, random_state=1)

tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print('Decision tree train/test accuracies %.3f/%.3f' % (tree_train, tree_test))

ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print('AdaBoost train/test accuracies %.3f/%.3f' % (ada_train, ada_test))

# everything below is visualization
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(1, 2, sharex='col', sharey='row', figsize=(8, 3))
for idx, clf, tt in zip([0, 1], [tree, ada], ['Decision tree', 'AdaBoost']):
    clf.fit(X_train, y_train)
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1],
                       c='blue', marker='^')
    axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1],
                       c='green', marker='o')
    axarr[idx].set_title(tt)
axarr[0].set_ylabel('Alcohol', fontsize=12)
plt.text(10.2, -0.5, s='OD280/OD315 of diluted wines',
         ha='center', va='center', fontsize=12)
plt.tight_layout()
plt.show()
Gradient Boosting
Another very popular boosting algorithm is Gradient Boosting.
Just like AdaBoost, Gradient Boosting works by sequentially adding predictors to an ensemble, each one correcting its predecessor. However, instead of tweaking the instance weights at every iteration like AdaBoost does, this method tries to fit the new predictor to the residual errors made by the previous predictor.
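In equation form (for squared-error loss, where the residuals are exactly the negative gradient of the loss), each stage m fits a new tree h_m to the residuals of the current ensemble F_{m-1} and adds it with a learning rate η:

F_m(x) = F_{m-1}(x) + \eta\, h_m(x), \qquad h_m \text{ fitted to } \; y_i - F_{m-1}(x_i).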
Let's go through a simple regression example, using Decision Trees as the base predictors (of course, Gradient Boosting also works great with classification tasks, as shown further below). This is called Gradient Tree Boosting, or Gradient Boosted Regression Trees (GBRT).
Likewise, GBDT is available for classification as GradientBoostingClassifier (see below).
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

df_wine = pd.read_csv('dataset/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

y2 = y_train - tree_reg1.predict(X_train)   # residuals of the first tree
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

y3 = y2 - tree_reg2.predict(X_train)        # residuals of the second tree
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

# sum() adds the three trees' predictions element-wise
y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
print(y_pred)
print(y_test)
GradientBoostingRegressor in sklearn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

df_wine = pd.read_csv('dataset/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
print(y_pred)
GradientBoostingClassifier in sklearn
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

df_wine = pd.read_csv('dataset/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X_train, y_train)
y_pred = gbrt.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
XGBoost
XGBoost = Extreme Gradient Boosting.
XGBoost is an improved version of Gradient Boosting: an efficient implementation of GBDT.
Regressor performance can be evaluated with metrics such as MSE and MAE.
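For reference, the two metrics mentioned are:

\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2, \qquad \mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert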
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost

df_wine = pd.read_csv('dataset/wine.data', header=None)
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash',
                   'Alcalinity of ash', 'Magnesium', 'Total phenols',
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
                   'Color intensity', 'Hue',
                   'OD280/OD315 of diluted wines', 'Proline']
df_wine = df_wine[df_wine['Class label'] != 1]
y = df_wine['Class label'].values
X = df_wine[['Alcohol', 'OD280/OD315 of diluted wines']].values
le = LabelEncoder()
y = le.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1, stratify=y)

xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_test)
print(y_pred)
print(mean_squared_error(y_test, y_pred))
LightGBM
LightGBM is a newer member of the boosting family. Like XGBoost, it is an efficient implementation of GBDT, and in many respects it performs better than XGBoost. In principle it is similar to GBDT and XGBoost: it uses the negative gradient of the loss function as an approximation of the current residuals and fits a new decision tree to it.
LightGBM is essentially a gradient-boosted decision tree (GBDT) that adds the GOSS (Gradient-based One-Side Sampling) and EFB (Exclusive Feature Bundling) algorithms.
LightGBM is a popular algorithm in Kaggle competitions.
The lightgbm library is used somewhat differently from sklearn's models (see the native API below and the sklearn-style wrapper after it).
import matplotlib.pylab as plt
import seaborn as sns
import lightgbm as lgb
import pandas as pd
import numpy as np

data_df = pd.read_csv('train.csv')
label = data_df['TARGET']
feature = data_df.drop(['TARGET', 'ID'], axis=1)

data_test = pd.read_csv('test.csv')
data_test_ID = data_test['ID']
data_test_feature = data_test.drop(['ID'], axis=1)

feature_all = pd.concat([feature, data_test_feature])
feature_all = pd.get_dummies(feature_all, dummy_na=True, columns=None)
feature_train = feature_all.iloc[:len(feature), :]
feature_test = feature_all.iloc[len(feature):]


# train the model
def train_model(data_X, data_y):
    from sklearn.model_selection import train_test_split
    X_train, x_test, Y_train, y_test = train_test_split(data_X, data_y,
                                                        test_size=0.2,
                                                        random_state=3)
    # convert to LightGBM's Dataset format, which loads faster
    lgb_train = lgb.Dataset(X_train, label=Y_train)
    lgb_eval = lgb.Dataset(x_test, label=y_test, reference=lgb_train)

    parameters = {
        'task': 'train',
        'max_depth': 15,
        'boosting_type': 'gbdt',
        'num_leaves': 20,         # number of leaves
        'n_estimators': 50,
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.2,
        'feature_fraction': 0.7,  # < 1.0: LightGBM randomly selects part of the features on each iteration
        'bagging_fraction': 1,    # like feature_fraction, but randomly selects part of the data without resampling
        'bagging_freq': 3,        # bagging frequency; 0 disables bagging, k performs bagging every k iterations
        'lambda_l1': 0.5,
        'lambda_l2': 0,
        'cat_smooth': 10,         # for categorical features; reduces the effect of noise, especially for rare categories
        'is_unbalance': False,    # for binary classification; setting it to True here lowered the score by 3 points
        'verbose': 0
    }

    evals_result = {}  # records the training results
    gbm_model = lgb.train(parameters,
                          lgb_train,
                          valid_sets=[lgb_train, lgb_eval],
                          num_boost_round=50,        # number of boosting iterations
                          early_stopping_rounds=5,
                          evals_result=evals_result,
                          verbose_eval=10)

    prediction = gbm_model.predict(x_test, num_iteration=gbm_model.best_iteration)

    from sklearn.metrics import roc_auc_score
    roc_auc_score = roc_auc_score(y_test, prediction)
    print(roc_auc_score)
    return gbm_model, evals_result


model, evals_result = train_model(feature_train, label)
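As an aside, lightgbm also ships a scikit-learn-compatible wrapper (lgb.LGBMClassifier), so the same boosting engine can be used with the familiar fit/predict interface. A minimal sketch (the hyperparameter values are arbitrary, and train.csv is the same hypothetical file as above):

import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

data_df = pd.read_csv('train.csv')
y = data_df['TARGET']
X = pd.get_dummies(data_df.drop(['TARGET', 'ID'], axis=1), dummy_na=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

# sklearn-style estimator wrapping the same GBDT engine
clf = lgb.LGBMClassifier(num_leaves=20, n_estimators=50, learning_rate=0.2)
clf.fit(X_train, y_train)
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))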
https://www.cnblogs.com/lmcltj/p/11106336.html
Stacking
Stacking: after training several base learners on the initial training data, their predictions are used as a new training set on which a new (meta) learner is trained.
Stacking is also known as model fusion.
At the time these notes were written, sklearn did not support stacking directly, so the example below uses mlxtend's StackingClassifier (newer sklearn versions do ship a built-in stacking estimator; see the sketch after the example).
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import numpy as np

clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          meta_classifier=lr)

print('3-fold cross validation:\n')
for clf, label in zip([clf1, clf2, clf3, sclf],
                      ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):
    scores = model_selection.cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), label))
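For completeness: scikit-learn 0.22 and later include a built-in stacking estimator, sklearn.ensemble.StackingClassifier. A minimal sketch of the equivalent setup on the same data (assuming a recent sklearn version is installed):

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# base learners are listed as (name, estimator) pairs; the meta learner is final_estimator
stack = StackingClassifier(
    estimators=[('knn', KNeighborsClassifier(n_neighbors=1)),
                ('rf', RandomForestClassifier(random_state=1)),
                ('gnb', GaussianNB())],
    final_estimator=LogisticRegression())

scores = cross_val_score(stack, X, y, cv=3, scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f) [sklearn StackingClassifier]" % (scores.mean(), scores.std()))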
https://zhuanlan.zhihu.com/p/61705517