origin from: datawhale
投票法(以KNN为例)
简介
投票法是一种少数服从多数原则的集成学习模型。通过多个模型的集成降低方差,从而提高模型的鲁棒性。理想情况下,好的预测效果应当优于任何一个基模型的预测效果。对于回归模型来说,投票法最终的预测结果是多个其他回归模型预测结果的平均值。对于分类模型,硬投票法预测结果是多个模型预测结果中出现次数最多的类别;软投票对各类预测结果的概率进行求和,最终选取概率之和最大的类标签。
在投票法中,我们还需要考虑到不同的基模型可能产生的影响。理论上,基模型可以是任何已被训练好的模型。但在实际应用上,想要投票法产生较好的结果,需要满足两个条件:
基模型之间的效果不能差别过大。当某个基模型相对于其他基模型效果过差时,该模型很可能成为噪声。
基模型之间应该有较小的同质性。例如在基模型预测效果近似的情况下,基于树模型与线性模型的投票,往往优于两个树模型或两个线性模型。
当投票合集中使用的模型能预测出清晰的类别标签时,适合使用硬投票。当投票集合中使用的模型能预测类别的概率时,适合使用软投票。软投票同样可以用于那些本身并不预测类成员概率的模型,只要它们可以输出类似于概率的预测分数值(例如支持向量机、k-最近邻和决策树)。
投票法的局限性在于,它对所有模型的处理是一样的,这意味着所有模型对预测的贡献是一样的。如果一些模型在某些情况下很好,而在其他情况下很差,这是使用投票法时需要考虑到的一个问题。
案例
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#创建一个1000个样本,20个特征的随机数据集
from sklearn.datasets import make_classification
def get_dataset():
    """Create a synthetic binary-classification dataset.

    Returns:
        (x, y): 1000 samples with 20 features (15 informative, 5 redundant)
        and their class labels. random_state=2 makes the data reproducible.
    """
    x, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                               n_redundant=5, random_state=2)
    print(x.shape, y.shape)  # quick sanity check of the generated shapes
    return x, y
#使用多个KNN模型作为基模型演示投票法,其中每个模型采用不同的邻居值K参数:
from sklearn.neighbors import KNeighborsClassifier
def get_voting():
    """Build a hard-voting ensemble of five KNN base models (k = 1, 3, 5, 7, 9).

    Hard voting predicts the class label chosen by the majority of the
    base classifiers.
    """
    # define the base models
    models = list()
    models.append(('knn1', KNeighborsClassifier(n_neighbors=1)))
    models.append(('knn3', KNeighborsClassifier(n_neighbors=3)))
    models.append(('knn5', KNeighborsClassifier(n_neighbors=5)))
    models.append(('knn7', KNeighborsClassifier(n_neighbors=7)))
    models.append(('knn9', KNeighborsClassifier(n_neighbors=9)))
    # define the voting ensemble
    ensemble = VotingClassifier(estimators=models, voting='hard')
    return ensemble
#创建一个模型列表来评估投票带来的提升,包括KNN模型配置的每个独立版本和硬投票模型。下面的get_models()函数可以为我们创建模型列表进行评估。
# get a list of models to evaluate
def get_models():
    """Return the models to evaluate, keyed by display name.

    Includes each standalone KNN configuration plus the hard-voting
    ensemble from get_voting(), so their CV scores can be compared.
    """
    models = dict()
    models['knn1'] = KNeighborsClassifier(n_neighbors=1)
    models['knn3'] = KNeighborsClassifier(n_neighbors=3)
    models['knn5'] = KNeighborsClassifier(n_neighbors=5)
    models['knn7'] = KNeighborsClassifier(n_neighbors=7)
    models['knn9'] = KNeighborsClassifier(n_neighbors=9)
    models['hard_voting'] = get_voting()
    return models
#下面的evaluate_model()函数接收一个模型实例,并以三次重复的分层10折交叉验证所得分数列表的形式返回评估结果。
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# evaluate a given model using cross-validation
def evaluate_model(model, x, y):
    """Score `model` on (x, y) with accuracy under repeated stratified CV.

    Uses stratified 10-fold cross-validation repeated 3 times
    (30 scores total); random_state=1 keeps the folds reproducible.
    error_score='raise' surfaces fit failures instead of masking them as NaN.
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores
#报告每个算法的平均性能,还可以创建一个箱形图和须状图来比较每个算法的精度分数分布。
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy import stats
# Evaluate every model (individual KNNs and the voting ensemble), report
# mean/std accuracy, and compare the score distributions with a box plot.
x, y = get_dataset()
models = get_models()
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, x, y)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
plt.boxplot(results, labels=names, showmeans=True)
plt.show()
KNN算法实现
import numpy as np
import operator
from os import listdir
def classify(inX, dataSet, labels, k):
    """k-nearest-neighbors classification of a single observation.

    Args:
        inX: the observation to classify (1-D, same feature count as dataSet).
        dataSet: 2-D numpy array of training samples, one row per sample.
        labels: sequence of class labels aligned with dataSet's rows.
        k: number of nearest neighbors that vote.

    Returns:
        The label occurring most often among the k nearest neighbors
        (Euclidean distance).
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # broadcast inX against every row and take squared Euclidean distance
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    sortedDistIndices = distances.argsort()  # indices sorted by distance, nearest first
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sort the vote counts descending; winner is the first entry
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
Bagging
简介
Bagging的核心在于自助采样——有放回地从数据集中进行采样,即同一个样本可能被多次采样。
Bagging的基本流程:首先随机取出一个样本,放入采样集合中,再把这个样本放回初始数据集,重复K次采样获得一个大小为K的样本集合。然后用同样的方法获得T个含有K个样本的采样集合,基于每个采样集合训练出一个基学习器,再将这些基学习器进行结合。
对回归问题的预测是通过预测取平均值来进行的。对于分类问题的预测是通过对预测取多数票预测来进行的。Bagging方法之所以有效,是因为每个模型都是在略微不同的训练数据集上拟合完成的,这又使得每个基模型之间存在略微的差异,使每个基模型拥有略微不同的训练能力。
Bagging同样是一种降低方差的技术,因此它在不剪枝决策树、神经网络等易受样本扰动的学习器上效果更加明显。在实际的使用中,加入列采样的Bagging技术对高维小样本往往有神奇的效果。
下图是单一随机森林和基于Bagging的随机森林效果比较
案例
#sklearn里实现
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble: 500 decision trees, each fit on 100 rows drawn with
# replacement (bootstrap=True); n_jobs=-1 trains the trees in parallel.
bag_clf=BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1)
# NOTE(review): X_train/y_train/X_test are not defined anywhere in this file --
# presumably produced by an earlier train/test split; confirm before running.
bag_clf.fit(X_train,y_train)
y_pred=bag_clf.predict(X_test)
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
# define dataset: 1000 samples, 20 features (15 informative, 5 redundant)
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=5)
# define the model (BaggingClassifier with its default base estimator)
model = BaggingClassifier()
# evaluate the model with stratified 10-fold CV repeated 3 times (30 scores)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance: mean accuracy and its standard deviation across folds
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
Accuracy: 0.855 (0.033)
随机森林算法实现
参考链接:https://blog.csdn.net/LJBlog2014/article/details/39737471
from __future__ import division
import numpy as np
import math
class node:
    """A node of a CART decision tree.

    Internal nodes carry a split (col, value) plus two children; leaf nodes
    carry `results`, a {label: count} mapping of the training samples that
    reached them.
    """

    def __init__(self, col=-1, value=None, results=None, trueBranch=None, falseBranch=None):
        self.col = col              # feature index used for the split (-1 for leaves)
        self.value = value          # split threshold (numeric) or category value
        self.results = results      # leaf only: {label: count}; None for internal nodes
        self.trueBranch = trueBranch    # child for rows satisfying the split
        self.falseBranch = falseBranch  # child for rows failing the split

    def getLabel(self):
        """Return the majority label of a leaf, or None for internal nodes."""
        if self.results is None:    # `is None`, not `== None` (identity check)
            return None
        label = None                # guards against an empty results dict
        max_counts = 0
        for key in self.results:
            if self.results[key] > max_counts:
                label = key
                max_counts = self.results[key]
        return label
class RandomForestsClassifier:
    """Random forest of CART trees built on bootstrap samples.

    Each training row is [feature_0, ..., feature_{m-1}, label] -- the class
    label is the LAST element. Each tree node considers a random subset of
    ceil(sqrt(m)) features, as in the standard random-forest algorithm.
    """

    def __init__(self, n_bootstrapSamples=20):
        self.n_bootstrapSamples = n_bootstrapSamples  # number of trees to grow
        self.list_tree = []  # fitted trees, filled by fit()

    def divideSet(self, samples, column, value):
        """Split rows on `column`: numeric values use >= value, others use == value."""
        if isinstance(value, int) or isinstance(value, float):
            splitFunction = lambda row: row[column] >= value
        else:
            splitFunction = lambda row: row[column] == value
        set1 = [row for row in samples if splitFunction(row)]
        set2 = [row for row in samples if not splitFunction(row)]
        return (set1, set2)

    def uniqueCounts(self, samples):
        """Count class labels (last element of each row): {label: count}."""
        results = {}
        for row in samples:
            r = row[len(row) - 1]
            if r not in results:
                results[r] = 0
            results[r] = results[r] + 1
        return results

    def giniEstimate(self, samples):
        """Gini impurity of the label distribution; 0 for an empty sample set."""
        if len(samples) == 0:
            return 0
        total = len(samples)
        counts = self.uniqueCounts(samples)
        gini = 0
        for target in counts:
            gini = gini + pow(counts[target], 2)
        gini = 1 - gini / pow(total, 2)
        return gini

    def buildTree(self, samples):  # build a CART decision tree
        """Recursively grow a CART tree, choosing the split with the best Gini gain
        among a random ceil(sqrt(m)) subset of the features."""
        if len(samples) == 0:
            return node()
        currentGini = self.giniEstimate(samples)
        bestGain = 0
        bestCriteria = None
        bestSets = None
        colCount = len(samples[0]) - 1
        # range() objects cannot be shuffled in Python 3 -- materialize a list first
        colRange = list(range(colCount))
        np.random.shuffle(colRange)
        for col in colRange[0:int(math.ceil(math.sqrt(colCount)))]:
            colValues = {}  # distinct values of this feature among the samples
            for row in samples:
                colValues[row[col]] = 1
            for value in colValues.keys():
                (set1, set2) = self.divideSet(samples, col, value)
                gain = currentGini - (len(set1) * self.giniEstimate(set1) + len(set2) * self.giniEstimate(set2)) / len(samples)
                # require both children non-empty so recursion always shrinks
                if gain > bestGain and len(set1) > 0 and len(set2) > 0:
                    bestGain = gain
                    bestCriteria = (col, value)
                    bestSets = (set1, set2)
        if bestGain > 0:
            trueBranch = self.buildTree(bestSets[0])
            falseBranch = self.buildTree(bestSets[1])
            return node(col=bestCriteria[0], value=bestCriteria[1], trueBranch=trueBranch, falseBranch=falseBranch)
        else:
            # no useful split: make a leaf holding the label counts
            return node(results=self.uniqueCounts(samples))

    def printTree(self, tree, indent=' '):  # display the tree as text
        """Print the tree in indented text form (debug aid).

        Python 2 `print x,` statements converted to print(..., end=' ')."""
        if tree.results is not None:
            print(str(tree.results))
        else:
            print(str(tree.col) + ':' + str(tree.value) + '?')
            print(indent + 'T->', end=' ')
            self.printTree(tree.trueBranch, indent + ' ')
            print(indent + 'F->', end=' ')
            self.printTree(tree.falseBranch, indent + ' ')

    def predict_tree(self, observation, tree):  # classify with one decision tree
        """Route `observation` down `tree` and return the leaf's majority label."""
        if tree.results is not None:
            return tree.getLabel()
        v = observation[tree.col]
        if isinstance(v, int) or isinstance(v, float):
            branch = tree.trueBranch if v >= tree.value else tree.falseBranch
        else:
            branch = tree.trueBranch if v == tree.value else tree.falseBranch
        return self.predict_tree(observation, branch)

    def generateBootstrapSamples(self, data):  # draw a bootstrap sample
        """Draw len(data) rows from `data` with replacement."""
        samples = []
        for i in range(len(data)):
            samples.append(data[np.random.randint(len(data))])
        return samples

    def fit(self, data):  # build the random forest
        """Grow n_bootstrapSamples trees, each on its own bootstrap sample."""
        for i in range(self.n_bootstrapSamples):
            samples = self.generateBootstrapSamples(data)
            currentTree = self.buildTree(samples)
            self.list_tree.append(currentTree)
def predict_randomForests(self, observation):#利用随机森林对给定观测数据进行分类
results = {}
for i in range(len(self.list_tree)):
currentResult = self.predict_tree(observation, self.list_tree[i])
if currentResult not in results:
results[currentResult] = 0
results[currentResult] = results[currentResult] + 1
max_counts = 0
for key in results.keys():
if results[key] > max_counts:
finalResult = key
max_counts = results[key]