集成学习器 Bagging 的实现(个体学习器使用 sklearn 第三方库提供的分类器)。
代码:
# coding=utf-8
from sklearn import tree, preprocessing,neural_network,naive_bayes,svm
import pandas as pd
from collections import defaultdict
from sklearn.utils import column_or_1d
class Bagging(object):
    """Bagging ensemble built from five heterogeneous sklearn base learners.

    Each learner (decision tree, MLP, Bernoulli NB, Gaussian NB, SVC) is
    trained on its own bootstrap sample of the csv file; the ensemble
    prediction is a simple majority vote.
    """

    def __init__(self, file_name, Class_Values):
        """
        :param file_name: csv file name, absolute path or url
        :param Class_Values: name of the label column
        """
        self.file_name = file_name
        self.Class_Values = Class_Values
        # One bootstrap split kept on the instance (used for the
        # diversity analysis and the final ensemble evaluation).
        self.Xtrain, self.Ytrain, self.Xtest, self.Ytest = self.read(
            self.file_name, self.Class_Values)
        # Five trained learners plus their individual held-out sets.
        self.clf1, self.clf2, self.clf3, self.clf4, self.clf5, self.TEST = self.CLT()

    def read(self, file_name, Class_Values):
        """Load the csv and produce a standardised bootstrap split.

        :param file_name: csv file name, absolute path or url
        :param Class_Values: label column name
        :return: Xtrain, Ytrain, Xtest, Ytest (X = data, Y = labels)
        """
        data = pd.read_csv(file_name)
        train, test = self.Bootstrap(data)
        # .loc is the documented accessor for boolean column masks (the
        # original used .iloc, which only incidentally accepts them).
        feature_mask = train.columns != Class_Values
        Xtrain = preprocessing.scale(train.loc[:, feature_mask])  # Z-score standardisation
        Ytrain = train.loc[:, train.columns == Class_Values]
        Xtest = preprocessing.scale(test.loc[:, test.columns != Class_Values])  # Z-score standardisation
        Ytest = test.loc[:, test.columns == Class_Values]
        return Xtrain, Ytrain, Xtest, Ytest

    def Bootstrap(self, data):
        """Bootstrap sampling: the train set is drawn with replacement and
        has the same size as `data`; the test set is the out-of-bag rows
        (~36.8% of the data on average)."""
        train = data.sample(frac=1.0, replace=True)
        test = data.loc[data.index.difference(train.index)].copy()
        return train, test

    def DecisionTreeClassifier(self, X, Y):
        """
        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: fitted decision tree
        """
        ctf = tree.DecisionTreeClassifier()
        ctf.fit(X, Y)
        return ctf

    def DecisionTreeClassifier_Correctly_Instances(self, Xtest, Ytest):
        """Accuracy of the single decision-tree learner clf1."""
        return self.clf1.score(Xtest, Ytest)

    def MLPClassifier(self, X, Y):
        """
        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: fitted neural network
        """
        ctf = neural_network.MLPClassifier(solver='lbfgs', alpha=1e-5,
                                           hidden_layer_sizes=(10, 10),
                                           random_state=1, activation='tanh')
        ctf.fit(X, Y)
        return ctf

    def MLPClassifier_Correctly_Instances(self, Xtest, Ytest):
        """Accuracy of the single neural-network learner clf2."""
        return self.clf2.score(Xtest, Ytest)

    def BernoulliNB(self, X, Y):
        """
        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: fitted Bernoulli naive Bayes model
        """
        ctf = naive_bayes.BernoulliNB()
        ctf.fit(X, Y)
        return ctf

    def BernoulliNB_Correctly_Instances(self, Xtest, Ytest):
        """Accuracy of the single Bernoulli naive-Bayes learner clf3."""
        return self.clf3.score(Xtest, Ytest)

    def GaussianNB(self, X, Y):
        """
        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: fitted Gaussian naive Bayes model
        """
        ctf = naive_bayes.GaussianNB()
        ctf.fit(X, Y)
        return ctf

    def GaussianNB_Correctly_Instances(self, Xtest, Ytest):
        """Accuracy of the single Gaussian naive-Bayes learner clf4."""
        return self.clf4.score(Xtest, Ytest)

    def SVC(self, X, Y):
        """
        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: fitted support vector machine
        """
        ctf = svm.SVC()
        ctf.fit(X, Y)
        return ctf

    def SVC_Correctly_Instances(self, Xtest, Ytest):
        """Accuracy of the single SVM learner clf5."""
        return self.clf5.score(Xtest, Ytest)

    def main_bagging(self, X, Y, c1=1, c2=1, c3=1, c4=1, c5=1, c6=1):
        """Majority-vote ensemble accuracy on (X, Y).

        Set ci != 1 to drop learner i from the vote (learners judged
        insufficiently diverse by the k-statistic).  c6 is accepted for
        backward compatibility but has no matching learner.

        :param X: samples, shape [n_samples, n_features]
        :param Y: labels, shape [n_samples] or [n_samples, 1]
        :return: correctly-classified fraction (0.0 for empty X)
        """
        Y = column_or_1d(Y, warn=True)
        length = len(X)
        if length == 0:
            return 0.0  # avoid ZeroDivisionError on an empty test set
        # Predict the whole matrix once per learner instead of one row at
        # a time inside the loop (same results, far fewer predict calls).
        preds = [clf.predict(X) for clf in
                 (self.clf1, self.clf2, self.clf3, self.clf4, self.clf5)]
        flags = (c1, c2, c3, c4, c5)
        success_number = 0
        for i in range(length):
            votes = defaultdict(int)
            for flag, p in zip(flags, preds):
                if flag == 1:
                    votes[p[i]] += 1
            # Ballot: most frequent prediction wins; sorted() is stable,
            # so ties go to the last-inserted learner, as before.
            winner = sorted(votes, key=lambda lbl: votes[lbl])[-1]
            # BUG FIX: the original compared str([Y[i]]) with the str() of
            # a numpy array, which disagrees for float labels
            # ('[1.0]' != '[1.]'); compare the label values directly.
            if winner == Y[i]:
                success_number += 1
        return success_number / length

    # k-statistic
    def duoyangxing(self, H1, H2, X):
        """Pairwise diversity (k-statistic) of two trained learners on X.

        NOTE(review): the contingency counts assume binary labels 0/1 —
        any other label values leave m == 0 and raise ZeroDivisionError;
        confirm against the data set's label encoding.
        """
        # Predict each matrix once instead of per-row (identical counts).
        p1 = H1.predict(X)
        p2 = H2.predict(X)
        a = b = c = d = 0
        for y1, y2 in zip(p1, p2):
            if y1 == 0 and y2 == 0:
                a += 1
            elif y1 == 0 and y2 == 1:
                b += 1
            elif y1 == 1 and y2 == 0:
                c += 1
            elif y1 == 1 and y2 == 1:
                d += 1
        m = a + b + c + d
        p1_agree = (a + d) / m                                # observed agreement
        p2_chance = ((a + b) * (a + c) + (c + d) * (b + d)) / m ** 2  # chance agreement
        k = (p1_agree - p2_chance) / (1 - p2_chance)
        return k

    def show_duoyuanxing(self):
        """Print and return the k-statistics of all 10 learner pairs
        (same (i, j) order as itertools.combinations)."""
        clfs = [self.clf1, self.clf2, self.clf3, self.clf4, self.clf5]
        K = [self.duoyangxing(clfs[i], clfs[j], self.Xtrain)
             for i in range(len(clfs))
             for j in range(i + 1, len(clfs))]
        print('k统计量', K)
        return K

    def CLT(self):
        """Train the five base learners, each on its own bootstrap sample.

        :return: clf1..clf5 and TEST, the flat list
                 [Xtest1, Ytest1, ..., Xtest5, Ytest5] of the per-learner
                 held-out sets.
        """
        trainers = (self.DecisionTreeClassifier, self.MLPClassifier,
                    self.BernoulliNB, self.GaussianNB, self.SVC)
        clfs = []
        TEST = []
        for fit in trainers:
            Xtr, Ytr, Xte, Yte = self.read(self.file_name, self.Class_Values)
            clfs.append(fit(Xtr, Ytr))
            TEST.extend([Xte, Yte])
        clf1, clf2, clf3, clf4, clf5 = clfs
        # BUG FIX: the original returned an undefined clf6 (NameError) and
        # seven values, while __init__ unpacks six.
        return clf1, clf2, clf3, clf4, clf5, TEST

    def CLT_PERCENT(self):
        """Print each base learner's held-out accuracy and their mean."""
        clf1_percent = self.DecisionTreeClassifier_Correctly_Instances(self.TEST[0], self.TEST[1])
        clf2_percent = self.MLPClassifier_Correctly_Instances(self.TEST[2], self.TEST[3])
        clf3_percent = self.BernoulliNB_Correctly_Instances(self.TEST[4], self.TEST[5])
        clf4_percent = self.GaussianNB_Correctly_Instances(self.TEST[6], self.TEST[7])
        clf5_percent = self.SVC_Correctly_Instances(self.TEST[8], self.TEST[9])
        print('决策树', clf1_percent)
        print('神经网络', clf2_percent)
        print('伯努利贝叶斯', clf3_percent)
        print('高斯贝叶斯', clf4_percent)
        print('支持向量机', clf5_percent)
        # BUG FIX: the original referenced an undefined clf6_percent and
        # divided the five accuracies by 6; average the five learners.
        mean = (clf1_percent + clf2_percent + clf3_percent
                + clf4_percent + clf5_percent) / 5
        print('单个学习器正确率平均值', mean)
if __name__ == '__main__':
    # Haberman survival data set; 'status' is the label column.
    ensemble = Bagging('haberman.csv', 'status')
    # Per-learner held-out accuracies and their mean.
    ensemble.CLT_PERCENT()
    # Pairwise k-statistics (diversity) of the five learners.
    ensemble.show_duoyuanxing()
    # Vote with clf1 and clf4 only (clf2/clf3/clf5 dropped for low diversity).
    accuracy = ensemble.main_bagging(ensemble.Xtrain, ensemble.Ytrain,
                                     c2=0, c3=0, c5=0)
    print('集成学习器测试集正确率', accuracy)