集成学习器bagging

集成学习器 bagging 的实现(个体学习器基于第三方库 sklearn)
代码:

# coding=utf-8
from sklearn import tree, preprocessing,neural_network,naive_bayes,svm
import pandas as pd
from collections import defaultdict
from sklearn.utils import column_or_1d


class Bagging(object):
    """Bagging ensemble built from five heterogeneous sklearn base learners.

    Base learners: decision tree, MLP, Bernoulli NB, Gaussian NB, SVC.
    Each learner is trained on its own bootstrap sample of the data set;
    the ensemble predicts by majority vote over the selected learners.
    """

    def __init__(self, file_name, Class_Values):
        # Path (or URL) of the CSV data file.
        self.file_name = file_name
        # Name of the class-label column in the CSV.
        self.Class_Values = Class_Values
        self.Xtrain, self.Ytrain, self.Xtest, self.Ytest = self.read(self.file_name, self.Class_Values)
        self.clf1, self.clf2, self.clf3, self.clf4, self.clf5, self.TEST = self.CLT()

    def read(self, file_name, Class_Values):
        """Load the CSV and split it into a bootstrap train/test pair.

        :param file_name: file name, absolute path or URL
        :param Class_Values: key (column name) of the class label
        :return: Xtrain, Ytrain, Xtest, Ytest -- X is the data, Y the labels
        """
        data = pd.read_csv(file_name)
        train, test = self.Bootstrap(data)
        Xtrain = train.iloc[:, train.columns != Class_Values]
        Xtrain = preprocessing.scale(Xtrain)    # standardize features (Z-score)
        Ytrain = train.iloc[:, train.columns == Class_Values]
        Xtest = test.iloc[:, test.columns != Class_Values]
        Xtest = preprocessing.scale(Xtest)      # standardize features (Z-score)
        Ytest = test.iloc[:, test.columns == Class_Values]
        return Xtrain, Ytrain, Xtest, Ytest

    def Bootstrap(self, data):
        """Bootstrap sampling: train = n rows drawn with replacement,
        test = the out-of-bag rows never drawn into the train sample."""
        train = data.sample(frac=1.0, replace=True)
        test = data.loc[data.index.difference(train.index)].copy()
        return train, test

    def DecisionTreeClassifier(self, X, Y):
        """Fit a decision tree.

        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: the fitted decision tree
        """
        ctf = tree.DecisionTreeClassifier()
        ctf.fit(X, Y)
        return ctf

    def DecisionTreeClassifier_Correctly_Instances(self, Xtest, Ytest):
        """Return the accuracy of the single decision-tree learner clf1."""
        percent = self.clf1.score(Xtest, Ytest)
        return percent

    def MLPClassifier(self, X, Y):
        """Fit a multi-layer perceptron.

        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: the fitted neural network
        """
        ctf = neural_network.MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10, 10), random_state=1, activation='tanh')
        ctf.fit(X, Y)
        return ctf

    def MLPClassifier_Correctly_Instances(self, Xtest, Ytest):
        """Return the accuracy of the single neural-network learner clf2."""
        percent = self.clf2.score(Xtest, Ytest)
        return percent

    def BernoulliNB(self, X, Y):
        """Fit a Bernoulli naive Bayes classifier.

        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: the fitted Bernoulli naive Bayes model
        """
        ctf = naive_bayes.BernoulliNB()
        ctf.fit(X, Y)
        return ctf

    def BernoulliNB_Correctly_Instances(self, Xtest, Ytest):
        """Return the accuracy of the single Bernoulli-NB learner clf3."""
        percent = self.clf3.score(Xtest, Ytest)
        return percent

    def GaussianNB(self, X, Y):
        """Fit a Gaussian naive Bayes classifier.

        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: the fitted Gaussian naive Bayes model
        """
        ctf = naive_bayes.GaussianNB()
        ctf.fit(X, Y)
        return ctf

    def GaussianNB_Correctly_Instances(self, Xtest, Ytest):
        """Return the accuracy of the single Gaussian-NB learner clf4."""
        percent = self.clf4.score(Xtest, Ytest)
        return percent

    def SVC(self, X, Y):
        """Fit a support vector classifier.

        :param X: training samples, shape [n_samples, n_features]
        :param Y: class labels, shape [n_samples]
        :return: the fitted SVM
        """
        ctf = svm.SVC()
        ctf.fit(X, Y)
        return ctf

    def SVC_Correctly_Instances(self, Xtest, Ytest):
        """Return the accuracy of the single SVM learner clf5."""
        percent = self.clf5.score(Xtest, Ytest)
        return percent

    def main_bagging(self, X, Y, c1=1, c2=1, c3=1, c4=1, c5=1, c6=1):
        """Majority-vote prediction over the selected base learners.

        :param X: samples to classify, shape [n_samples, n_features]
        :param Y: true labels, shape [n_samples]
        :param c1..c5: set ci != 1 to drop learner i from the vote (chosen
            by inspecting the pairwise K statistics); c6 is unused and kept
            only for backward compatibility with existing callers
        :return: fraction of correctly classified instances
        """
        Y = column_or_1d(Y, warn=True)
        length = len(X)
        # Guard against division by zero on an empty sample set.
        if length == 0:
            return 0.0
        flags = (c1, c2, c3, c4, c5)
        learners = (self.clf1, self.clf2, self.clf3, self.clf4, self.clf5)
        success_number = 0
        for i in range(length):
            res = defaultdict(int)
            for flag, clf in zip(flags, learners):
                if flag == 1:
                    # predict returns a 1-element array; vote with the scalar label
                    res[clf.predict([X[i]])[0]] += 1
        
            # majority vote: the label collecting the most votes wins
            RES = sorted(res, key=lambda lab: res[lab])[-1]
            if Y[i] == RES:  # matches the true label
                success_number += 1
        return success_number / length

    # K statistic (pairwise diversity measure)
    def duoyangxing(self, H1, H2, X):
        """Pairwise kappa (K) statistic between two trained classifiers.

        Assumes binary labels 0/1. A low K indicates a diverse pair.
        """
        # Predict once per classifier for the whole set instead of
        # calling predict() sample-by-sample (same results, far fewer calls).
        pred1 = H1.predict(X)
        pred2 = H2.predict(X)
        a, b, c, d = 0, 0, 0, 0
        for y1, y2 in zip(pred1, pred2):
            if y1 == 0 and y2 == 0:
                a += 1
            elif y1 == 0 and y2 == 1:
                b += 1
            elif y1 == 1 and y2 == 0:
                c += 1
            elif y1 == 1 and y2 == 1:
                d += 1
        m = a + b + c + d
        p1 = (a + d) / m                                    # observed agreement
        p2 = ((a + b) * (a + c) + (c + d) * (b + d)) / m ** 2  # chance agreement
        k = (p1 - p2) / (1 - p2)
        return k

    def show_duoyuanxing(self):
        """Compute and print the K statistic for every pair of base learners."""
        clfs = [self.clf1, self.clf2, self.clf3, self.clf4, self.clf5]
        K = []
        # Same pair order as enumerating (1,2),(1,3)...(4,5) explicitly.
        for i in range(len(clfs)):
            for j in range(i + 1, len(clfs)):
                K.append(self.duoyangxing(clfs[i], clfs[j], self.Xtrain))
        print('k统计量', K)
        return K

    def CLT(self):
        """Train the five base learners, each on its own bootstrap split.

        :return: clf1..clf5 and TEST, the flat list of each learner's own
                 out-of-bag test data [Xtest1, Ytest1, ..., Xtest5, Ytest5]
        """
        TR11, TR12, TR13, TR14 = self.read(self.file_name, self.Class_Values)
        TR21, TR22, TR23, TR24 = self.read(self.file_name, self.Class_Values)
        TR31, TR32, TR33, TR34 = self.read(self.file_name, self.Class_Values)
        TR41, TR42, TR43, TR44 = self.read(self.file_name, self.Class_Values)
        TR51, TR52, TR53, TR54 = self.read(self.file_name, self.Class_Values)
        clf1 = self.DecisionTreeClassifier(TR11, TR12)
        clf2 = self.MLPClassifier(TR21, TR22)
        clf3 = self.BernoulliNB(TR31, TR32)
        clf4 = self.GaussianNB(TR41, TR42)
        clf5 = self.SVC(TR51, TR52)

        TEST = [TR13, TR14, TR23, TR24, TR33, TR34, TR43, TR44, TR53, TR54]
        # BUG FIX: the original returned an undefined name clf6 here, which
        # raised NameError and mismatched the 6-value unpack in __init__.
        return clf1, clf2, clf3, clf4, clf5, TEST

    def CLT_PERCENT(self):
        """Print each base learner's accuracy on its own out-of-bag test set."""
        clf1_percent = self.DecisionTreeClassifier_Correctly_Instances(self.TEST[0], self.TEST[1])
        clf2_percent = self.MLPClassifier_Correctly_Instances(self.TEST[2], self.TEST[3])
        clf3_percent = self.BernoulliNB_Correctly_Instances(self.TEST[4], self.TEST[5])
        clf4_percent = self.GaussianNB_Correctly_Instances(self.TEST[6], self.TEST[7])
        clf5_percent = self.SVC_Correctly_Instances(self.TEST[8], self.TEST[9])

        print('决策树', clf1_percent)
        print('神经网络', clf2_percent)
        print('伯努利贝叶斯', clf3_percent)
        print('高斯贝叶斯', clf4_percent)
        print('支持向量机', clf5_percent)
        # BUG FIX: the original referenced an undefined clf6_percent and
        # averaged over 6, although there are only 5 base learners.
        print('单个学习器正确率平均值', (clf1_percent + clf2_percent + clf3_percent + clf4_percent + clf5_percent) / 5)

if __name__=='__main__':
    # Build the ensemble from the Haberman survival CSV; 'status' is the label column.
    A=Bagging('haberman.csv','status')
    # Print each base learner's individual accuracy on its out-of-bag test set.
    A.CLT_PERCENT()
    # Print the pairwise K (diversity) statistics of the five learners.
    A.show_duoyuanxing()
    # Majority vote with learners 2, 3 and 5 excluded (c2=c3=c5=0).
    # NOTE(review): despite the "测试集" (test-set) label, this evaluates on
    # Xtrain/Ytrain, not Xtest/Ytest -- confirm this is intentional.
    print('集成学习器测试集正确率', A.main_bagging(A.Xtrain, A.Ytrain, c3=0,c5=0,c2=0))
  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值