Machine Learning in Action (Chinese Edition): Testing the Performance of Classification Learners

1. Data

  • Read the data from file (a parsing sketch follows this list)
  • The features are certain clinical measurements taken from horses
  • The label indicates whether the case is horse colic
  • Only a subset of the features is used in this test
  • The data is kept as Python lists
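
For reference, each record in the two data files is a single tab-separated line; below is a minimal parsing sketch, with the column layout taken from the full code in section 4:

    # each line: tab-separated measurements with the ±1 label in column 21;
    # the script uses only the first 20 measurement columns
    fields = line.strip().split('\t')
    features = [float(x) for x in fields[:20]]
    label = float(fields[21])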

2. Learners

  • k-Nearest Neighbors
  • Decision tree
  • Naive Bayes
  • Logistic regression
  • Support vector machine
  • Ensemble learning (AdaBoost)
    # Choose a learner
    # KNN, Tree, Bayes, Logistic, SVM, Adaboost, ALL
    choiceModel = 'ALL'
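
Each learner section in the full code is gated on this flag; an equivalent and slightly more idiomatic membership test would be:

    # same as: choiceModel == 'KNN' or choiceModel == 'ALL'
    if choiceModel in ('KNN', 'ALL'):
        ...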
    

3. Results

  • Classification error rate of each learner on the test set (computed as sketched below)
    [Figure: bar chart of each learner's classification error rate]
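
All six learners are scored the same way:

    # error rate = misclassified test samples / number of test samples
    errorRate = errorCount / float(numTestVecs)  # e.g. 20 errors on 100 samples -> 0.20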

4. Full code

  • All classifier algorithms are taken from the Chinese edition of Machine Learning in Action
  • The training and test sets are horseColicTraining2.txt and horseColicTest2.txt
# -*- coding: utf-8 -*-
# author: AnoI  time:2020/10/29

import kNN
import trees, treePlotter
import bayes
import logRegres
import svmMLiA
import adaboost
from numpy import *  # scientific computing package
import copy
import matplotlib.pyplot as plt
import random


# ---------------------- Data ----------------------
'''
Read the data from file.
The features are certain clinical measurements taken from horses.
The label indicates whether the case is horse colic.
Only a subset of the features is used in this test.
Data is kept in list format.
'''
frTrain = open('horseColicTraining2.txt')
frTest = open('horseColicTest2.txt')
trainingSet = []
trainingLabels = []
testingSet = []
testingLabels = []
featureLabels = []  # feature names; the real meanings are unknown here, so plain indices are used
featureNum = 20  # number of features to use
for line in frTrain.readlines():
    currLine = line.strip().split('\t')
    lineArr = []
    for i in range(featureNum):
        lineArr.append(float(currLine[i]))
    trainingSet.append(lineArr)
    trainingLabels.append(float(currLine[21]))
for line in frTest.readlines():
    currLine = line.strip().split('\t')
    lineArr = []
    for i in range(featureNum):
        lineArr.append(float(currLine[i]))
    testingSet.append(lineArr)
    testingLabels.append(float(currLine[21]))
for i in range(featureNum):
    featureLabels.append('%d' % i)
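# featureLabels exists only for the decision-tree code: trees.createTree keys
# tree nodes by these names and trees.classify looks a feature's column up via
# featureLabels.index(name), so plain string indices '0'..'19' are enough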

trainLen = len(trainingLabels)
testLen = len(testingLabels)
print('trainLen:', trainLen)
print('testLen:', testLen)
# print(featureLabels)
# print(trainingSet, '\n', trainingLabels)
# print(testingSet, '\n', testingLabels)

# -------------------------------------------------------


# ---------------------- Learners ----------------------
# Choose a learner
# KNN, Tree, Bayes, Logistic, SVM, Adaboost, ALL
choiceModel = 'ALL'

# Error rate of each learner
errorRateKNN = 0.0
errorRateTree = 0.0
errorRateBayes = 0.0
errorRateLogistic = 0.0
errorRateSVM = 0.0
errorRateAdaboost = 0.0



if choiceModel == 'KNN' or choiceModel == 'ALL':

    print("----- kNN -----")

    # Normalize both sets, reusing the training set's column ranges for the test
    # set so that both are scaled identically
    normTrain, ranges0, minVals0 = kNN.autoNorm(array(trainingSet))
    normTest = (array(testingSet) - minVals0) / ranges0
    errorCount = 0.0
    numTestVecs = testLen
    for i in range(numTestVecs):
        classifierResult = kNN.classify0(normTest[i], normTrain, trainingLabels, 10)
        if (classifierResult != testingLabels[i]): errorCount += 1.0
    errorRateKNN = errorCount / float(numTestVecs)
    print("\nkNN error rate is: %f" % errorRateKNN, "\nerrorCount: ", errorCount)


if choiceModel == 'Tree' or choiceModel == 'ALL':

    print("----- Tree -----")

    # This was my first approach, but I found a better one below. (Learning
    # Python properly? I mostly guessed my way through.)
    # The tricky part was getting a real copy instead of a reference.
    # dataSet = []
    # for i in range(trainLen):
    #     dataArr = trainingSet[i][:]
    #     dataArr.append(trainingLabels[i])
    #     dataSet.append(dataArr)

    # No more aliasing: the length of trainingSet[0] no longer changes
    # print(len(trainingSet[0]))
    dataSet = copy.deepcopy(trainingSet)
    for i in range(trainLen):
        dataSet[i].append(trainingLabels[i])
    # print(len(trainingSet[0]))
    # print(len(dataSet))
    # print(len(dataSet[0]))

    # Shannon entropy
    shan = trees.calcShannonEnt(dataSet)
    print("Shannon entropy:", shan)
    # Build the tree; pass a copy of featureLabels, because the book's createTree
    # deletes entries from the label list it is given and classify() below still
    # needs the complete list
    mytree = trees.createTree(dataSet, featureLabels[:])
    # Plot the tree
    # treePlotter.createPlot(mytree)

    numTestVecs = testLen
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = trees.classify(mytree, featureLabels, testingSet[i])
        if (classifierResult != testingLabels[i]): errorCount += 1.0
    errorRateTree = errorCount / float(numTestVecs)
    print("\nTree error rate is: %f" % errorRateTree, "\nerrorCount: ", errorCount)


if choiceModel == 'Bayes' or choiceModel == 'ALL':

    print("----- Bayes -----")

    myList = bayes.createVocabList(trainingSet)
    trainMat = []
    for pl in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(myList, pl))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, trainingLabels)
    # print(p0V, p1V, pAb)

    numTestVecs = testLen
    errorCount = 0.0
    for i in range(numTestVecs):
        thisTesting = array(bayes.setOfWords2Vec(myList, testingSet[i]))
        classifierResult = bayes.classifyNB(thisTesting, p0V, p1V, pAb)
        if classifierResult == 0:
            classifierResult = -1
        if (classifierResult != testingLabels[i]): errorCount += 1.0
    errorRateBayes = errorCount / float(numTestVecs)
    print("\nBayes error rate is: %f" % errorRateBayes, "\nerrorCount: ", errorCount)


if choiceModel == 'Logistic' or choiceModel == 'ALL':

    print("----- Logistic -----")

    trainWeights = logRegres.stocGradAscent1(array(trainingSet), trainingLabels, 500)
    # print(trainWeights)

    numTestVecs = testLen
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = logRegres.classifyVector(array(testingSet[i]), trainWeights)
        if classifierResult == 0.0:
            classifierResult = -1.0
        if (classifierResult != testingLabels[i]): errorCount += 1.0
    errorRateLogistic = errorCount / float(numTestVecs)
    print("\nLogistic error rate is: %f" % errorRateLogistic, "\nerrorCount: ", errorCount)

if choiceModel == 'SVM' or choiceModel == 'ALL':

    print("----- SVM -----")

    k1 = 1.3
    # C = 200, tolerance = 0.0001, maximum number of iterations = 5000
    b, alphas = svmMLiA.smoP(trainingSet, trainingLabels, 200, 0.0001, 5000, ('rbf', k1))  # C=200 important
    datMat = mat(trainingSet)
    labelMat = mat(trainingLabels).transpose()
    svInd = nonzero(alphas.A > 0)[0]
    sVs = datMat[svInd]  # get matrix of only support vectors
    labelSV = labelMat[svInd]
    # print("there are %d Support Vectors" % shape(sVs)[0])
    m, n = shape(datMat)
    errorCount = 0
    for i in range(m):
        kernelEval = svmMLiA.kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(trainingLabels[i]): errorCount += 1
    print("SVM: the training error rate is: %f" % (float(errorCount) / m))
    errorCount = 0
    datMat = mat(testingSet)
    labelMat = mat(testingLabels).transpose()
    m, n = shape(datMat)
    for i in range(m):
        kernelEval = svmMLiA.kernelTrans(sVs, datMat[i, :], ('rbf', k1))
        predict = kernelEval.T * multiply(labelSV, alphas[svInd]) + b
        if sign(predict) != sign(testingLabels[i]): errorCount += 1
    errorRateSVM = (float(errorCount) / m)
    print("\nSVM error rate is: %f" % errorRateSVM, "\nerrorCount: ", errorCount)


if choiceModel == 'Adaboost' or choiceModel == 'ALL':
    print("----- Adaboost -----")
    classiA, aggClass = adaboost.adaBoostTrainDS(trainingSet, trainingLabels, 20)
    prediction = adaboost.adaClassify(testingSet, classiA)
    err = mat(ones((testLen, 1)))
    errorCount = (err[prediction != mat(testingLabels).T].sum())
    errorRateAdaboost = (float(errorCount) / testLen)
    print("\nAdaboost error rate is: %f" % errorRateAdaboost, "\nerrorCount: ", errorCount)


# Generate a random hex color string
def randomcolor():
    colorArr = ['1','2','3','4','5','6','7','8','9','A','B','C','D','E','F']
    color = ""
    for i in range(6):
        color += colorArr[random.randint(0,14)]
    return "#"+color


if choiceModel == 'ALL':
    # Rectangle-patch version of the chart (kept commented out)
    # fig1 = plt.figure()
    # ax1 = fig1.add_subplot(111, aspect='equal')
    # ax1.add_patch(plt.Rectangle((0, 0),10,errorRateKNN*100,color=randomcolor()))
    # ax1.add_patch(plt.Rectangle((15, 0),10,errorRateTree*100,color=randomcolor()))
    # ax1.add_patch(plt.Rectangle((30, 0),10,errorRateBayes*100,color=randomcolor()))
    # ax1.add_patch(plt.Rectangle((45, 0),10,errorRateLogistic*100,color=randomcolor()))
    # ax1.add_patch(plt.Rectangle((60, 0),10,errorRateSVM*100,color=randomcolor()))
    # ax1.add_patch(plt.Rectangle((75, 0),10,errorRateAdaboost*100,color=randomcolor()))
    # plt.xlim(0, 100)
    # plt.ylim(0, 100)
    # plt.show()

    name_list = ['KNN', 'Tree', 'Bayes', 'Logistic', 'SVM', 'Adaboost']
    num_list = [errorRateKNN, errorRateTree, errorRateBayes, errorRateLogistic, errorRateSVM, errorRateAdaboost]
    # Bar chart of the error rates; pass the colors as an explicit list so the
    # bars cycle through red, green and blue
    plt.bar(range(len(num_list)), num_list, color=list('rgb'), tick_label=name_list)
    plt.show()
