第二章k邻近算法(机器学习实战)

第二章k邻近算法(机器学习实战)

本章主要介绍 k-近邻(k-Nearest Neighbors,kNN)算法的基本原理,并附上测试用例代码。

算法介绍

k邻近算法介绍

测试代码

测试一

简单的算法测试代码,kNN_1.py。

from numpy import *

def createDataSet():
    """Build the tiny hard-coded toy dataset used by the demo.

    Returns:
        A tuple ``(group, label)``: ``group`` is a 4x2 array of sample
        points and ``label`` is an array holding the class name ('A' or
        'B') of each row of ``group``.
    """
    samples = [[1.0, 1.1], [1.0, 1.0], [0.0, 1.0], [0.0, 1.1]]
    tags = ['A', 'A', 'B', 'B']
    return array(samples), array(tags)

def classify(input, dataset, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``input`` and every row of
    ``dataset``.

    Args:
        input: Feature vector to classify: a sequence of d values, or an
            array of shape (d,) or (1, d). (The name shadows the builtin
            ``input``; it is kept because it is part of the signature.)
        dataset: 2-D array of training samples, one row per sample.
        labels: Sequence of class labels indexed in step with the rows of
            ``dataset``.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The most frequent label among the voting neighbours. On a tie, the
        label encountered first when walking outward from the nearest
        neighbour wins.

    Raises:
        IndexError: If no neighbour votes (``k < 1`` or ``dataset`` has no
            rows).
    """
    size = dataset.shape[0]
    # Broadcasting subtracts the sample from every training row without
    # materialising a tiled copy of it.
    diffmat = asarray(input) - dataset
    distances = ((diffmat ** 2).sum(axis=1)) ** 0.5
    # argsort returns indices in ASCENDING distance order: nearest first.
    sortedindex = distances.argsort()

    # Never ask for more neighbours than there are training samples.
    # (A conditional is used instead of min(): `from numpy import *` may
    # shadow the builtin.)
    neighbours = size if k > size else k

    classcount = {}
    for i in range(neighbours):
        votelabel = labels[sortedindex[i]]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1

    # Stable sort by vote count, highest first; ties keep insertion order,
    # i.e. the label of the nearer neighbour comes first.
    ranked = sorted(classcount, key=classcount.get, reverse=True)
    return ranked[0]

if __name__ == '__main__':
    # Classify one query point against the toy dataset with k=3 and print
    # the predicted class.
    samples, tags = createDataSet()
    print(classify([1.0, 1.2], samples, tags, 3))

测试二

约会网站配对示例代码,kNN_2.py。

from numpy import *
import matplotlib
import matplotlib.pyplot as plt
from kNN_1 import *

def file2Matrix(filepath):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line is split on tabs; the first three fields are parsed
    as floats (the features) and the last field as an int (the label).
    Blank lines are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A tuple ``(matrix, labels)``: ``matrix`` is an (n, 3) float array
        and ``labels`` is a list of n ints, where n is the number of
        non-blank lines.

    Raises:
        ValueError: If a field cannot be parsed as a number, or a line has
            fewer than three feature fields.
    """
    # The context manager closes the handle even if parsing fails later.
    with open(filepath) as handle:
        rows = [line.strip().split("\t") for line in handle if line.strip()]

    matrix = zeros((len(rows), 3))
    labels = []
    for i, fields in enumerate(rows):
        matrix[i, :] = [float(field) for field in fields[:3]]
        labels.append(int(fields[-1]))

    return matrix, labels

def datasetAnalysis(dataset, labels):
    """Show a scatter plot of the first two feature columns.

    Column 0 of ``dataset`` is plotted against column 1. Each point's
    marker size and colour are both set to 15 times its label value.

    Args:
        dataset: 2-D array with at least two columns.
        labels: Numeric label per row of ``dataset``.
    """
    # One figure with a single subplot (1 row, 1 column, first slot).
    axes = plt.figure().add_subplot(111)
    # The same scaled label values drive both marker size and colour.
    scaled = 15.0 * array(labels)
    axes.scatter(dataset[:, 0], dataset[:, 1], scaled, scaled)
    plt.show()

def autoNorm(dataset):
    """Min-max normalise every column of ``dataset`` into [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A constant column (zero range) is mapped to all zeros instead of
    producing NaN from a 0/0 division.

    Args:
        dataset: 2-D numeric array, one sample per row.

    Returns:
        A tuple ``(returnmatrix, ranges, minvals)``: the normalised array,
        the per-column ranges (max - min, zero for constant columns) and
        the per-column minima.
    """
    minvals = dataset.min(axis=0)
    maxvals = dataset.max(axis=0)
    ranges = maxvals - minvals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN;
    # the returned `ranges` still reports the true (zero) range.
    divisor = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min and range to every row.
    returnmatrix = (dataset - minvals) / divisor
    return returnmatrix, ranges, minvals

def datasetTest(dataset, labels, testratio=0.10, k=10):
    """Hold-out evaluation of the kNN classifier; prints the error rate.

    The first ``int(n * testratio)`` rows are used as test samples and the
    remaining rows as the training set. Every misclassified sample is
    printed, followed by the error count and error ratio.

    Args:
        dataset: 2-D feature array, one sample per row.
        labels: Label per row of ``dataset``.
        testratio: Fraction of rows (taken from the top) used for testing.
        k: Number of neighbours passed to ``classify``.
    """
    allnum = dataset.shape[0]
    testnum = int(allnum * testratio)
    if testnum == 0:
        # No hold-out rows: report it instead of dividing by zero below.
        print("error: ", 0, ", ratio: ", "n/a (no test samples)")
        return

    # The training slice is the same for every test sample; build it once.
    traindata = dataset[testnum:allnum]
    trainlabels = labels[testnum:allnum]

    error = 0
    for i in range(testnum):
        res = classify(dataset[i], traindata, trainlabels, k)
        if res != labels[i]:
            print("pre: ", res, "======> gt: ", labels[i])
            error += 1
    print("error: ", error, ", ratio: ", error / testnum)

if __name__ == '__main__':
    # Load the dating dataset and echo it.
    matrix, label = file2Matrix("/home/tgj/machinelearninginaction/Ch02/datingTestSet2.txt")
    print(matrix, label)

    # Optional visual inspection of the raw data:
    # datasetAnalysis(matrix, label)

    # Normalise the features; only the normalised matrix is needed here.
    normmatrix = autoNorm(matrix)[0]

    # Run the hold-out evaluation.
    datasetTest(normmatrix, label)

测试三

手写数字识别示例(类似 MNIST,数据为 32×32 的文本格式数字图像),kNN_3.py。

from numpy import *
import matplotlib, os
import matplotlib.pyplot as plt
from kNN_1 import *

def img2Vector(digit_dir):
    """Load every digit text file in a directory into a feature matrix.

    Each file becomes one 1024-value row: line ``i`` of the file is parsed
    character by character as ints and stored in columns ``i*32`` to
    ``(i+1)*32``, so files are expected to hold 32 lines of 32 digit
    characters. The label is the integer before the first ``_`` in the
    file name.

    Args:
        digit_dir: Directory containing the digit files.

    Returns:
        A tuple ``(returnmatrix, labels)``: an (n_files, 1024) float array
        and a list of int labels in the same order (the order returned by
        ``os.listdir``).
    """
    files = os.listdir(digit_dir)
    labels = []
    returnmatrix = zeros((len(files), 1024))
    for row, file in enumerate(files):
        label = file.split("_")[0]
        # The context manager closes each handle; a large directory would
        # otherwise leave many files open.
        with open(os.path.join(digit_dir, file)) as handle:
            for i, line in enumerate(handle):
                strs = line.strip()
                vals = array([int(val) for val in strs])
                returnmatrix[row, i*32:(i+1)*32] = vals
        labels.append(int(label))

    return returnmatrix, labels

def datasetTest(testpath, dataset, labels, k=10):
    """Classify every digit file in ``testpath`` and print the error rate.

    Each file is read into a (1, 1024) vector (line ``i`` fills columns
    ``i*32`` to ``(i+1)*32``) and classified against the training set. The
    expected label is the integer before the first ``_`` in the file name.
    Every misclassification is printed, followed by the error count and
    error ratio.

    Args:
        testpath: Directory containing the digit files to evaluate.
        dataset: Training feature matrix, one 1024-value row per sample.
        labels: Training label per row of ``dataset``.
        k: Number of neighbours passed to ``classify``.
    """
    files = os.listdir(testpath)
    testnum = len(files)
    if testnum == 0:
        # Empty directory: report it instead of dividing by zero below.
        print("error: ", 0, ", ratio: ", "n/a (no test samples)")
        return

    error = 0
    for file in files:
        label = file.split("_")[0]
        testvec = zeros((1, 1024))
        # The context manager closes each handle as soon as it is parsed.
        with open(os.path.join(testpath, file)) as handle:
            for i, line in enumerate(handle):
                strs = line.strip()
                vals = array([int(val) for val in strs])
                testvec[0, i * 32:(i + 1) * 32] = vals

        pre = classify(testvec, dataset, labels, k)
        if pre != int(label):
            error += 1
            print("pre: ", pre, " ===> gt: ", label)

    print("error: ", error, ", ratio: ", error / testnum)

if __name__ == '__main__':
    # Build the training matrix and labels from the training digits.
    returnmatrix, label = img2Vector("/home/tgj/machinelearninginaction/Ch02/digits/trainingDigits")

    # Evaluate on the separate test directory. Evaluating on the training
    # directory itself would match every sample with its own copy at
    # distance 0 and report a misleadingly low error rate.
    testdir = "/home/tgj/machinelearninginaction/Ch02/digits/testDigits"
    datasetTest(testdir, returnmatrix, label)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

黑马水牛

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值