ml1:knn算法

最新推荐文章于 2020-05-29 17:00:27 发布

monk1992

最新推荐文章于 2020-05-29 17:00:27 发布

阅读量200

点赞数

分类专栏： machine learnings算法

本文链接：https://blog.csdn.net/monk1992/article/details/102781254

版权

machine learnings算法专栏收录该内容

3 篇文章 0 订阅

订阅专栏

KNN算法：又称k-近邻算法，是数据分类中最简单有效的算法之一。但是每次使用KNN算法分类时，都需要计算每个测试样本与所有训练样本之间的距离；若训练集较大，则会占用较多的内存，导致整体（尤其是测试阶段）的效率不高。
注意：KNN样本集中的每一个样本可能由多个特征组成，这些特征的数量级可能各不相同，这对最终结果影响很大，因此需要将数据归一化：newValue = (oldValue - min) / (max - min)，其中每个特征的 range = max - min。采用距离作为KNN的度量标准时，可以适当地设置各个特征在计算距离时的权重。此外，某些特殊情况下可以使用余弦相似度作为KNN的度量标准。

KNN算法实现：
分类函数 classify0 的输入参数：1、单个测试样本数据。2、处理后的训练集数据。3、训练集对应的labels。4、k值（取与测试样本距离最近的k个训练样本，统计其中出现次数最多的label作为分类结果）。

import numpy as np
import operator
import os
from mpl_toolkits import mplot3d
import matplotlib
import matplotlib.pyplot as plt


def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature vector of the sample to classify: a sequence or array
            with one value per column of ``dataSet``. A ``(1, d)`` array is
            flattened to ``(d,)`` before use.
        dataSet: 2-D array of training samples, one row per sample.
        labels: Sequence of labels; ``labels[i]`` belongs to ``dataSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` training samples
        closest to ``inX`` (Euclidean distance). On a tie, the label that
        received its first vote from the closest neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataSet = np.asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (dataSetSize, k))

    # Broadcasting subtracts the query from every training row at once,
    # so there is no need to materialise a tiled copy of the query.
    diffMat = np.asarray(inX).ravel() - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    nearest = distances.argsort()[:k]

    classCount = {}
    for idx in nearest:
        voteIlabel = labels[idx]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # max() returns the first maximal key in insertion order, which is the
    # same winner the stable descending sort used to pick.
    return max(classCount, key=classCount.get)


def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; it is mapped to
    all zeros instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D array (or nested sequence), one row per sample and one
            column per feature.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``:
            normDataSet: The scaled data, same shape as ``dataSet``.
            ranges: Per-column ``max - min`` (0 for constant columns).
            minVals: Per-column minimum.
    """
    dataSet = np.asarray(dataSet)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they become 0 rather than NaN; the
    # returned ``ranges`` still reports the true (zero) range.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals


def file2matrix(filename, n=3):
    """Load a tab-separated data file into a feature matrix and label list.

    Every non-blank line is split on tabs. The first ``n`` fields are the
    features (parsed as floats) and the last field is the class label
    (parsed as an int). Blank lines are skipped.

    Args:
        filename: Path of the text file to read.
        n: Number of leading feature columns to read from each line.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array of shape
        ``(number_of_samples, n)`` and a list with one int label per sample.

    Raises:
        ValueError: If a line has fewer than ``n`` fields, or a field cannot
            be parsed as a number.
    """
    rows = []
    classLabelVector = []
    # ``with`` closes the file even when a line fails to parse.
    with open(filename) as fr:
        for lineNumber, line in enumerate(fr, start=1):
            line = line.strip()
            if not line:
                continue
            listFromLine = line.split('\t')
            if len(listFromLine) < n:
                raise ValueError(
                    "line %d of %s has %d fields, expected at least %d"
                    % (lineNumber, filename, len(listFromLine), n))
            # Honour ``n`` here; a fixed slice of 3 ignored the parameter.
            rows.append([float(value) for value in listFromLine[:n]])
            classLabelVector.append(int(listFromLine[-1]))
    # reshape keeps the (0, n) shape when the file holds no samples.
    returnMat = np.array(rows, dtype=float).reshape(len(rows), n)
    return returnMat, classLabelVector


def datingClassTest(
        filename=r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\datingTestSet2.txt",
        hoRatio=0.10,
        k=3):
    """Measure the kNN error rate on a hold-out slice of the dating data.

    The file is loaded with ``file2matrix`` and scaled with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are used as test samples and the
    remaining rows as the training set. Each prediction and the final error
    rate are printed.

    Args:
        filename: Path of the tab-separated data file.
        hoRatio: Fraction of the rows (taken from the top) held out for
            testing.
        k: Number of neighbours passed to ``classify0``.

    Raises:
        ValueError: If the hold-out slice is empty, in which case no error
            rate can be computed.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError(
            "hold-out set is empty: %d samples with hoRatio=%r" % (m, hoRatio))

    # The training slice is the same for every test sample, so build it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d,the real answer is :%d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is:%f" % (errorCount / float(numTestVecs)))


def classifyperson():
    """Interactively predict how much the user would like a person.

    Reads three feature values from standard input, scales them with the
    minimum and range of the training data, classifies the result with
    ``classify0`` (k=3) against the normalized training set, and prints the
    matching description.
    """
    # classify0's result minus 1 is used as the index into this list.
    resultList = ["not at all", "in small doses", "in large doses"]
    percentTats = float(input("percentage if time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per year?"))
    iceCream = float(input("liters of icecream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(
        r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): the feature order here must match the column order of
    # the data file - confirm against the data set.
    inArr = np.array([percentTats, ffMiles, iceCream])
    # The query is scaled with minVals/ranges, so it must be compared with
    # the normalized matrix; comparing it with the raw datingDataMat mixes
    # two different scales and produces meaningless distances.
    classifiResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print("you will probably like this person:", resultList[classifiResult - 1])


def img2vector(filename):
    """Read a 32x32 text image of digits into a flat ``(1, 1024)`` vector.

    The first 32 characters of each of the first 32 lines are converted to
    ints and stored row after row, so the character at line ``i``, column
    ``j`` ends up at index ``i * 32 + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = np.zeros((1, 1024))
    # ``with`` closes the file even when a malformed line raises.
    with open(filename) as fr:
        for i in range(32):
            line = fr.readline()
            returnVect[0, i * 32:(i + 1) * 32] = [int(line[j]) for j in range(32)]
    return returnVect


def handwritingClassTest(filepath1, filepath2, k=3):
    """Run the kNN digit classifier over a test directory and print results.

    Every file in both directories is loaded with ``img2vector``. The true
    label of a file is the integer before the first underscore in its name.
    Each test image is classified against all training images; the per-image
    result, the error count and the error rate are printed.

    Args:
        filepath1: Directory holding the training images.
        filepath2: Directory holding the test images.
        k: Number of neighbours passed to ``classify0``.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # printed image indices reproducible between runs.
    train_files = sorted(os.listdir(filepath1))
    train_labels = []
    train_imgs = np.zeros((len(train_files), 1024))
    for i, train_name in enumerate(train_files):
        train_imgs[i] = img2vector(os.path.join(filepath1, train_name))
        train_labels.append(int(train_name.split("_")[0]))

    test_files = sorted(os.listdir(filepath2))
    test_labels = []
    error = 0
    for j, test_name in enumerate(test_files):
        test_img = img2vector(os.path.join(filepath2, test_name))
        test_labels.append(int(test_name.split("_")[0]))
        test_result = classify0(test_img, train_imgs, train_labels, k)
        print("image %d is probably %d ,its label is %d" % (j, test_result, test_labels[j]))
        if test_labels[j] != test_result:
            error += 1
    print("total images is %d ,total error images is %d" % (len(test_files), error))
    # With no test images there is no rate to report (it would divide by 0).
    if test_files:
        print("total error rate is %f" % (error / float(len(test_files))))

# Directories holding the handwritten-digit text images used by the demo run.
testing_path = r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\digits\testDigits"
training_path = r"C:\Users\tycho-wjr\Desktop\数据结构和算法相关\机器学习\机器学习实战中文\MLiA_SourceCode\machinelearninginaction\Ch02\digits\trainingDigits"

# Run the digit test only when executed as a script, so importing this
# module for its functions does not trigger a full classification run.
if __name__ == "__main__":
    handwritingClassTest(training_path, testing_path)


#datingClassTest()

#classifyperson()

"""
fig = plt.figure()
ax = mplot3d.Axes3D(fig)
#plt.xlabel("x air caption")
#plt.ylabel("y game caption")
#plt.zlabel("z ice-cream caption")
ax.scatter(xs=datingDataMat[:, 0], ys=datingDataMat[:, 1], zs=datingDataMat[:, 2], s=15 * np.array(datingDataLabel),
           c=15 * np.array(datingDataLabel))
plt.show()
"""

monk1992

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
ml1:knn算法

KNN算法：又称k-近邻算法，是数据分类最为简单有效的算法之一，但是每次使用KNN算法时需要所有的训练样本与每个测试样本进行计算，若训练集较大则占用较多的内存，导致训练和测试的效率不高。注意：KNN的样本集的每一个样本可能是由多个特征数据组成，这些特征数据的可能各自的数量级不一样这对最终结果影响很大，需要将数据归一化（每个特征的range=max-min），采用距离作为KNN的度量标准时候可以适...
复制链接

扫一扫