KNN

KNN机器学习实战(代码有修改)

K近邻分类算法

优点:精度高、对异常值不敏感、无数据输入假定

缺点:计算复杂度高、空间复杂度高

适用数据范围:数值型和标称型

归一化数值

def Normalize(dataset):
    """Min-max scale every column of ``dataset`` into the range [0, 1].

    Args:
        dataset: NumPy array of samples (rows) by features (columns).

    Returns:
        A tuple ``(norm_set, ranges, minVals)`` where ``norm_set`` is
        ``(dataset - minVals) / ranges``, ``ranges`` is the per-column
        ``max - min`` and ``minVals`` is the per-column minimum.  The same
        ``ranges`` and ``minVals`` can be used to scale new samples.

    Note:
        A column whose values are all identical has ``max - min == 0``.
        Its range is reported as 1 so that the column normalizes to 0
        instead of producing NaN/inf from a division by zero.
    """
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # Guard constant columns: dividing by a zero range would yield NaN/inf.
    ranges = np.where(ranges == 0, 1, ranges)
    norm_set = (dataset - minVals) / ranges
    return norm_set, ranges, minVals

代码 手写数字识别

import numpy as np
import matplotlib.pyplot as plt
import operator
import os


#inX 用于分类的输入向量
#dataset 训练样本
#labels 标签
#k 选择最近邻居的数目

def classify0(inX, dataset, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataset``.

    Args:
        inX: Input vector to classify (one value per feature).
        dataset: NumPy array of training samples, one sample per row.
        labels: Sequence of labels, ``labels[i]`` belonging to ``dataset[i]``.
        k: Number of nearest neighbours that vote.  An odd value avoids
            two-way ties.

    Returns:
        The label with the most votes.  When several labels share the top
        count, the one whose first vote came from the closer neighbour wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training samples.
    """
    dataset_size = dataset.shape[0]
    if not 1 <= k <= dataset_size:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (dataset_size, k)
        )

    # Broadcasting subtracts inX from every row without materialising the
    # repeated copy that np.tile would build.
    diffMat = inX - dataset
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    nearest = distances.argsort()[:k]  # indices of the k closest samples

    classcount = {}
    for sample_index in nearest:
        votelabel = labels[sample_index]
        classcount[votelabel] = classcount.get(votelabel, 0) + 1

    # max() returns the first maximal key in insertion order, i.e. the label
    # first seen among the closest neighbours wins a tie.
    return max(classcount, key=classcount.get)

## 文本读取(约会数据集示例,与手写数字识别无关,可删除)

def file2matix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line must hold at least three numeric fields followed by
    a label; the first three fields become one matrix row and the last field
    is kept as the (string) label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classlabelv)``: an ``(n, 3)`` float NumPy array
        and a list of ``n`` label strings, where ``n`` is the number of
        non-blank lines.

    Raises:
        ValueError: If one of the first three fields is not a number.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        stripped_lines = [line.strip() for line in fr]
    # Blank lines (e.g. a trailing newline at end of file) carry no sample.
    records = [line for line in stripped_lines if line]

    returnMat = np.zeros((len(records), 3))
    classlabelv = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classlabelv.append(listFromLine[-1])
    return returnMat, classlabelv
# NOTE(review): this statement runs at import time, so loading this module
# requires 'datingTestSet.txt' to exist in the working directory. Within the
# visible file the results are only referenced by the commented-out scatter
# plot below.
datingMat, datalabels = file2matix('datingTestSet.txt')

#print(datingMat,datalabels)
#plt.scatter(datingMat[:,1],datingMat[:,2],15.0*np.array(datalabels),15.0*np.array(datalabels))
#plt.show()

def datingClassTest():
    """Estimate the classifier's error rate on the dating data set.

    The first 10% of the normalized samples are held out as test cases and
    classified against the remaining 90%.  Every prediction and the final
    error rate are printed; nothing is returned.
    """
    Ratio = 0.1  # fraction of samples held out for testing
    # Same file name (including letter case) as the module-level load, so
    # the lookup also succeeds on case-sensitive file systems.
    dateMat, dataLabels = file2matix('datingTestSet.txt')
    normMat, _, _ = Normalize(dateMat)
    numTest = int(Ratio * normMat.shape[0])
    erro = 0
    for i in range(numTest):
        # NOTE(review): k=2 is even; with a 1-1 split the label of the
        # closest neighbour wins. Consider an odd k.
        classResult = classify0(normMat[i], normMat[numTest:], dataLabels[numTest:], 2)
        print('the came back with: %s, the real answer is %s' % (classResult, dataLabels[i]))
        if classResult != dataLabels[i]:
            erro += 1.0
    # With fewer than 10 samples the hold-out set is empty; report 0 instead
    # of dividing by zero.
    error_rate = erro / float(numTest) if numTest else 0.0
    print("total error is %f" % error_rate)


def classPerson():
    """Interactively classify one person using the dating data set.

    Prompts for three numeric features, scales them with the same minimum
    and range used for the training data, classifies the result with the
    5 nearest neighbours and prints the predicted category.

    Raises:
        ValueError: If an entered value is not a number, or if the predicted
            label cannot be converted to an integer.
    """
    results = ['not at all','in small dose','in large doess']
    percents = float(input('percentage of time spent playing video games'))
    ffMlies = float(input("liters filter miles earned per year"))
    iceCrems = float(input("ice cream consumed per year"))
    dataM, datals = file2matix('datingTestSet2.txt')
    normM, ranges, minVlas = Normalize(dataM)
    # NOTE(review): the feature order must match the column order of
    # 'datingTestSet2.txt' - confirm against the data file.
    test_person = np.array([percents, ffMlies, iceCrems])
    # The training matrix is normalized, so the query must be scaled the
    # same way; otherwise the raw-scale features dominate the distance.
    normalized_person = (test_person - minVlas) / ranges
    Results_P = classify0(normalized_person, normM, datals, 5)
    # Labels are read as strings; this assumes they are '1', '2' or '3'.
    print("you will probably like this person:", results[int(Results_P) - 1])
    
    
##以下为手写knn数字识别 读取文本数据代码

def img2vector(filename):
    """Read a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted to
    integers and stored row after row.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A NumPy float array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVector = np.zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for row in range(32):
            lineStr = fr.readline()
            for col in range(32):
                # Flatten (row, col) into a single index of the row vector.
                returnVector[0, 32 * row + col] = int(lineStr[col])
    return returnVector

def Handclassy():
    """Evaluate the k-NN classifier on the handwritten-digit text images.

    Every file in the 'trainingDigits' folder becomes one training sample and
    every file in 'testDigits' is classified with the 3 nearest neighbours.
    The label of a file is the part of its name before the first '_'
    (presumably names look like '3_17.txt' - confirm against the data set).
    Each prediction, the number of errors and the error rate are printed;
    nothing is returned.
    """
    trian_label = []
    foldername = 'trainingDigits'
    filenames = os.listdir(foldername)
    triandata = np.zeros((len(filenames), 1024))
    for i, name in enumerate(filenames):
        triandata[i] = img2vector(os.path.join(foldername, name))
        # Parse the label from the bare file name instead of the joined
        # path: splitting the path on '\\' only works on Windows and leaves
        # the folder name inside the label everywhere else.
        trian_label.append(name.split('.')[0].split('_')[0])

    foldername2 = 'testDigits'
    filename2s = os.listdir(foldername2)
    error = 0
    for name2 in filename2s:
        test_vec = img2vector(os.path.join(foldername2, name2))
        testlabel = name2.split('.')[0].split('_')[0]
        HclassResult = classify0(test_vec, triandata, trian_label, 3)
        print("the result is %s,actual is %s" % (HclassResult, testlabel))
        if HclassResult != testlabel:
            error += 1.0
    print("Totol numer of error is%f" % (error))
    # An empty test folder would otherwise cause a division by zero.
    error_rate = error / float(len(filename2s)) if filename2s else 0.0
    print("error:%s" % error_rate)

# Run the handwritten-digit evaluation only when this file is executed as a
# script, not when it is imported.
if __name__=='__main__':
    Handclassy()

结果

Totol numer of error is10.000000
error:0.010570824524312896
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值