1. What is KNN
The k-nearest neighbours algorithm (K-Nearest Neighbour, K-NN) is a basic classification and regression method; it is theoretically mature and one of the simplest machine learning algorithms. The idea: if the majority of the K samples most similar to a given sample (i.e., its nearest neighbours in feature space) belong to a certain class, then the sample belongs to that class as well.
K-NN is simple and intuitive. Given a training set and a new input instance, find the K instances in the training set nearest to that instance; whichever class holds the majority among these K instances is the class assigned to the input instance. K-NN has no explicit learning process.
2. Algorithm Steps
step.1---initialize the distances to the maximum value
step.2---compute the distance dist between the unknown sample and each training sample
step.3---find maxdist, the largest distance among the current K nearest samples
step.4---if dist is smaller than maxdist, take this training sample as one of the K nearest neighbours
step.5---repeat steps 2, 3 and 4 until the distances between the unknown sample and all training samples have been computed
step.6---count how many times each class appears among the K nearest samples
step.7---take the most frequent class as the class of the unknown sample (a minimal sketch of these steps follows this list)
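The examples below take a simpler route (compute all distances, then sort), but the steps above can be implemented directly by maintaining the current K nearest neighbours during a single scan. A minimal sketch, where knn_scan is a hypothetical helper name not used in the examples:

import numpy as np

def knn_scan(dataSet, dataSet_label, testdata, k):
    # step 1: initialize the k nearest distances to the maximum value
    neigh_dist = np.full(k, np.inf)
    neigh_label = np.empty(k, dtype=object)
    # step 5: the loop repeats steps 2-4 over every training sample
    for i in range(np.shape(dataSet)[0]):
        # step 2: distance between the unknown sample and this training sample
        dist = np.sqrt(np.sum((dataSet[i, :] - testdata[0, :]) ** 2))
        # step 3: largest distance (maxdist) among the current k nearest samples
        j = np.argmax(neigh_dist)
        # step 4: if dist < maxdist, this sample replaces the farthest neighbour
        if dist < neigh_dist[j]:
            neigh_dist[j] = dist
            neigh_label[j] = dataSet_label[0, i]
    # steps 6-7: count each class among the k nearest and return the most frequent
    labels, counts = np.unique(neigh_label.astype(str), return_counts=True)
    return labels[np.argmax(counts)]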
Example 1 - The Iris Dataset
import numpy as np
import operator
'''
Training set: samples (m) × features (n)
    x1 x2 x3 x4 ... xn
1
2
...
m
Labels of the m samples: a (1 × m) two-dimensional matrix
    1 2 3 ... m
Features of the test sample: a (1 × n) two-dimensional matrix
    x1 x2 x3 x4 ... xn
Although these are one-dimensional, writing them as 2-D matrices keeps the format uniform
'''
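For reference, the metrics implemented below are, in LaTeX notation:

d_{\mathrm{Manhattan}}(x, y) = \sum_{j=1}^{n} |x_j - y_j|
d_{\mathrm{Euclidean}}(x, y) = \sqrt{\sum_{j=1}^{n} (x_j - y_j)^2}
\mathrm{sim}_{\mathrm{Cosine}}(x, y) = \frac{\sum_j x_j y_j}{\sqrt{\sum_j x_j^2}\,\sqrt{\sum_j y_j^2}}
\mathrm{sim}_{\mathrm{Pearson}}(x, y) = \frac{\sum_j (x_j - \bar{x})(y_j - \bar{y})}{\sqrt{\sum_j (x_j - \bar{x})^2 \sum_j (y_j - \bar{y})^2}}

The code negates the cosine and Pearson similarities so that, for every metric, a smaller value means a closer match.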
def Manhattan(dataSet, testdata):
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        # Manhattan distance is the sum of absolute differences; no square root
        distance[i] = np.sum(np.abs(dataSet[i, :] - testdata[0, :]))
    return distance
def Euclidean(dataSet, testdata):
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        distance[i] = np.sqrt(np.sum(np.power(dataSet[i, :] - testdata[0, :], 2)))
    return distance
def Cosine(dataSet, testdata):
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        # the trailing \ is a line continuation so the expression can span two lines
        # negate the similarity: a smaller distance value then means higher similarity
        distance[i] = -(np.sum(dataSet[i, :] * testdata[0, :])) / (np.sqrt(np.sum( \
            np.power(dataSet[i, :], 2))) * np.sqrt(np.sum(np.power(testdata[0, :], 2))))
    return distance
def Pearson(dataSet, testdata):
    m = np.shape(dataSet)[0]
    n = np.shape(dataSet)[1]
    distance = np.zeros(m)
    ave_t = np.sum(testdata[0, :]) / n
    for i in range(m):
        ave_d = np.sum(dataSet[i, :]) / n
        # negate the correlation so a smaller value means higher similarity; the absolute
        # value treats a strong negative correlation as similarity as well
        distance[i] = -np.abs(np.sum((testdata[0, :] - ave_t) * (dataSet[i, :] - ave_d)) / np.sqrt \
            (np.sum(np.power(testdata[0, :] - ave_t, 2)) * np.sum(np.power(dataSet[i, :] - ave_d, 2))))
    return distance
def Jaccard(dataSet, testdata):
    # placeholder in the original post; left unimplemented
    return
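For completeness, a minimal sketch of how this placeholder could be filled in, assuming binary (0/1) feature vectors, which is where Jaccard distance makes sense (it does not apply to the continuous Iris features):

def Jaccard(dataSet, testdata):
    # Jaccard distance = 1 - |intersection| / |union| of the two binary vectors
    m = np.shape(dataSet)[0]
    distance = np.zeros(m)
    for i in range(m):
        inter = np.sum(np.logical_and(dataSet[i, :], testdata[0, :]))
        union = np.sum(np.logical_or(dataSet[i, :], testdata[0, :]))
        distance[i] = 1.0 - inter / union
    return distance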
#training set, training labels, test sample, distance function, value of k
def Knn(dataSet, dataSet_label, testdata, distance_way, k):
    #dist_sort holds the indices that sort the distances in ascending order
    dist_sort = distance_way(dataSet, testdata).argsort()
    #dictionary of {class: count}
    classcount = {}
    #walk through the k nearest samples
    for i in range(k):
        label = dataSet_label[0, dist_sort[i]]
        classcount[label] = classcount.get(label, 0) + 1
    #classcount_sort is the classcount items sorted by count in descending order
    classcount_sort = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    #return the label with the highest count
    return classcount_sort[0][0]
def get_data(txt_name):
    #count the number of lines in the txt file
    count = len(open(txt_name).readlines())
    #create the data and label matrices
    data = np.zeros(shape=(count, 4))
    label = np.zeros(shape=(count, 1))
    label = label.astype(str)
    i = 0
    file = open(txt_name)
    for line in file.readlines():
        #strip() removes leading and trailing whitespace
        #split() splits the line on the separator and returns a list of strings
        cutline = line.strip().split(',')
        data[i, :] = np.array(cutline[0:-1], dtype=float)
        label[i, 0] = cutline[-1]
        i += 1
    return data, label.T
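get_data expects each line of Iris.txt to follow the standard UCI format: four comma-separated feature values followed by the class label, for example:

5.1,3.5,1.4,0.2,Iris-setosa
7.0,3.2,4.7,1.4,Iris-versicolor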
'''
---------------------------------------------------begin----------------------------------------------
'''
a = get_data('Iris.txt')
dataSet, dataSet_label = a[0], a[1]
testdata = np.array([
    [5.4, 3.7, 1.5, 0.2]
], dtype=float)
print(Knn(dataSet, dataSet_label, testdata, Manhattan, 10))
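This prints the predicted class of the test point. As a possible extension (not in the original post), the classifier can be scored with leave-one-out validation, reusing the Knn function and the data loaded above:

correct = 0
m = dataSet.shape[0]
for i in range(m):
    # hold out sample i and predict it from the remaining samples
    train = np.delete(dataSet, i, axis=0)
    train_label = np.delete(dataSet_label, i, axis=1)
    test = dataSet[i, :].reshape(1, -1)
    if Knn(train, train_label, test, Euclidean, 10) == dataSet_label[0, i]:
        correct += 1
print('leave-one-out accuracy: %.3f' % (correct / m))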
Example 2 - Handwritten Digit Recognition
Dataset source:
https://www.jianshu.com/p/3d0bba113fd6
Only the first two files there (the training images and training labels) are used.
Implementation:
import struct
import numpy as np
import matplotlib.pyplot as plt
import operator
# Manhattan, Euclidean, Cosine, Pearson and Knn are identical to the
# definitions in Example 1 and are not repeated here; reuse them from above.
# read the mnist files
def load_mnist(label_path, image_path):
    with open(label_path, 'rb') as lbpath:
        # label file header: two big-endian 32-bit integers (magic number, item count)
        magic, n = struct.unpack('>II', lbpath.read(8))
        label = np.fromfile(lbpath, dtype=np.uint8)
    with open(image_path, 'rb') as imgpath:
        # image file header: four big-endian 32-bit integers (magic, count, rows, cols)
        magic, num, rows, cols = struct.unpack('>IIII', imgpath.read(16))
        image = np.fromfile(imgpath, dtype=np.uint8).reshape(len(label), 784)
    # 60000 training samples and their labels
    # label: 1-D array of length 60000
    # image: 60000 × 784 (784 = 28 × 28)
    return label, image
'''
--------------------------------begin--------------------------------------------
'''
a = load_mnist('train-labels.idx1-ubyte', 'train-images.idx3-ubyte')
# reshape the 1-D label array to 2-D to keep the format uniform
dataSet_label, dataSet = a[0].reshape(1, 60000), a[1]
# test sample: the training sample at index 8999 (note that because it is drawn
# from the training set, its nearest neighbour is itself, at distance 0)
testdata = dataSet[8999, :].reshape(1, 784)
# k is set to 10
print(Knn(dataSet, dataSet_label, testdata, Euclidean, 10))
# display the test image
plt.imshow(testdata.reshape(28, 28), cmap='Greys')
plt.show()
Result: the predicted digit is printed and the 28 × 28 test image is displayed.
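A possible extension (not part of the original post): score the classifier on the official MNIST test set, assuming the t10k files from the same source sit next to the training files. Brute-force KNN compares each query against all 60000 training images, so only a small slice is evaluated:

test_label, test_image = load_mnist('t10k-labels.idx1-ubyte', 't10k-images.idx3-ubyte')
correct = 0
n = 100  # each prediction scans all 60000 training images, so keep the slice small
for i in range(n):
    pred = Knn(dataSet, dataSet_label, test_image[i, :].reshape(1, 784), Euclidean, 10)
    correct += (pred == test_label[i])
print('accuracy on the first %d test images: %.2f' % (n, correct / n))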