【机器学习】k近邻算法

最新推荐文章于 2024-05-20 08:39:09 发布

AllBull

最新推荐文章于 2024-05-20 08:39:09 发布

阅读量214

点赞数

分类专栏：人工智能　文章标签：机器学习 近邻算法 sklearn

本文链接：https://blog.csdn.net/weixin_48328037/article/details/124850926

版权

人工智能专栏收录该内容

4 篇文章 0 订阅

订阅专栏

# -*- coding: utf-8 -*-
"""
Created on Wed May 18 16:59:12 2022

@author: 1252319301
"""

# In[1]
# 导入依赖
import numpy as np
import pandas as pd

# In[2]
# 直接引入sklearn中的数据集iris 鸢尾花
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split # 切分数据集为训练集、测试集
from sklearn.metrics import accuracy_score # 用来计算分类预测的准确率

# In[3]
# Load the iris dataset that ships with scikit-learn.
iris = load_iris()
# Author's notes: type(iris) is sklearn.utils.Bunch and iris.keys() lists
# 'data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'.

# Tabular view of the features plus a readable species-name column, obtained
# by indexing target_names with the integer targets.
# (A numeric label column could be added the same way from iris.target.)
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target_names[iris.target]
)
# print(df) / print(df.describe()) can be used to inspect the table.

# Feature matrix and labels; the 1-D label array is turned into a column
# (shape (n, 1)).
x = iris.data
y = np.reshape(iris.target, (-1, 1))

# In[4]
# =============================================================================
# Split the dataset
# =============================================================================
# Hold out 30% of the samples for testing. stratify=y splits each class in
# the same proportion; random_state=35 seeds the split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=35,
    stratify=y,
)
# Author's notes on the result:
#   x_test.shape,  y_test.shape   -> (45, 4) (45, 1)
#   x_train.shape, y_train.shape  -> (105, 4) (105, 1)
#   type(x_test)                  -> <class 'numpy.ndarray'>
#   x_test[0]                     -> [5.8 2.7 5.1 1.9]
#   x_test[0].reshape(1, -1)      -> [[5.8 2.7 5.1 1.9]], shape (1, 4)

# In[5]
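# Distance functions. They are called with the training data and one test
# sample: the test sample must be a 1-D vector (so NumPy can broadcast it),
# while the training data may be a 2-D matrix. One distance is returned per row.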
# Manhattan (L1) distance
def l1_distance(a,b):
    """Return the row-wise Manhattan distance between ``a`` and ``b``.

    The element-wise absolute differences of ``a - b`` are summed along
    axis 1, so the result holds one value per row instead of a single
    grand total.
    """
    abs_diff = np.abs(a - b)
    return np.sum(abs_diff, axis=1)
# Euclidean (L2) distance
def l2_distance(a,b):
    """Return the row-wise Euclidean distance between ``a`` and ``b``.

    The squared element-wise differences of ``a - b`` are summed along
    axis 1 and the square root of each row sum is returned.
    """
    diff = a - b
    row_sums = np.sum(np.square(diff), axis=1)
    return np.sqrt(row_sums)

# In[6]
# =============================================================================
# 核心算法
# =============================================================================
# k-nearest-neighbours classifier
class kNN(object):
    """Brute-force k-nearest-neighbours classifier.

    ``n_neighbors`` and ``dist_func`` are plain attributes and are read on
    every ``predict`` call, so they may be reassigned between predictions.
    """

    def __init__(self, n_neighbors = 1, dist_func = l1_distance):
        """Store the hyper-parameters.

        Args:
            n_neighbors: number of nearest training samples that vote.
            dist_func: callable invoked as ``dist_func(x_train, sample)``;
                it must return one distance per training row.
        """
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        """Remember the training set.

        kNN has no real training step: the samples ``x`` and labels ``y``
        are only stored so that ``predict`` can measure distances to them.
        """
        self.x_train = x
        self.y_train = y

    def predict(self, x):
        """Predict a label for every row of ``x`` by majority vote.

        Args:
            x: array of samples; one prediction is made per row.

        Returns:
            Array of shape ``(x.shape[0], 1)`` with the dtype of the
            training labels.

        Raises:
            ValueError: if ``n_neighbors`` is smaller than 1.
        """
        # Without at least one neighbour there is nothing to vote on; fail
        # with a clear message instead of a cryptic argmax error (k == 0)
        # or a silent vote over almost all samples (negative k).
        if self.n_neighbors < 1:
            raise ValueError(
                "n_neighbors must be at least 1, got %r" % (self.n_neighbors,)
            )

        # Output buffer: one row per sample, same dtype as the labels.
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)

        for i, x_test in enumerate(x):
            # Distance from this sample to every training sample.
            distances = self.dist_func(self.x_train, x_test)
            # argsort returns the indices that would sort the distances,
            # e.g. np.argsort(np.array([3, 2, 1])) -> [2, 1, 0].
            nn_index = np.argsort(distances)
            # Labels of the k closest samples, flattened to 1-D.
            nn_y = self.y_train[nn_index[:self.n_neighbors]].ravel()
            # Majority vote. np.unique returns the sorted distinct labels
            # and their counts, e.g. for [2, 1, 0, 1, 1, 2] the labels are
            # [0, 1, 2] with counts [1, 3, 2], so label 1 wins. argmax picks
            # the first maximum, i.e. the smallest label on a tie, which
            # matches argmax(bincount(...)). Unlike bincount, this also
            # works for negative, float or string labels.
            labels, counts = np.unique(nn_y, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]
        return y_pred
        
# In[7]
# Smoke test: classify the held-out samples with k = 3 and the default
# distance function.
knn = kNN(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Evaluate: fraction of test labels predicted correctly.
accuracy = accuracy_score(y_test, y_pred)
print("预测准确率：",accuracy)
# Output recorded by the author (k = 3, l1): 预测准确率： 0.9333333333333333

# In[8]
# Automated sweep over both distance functions and odd values of k.
knn = kNN(n_neighbors=3)
knn.fit(x_train, y_train)

result_list = []
for p, dist_name in ((1, 'l1_distance'), (2, 'l2_distance')):
    # p == 1 selects the Manhattan distance, p == 2 the Euclidean distance.
    knn.dist_func = l2_distance if p != 1 else l1_distance
    # Only odd k: 1, 3, 5, 7, 9.
    for k in (1, 3, 5, 7, 9):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        result_list.append([k, dist_name, accuracy])

df = pd.DataFrame(result_list, columns=['k','距离函数','预测准确率'])
print(df)
# Output recorded by the author:
#    k         距离函数     预测准确率
# 0  1  l1_distance  0.933333
# 1  3  l1_distance  0.933333
# 2  5  l1_distance  0.977778
# 3  7  l1_distance  0.955556
# 4  9  l1_distance  0.955556
# 5  1  l2_distance  0.933333
# 6  3  l2_distance  0.933333
# 7  5  l2_distance  0.977778
# 8  7  l2_distance  0.977778
# 9  9  l2_distance  0.977778

AllBull

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
【机器学习】k近邻算法

# -*- coding: utf-8 -*-"""Created on Wed May 18 16:59:12 2022@author: 1252319301"""# In[1]# 导入依赖import numpy as npimport pandas as pd# In[2]# 直接引入sklearn中的数据集iris 鸢尾花from sklearn.datasets import load_irisfrom sklearn.model_selection import .
复制链接

扫一扫