# -*- coding: utf-8 -*-
"""
Created on Wed May 18 16:59:12 2022
@author: 1252319301
"""
# In[1]
# 导入依赖
import numpy as np
import pandas as pd
# In[2]
# 直接引入sklearn中的数据集iris 鸢尾花
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split # 切分数据集为训练集、测试集
from sklearn.metrics import accuracy_score # 用来计算分类预测的准确率
# In[3]
# Load the iris dataset that ships with scikit-learn.
# Notes from the original run: ``iris`` is a sklearn.utils.Bunch whose keys are
# 'data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'.
iris = load_iris()

# Tabular view for inspection: one column per feature plus a ``species``
# column holding the class name looked up from each sample's integer target.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target_names[iris.target]
)

# Arrays consumed by the classifier further down.
x = iris.data
y = np.reshape(iris.target, (-1, 1))  # 1-D label vector -> single-column 2-D array
# In[4]
# =============================================================================
# 划分数据集
# =============================================================================
# Hold out 30% of the samples as the test set. ``stratify=y`` splits each
# class in the same proportion; ``random_state`` makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    random_state=35,
    test_size=0.3,
)
# Shapes noted from the original run:
#   x_test (45, 4), y_test (45, 1), x_train (105, 4), y_train (105, 1)
# x_test is a numpy.ndarray; a single row such as x_test[0] is 1-D,
# e.g. [5.8 2.7 5.1 1.9], and x_test[0].reshape(1, -1) has shape (1, 4).
# In[5]
# Distance functions: ``a`` holds the training samples (one per row) and ``b``
# is the query sample; NumPy broadcasting pairs ``b`` with every row of ``a``.
# Manhattan (L1) distance
def l1_distance(a, b):
    """Return the Manhattan (L1) distance between ``b`` and each row of ``a``.

    Args:
        a: Array-like of samples, normally 2-D with one sample per row.
            A single 1-D sample is also accepted and treated as one row.
        b: Array-like that broadcasts against ``a`` (typically one 1-D sample).

    Returns:
        numpy.ndarray with one distance per row of ``a - b``.
    """
    # asarray lets plain lists through; atleast_2d makes the axis=1 reduction
    # valid even when both inputs are single 1-D vectors.
    diff = np.atleast_2d(np.asarray(a) - np.asarray(b))
    # Reduce along axis 1 so the result keeps one value per row instead of
    # collapsing everything into a single number.
    return np.sum(np.abs(diff), axis=1)
# Euclidean (L2) distance
def l2_distance(a, b):
    """Return the Euclidean (L2) distance between ``b`` and each row of ``a``.

    Args:
        a: Array-like of samples, normally 2-D with one sample per row.
            A single 1-D sample is also accepted and treated as one row.
        b: Array-like that broadcasts against ``a`` (typically one 1-D sample).

    Returns:
        numpy.ndarray with one distance per row of ``a - b``.
    """
    # asarray lets plain lists through; atleast_2d makes the axis=1 reduction
    # valid even when both inputs are single 1-D vectors.
    diff = np.atleast_2d(np.asarray(a) - np.asarray(b))
    return np.sqrt(np.sum(diff ** 2, axis=1))
# In[6]
# =============================================================================
# 核心算法
# =============================================================================
# k-nearest-neighbours classifier
class kNN:
    """Brute-force k-nearest-neighbours classifier.

    Attributes set by the constructor:
        n_neighbors: number of nearest training samples that vote.
        dist_func: callable ``dist_func(x_train, sample)`` returning one
            distance per training row.

    ``fit`` must be called before ``predict``; it is the only place where
    ``x_train`` / ``y_train`` are set.
    """

    def __init__(self, n_neighbors=1, dist_func=l1_distance):
        """Store the neighbour count and the distance function."""
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        """Memorise the training set.

        kNN has no real training step: the samples are only stored so that
        ``predict`` can measure distances against them.

        Args:
            x: Array-like of training samples, one per row.
            y: Array-like of labels, one per training sample.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            ValueError: if ``x`` and ``y`` hold a different number of samples.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.shape[0] != y.shape[0]:
            raise ValueError(
                "x and y must contain the same number of samples, "
                f"got {x.shape[0]} and {y.shape[0]}"
            )
        self.x_train = x
        self.y_train = y
        return self

    def predict(self, x):
        """Predict a label for every row of ``x`` by majority vote.

        Args:
            x: Array-like of samples to classify, one per row.

        Returns:
            numpy.ndarray of shape ``(len(x), 1)`` with the dtype of the
            training labels.

        Raises:
            ValueError: if ``n_neighbors`` is smaller than 1.
        """
        if self.n_neighbors < 1:
            raise ValueError(
                f"n_neighbors must be at least 1, got {self.n_neighbors}"
            )
        x = np.asarray(x)
        # Pre-allocate the result column with the label dtype.
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)
        for i, x_test in enumerate(x):
            # Distance from this sample to every training sample.
            distances = self.dist_func(self.x_train, x_test)
            # argsort returns the indices that would sort the distances, e.g.
            #   np.argsort(np.array([3, 2, 1])) -> [2, 1, 0]
            nn_index = np.argsort(distances)
            # Labels of the k closest samples, flattened to 1-D.
            nn_y = self.y_train[nn_index[:self.n_neighbors]].ravel()
            # Majority vote. np.unique returns the distinct labels in sorted
            # order together with their counts, so argmax picks the most
            # frequent label and, on a tie, the smallest one. Unlike
            # np.bincount this is not limited to non-negative integers, e.g.
            #   np.unique(np.array([2, 1, 0, 1, 1, 2]), return_counts=True)
            #   -> labels [0, 1, 2], counts [1, 3, 2]  => winner 1
            labels, counts = np.unique(nn_y, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]
        return y_pred
# In[7]
# Single run: fit a 3-nearest-neighbour model (default distance function)
# on the training split and report its accuracy on the test split.
knn = kNN(n_neighbors=3)
knn.fit(x_train, y_train)  # kNN only stores the training data here

y_pred = knn.predict(x_test)

# Fraction of test samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("预测准确率:", accuracy)
'''
k = 3,l1, 预测准确率: 0.9333333333333333
'''
# In[8]
# Automated comparison: evaluate both distance functions for every odd k
# from 1 to 9 and tabulate the accuracy of each combination.
knn = kNN(n_neighbors=3)
knn.fit(x_train, y_train)  # the training data is shared by all runs below
result_list = []

for p in [1, 2]:
    # p selects the distance function: 1 -> Manhattan, 2 -> Euclidean.
    if p == 1:
        knn.dist_func, dist_name = l1_distance, 'l1_distance'
    else:
        knn.dist_func, dist_name = l2_distance, 'l2_distance'

    # Odd values of k only.
    for k in range(1, 10, 2):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        accuracy = accuracy_score(y_test, y_pred)
        result_list.append([k, dist_name, accuracy])

# One row per (k, distance function) combination.
df = pd.DataFrame(result_list, columns=['k', '距离函数', '预测准确率'])
print(df)
'''
k 距离函数 预测准确率
0 1 l1_distance 0.933333
1 3 l1_distance 0.933333
2 5 l1_distance 0.977778
3 7 l1_distance 0.955556
4 9 l1_distance 0.955556
5 1 l2_distance 0.933333
6 3 l2_distance 0.933333
7 5 l2_distance 0.977778
8 7 l2_distance 0.977778
9 9 l2_distance 0.977778
'''
# 【机器学习】k近邻算法
# 最新推荐文章于 2024-05-20 08:39:09 发布