机器学习之KNN算法(二)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

加载样本数据

  • matplotlib读取图片
  • 将每一张图片转化为一组数组

在这里插入图片描述

# Read one sample bitmap into an array
bmp1 = plt.imread('digits/0/0_1.bmp')
# Inspect its shape
display(bmp1.shape)
# Render it
plt.imshow(bmp1)
(28, 28)





<matplotlib.image.AxesImage at 0x5d80c50>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-wPyO982s-1650167688362)(output_2_2.png)]

# Same image again, this time drawn with the grayscale colour map
bmp1 = plt.imread('digits/0/0_1.bmp')
# Inspect its shape
display(bmp1.shape)
# Render it
plt.imshow(bmp1,cmap='gray')
(28, 28)





<matplotlib.image.AxesImage at 0x5dfdeb0>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qURvt6vA-1650167688364)(output_3_2.png)]

尝试将两张图片转化为2行 28*28列

bmp2 = plt.imread('digits/0/0_2.bmp')
# Flatten each image into a 1-D vector (ravel() is interchangeable with
# reshape(-1) here) and stack the two vectors as the rows of one array
digits = np.array([image.ravel() for image in (bmp1, bmp2)])
digits.shape
(2, 784)

尝试将数组中某一张图片进行显示

plt.imshow(digits[0].reshape(28,28),cmap='gray')
<matplotlib.image.AxesImage at 0x4ca1e90>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iBLhAv9j-1650167688367)(output_7_1.png)]

批量化读取所有的图片

  • 样本数据:每一张图片转化成的一维数组
  • 标签:图片所在的目录名(0-9)
# Goal: `data` ends up with shape (5000, 784) and `target` with shape (5000,)
data, target = [], []

for label in range(10):
    for index in range(1, 501):
        # Files are laid out as digits/<label>/<label>_<index>.bmp
        bmp_filename = f'digits/{label}/{label}_{index}.bmp'
        bmp = plt.imread(bmp_filename)
        # Flatten the image array into a 1-D feature vector
        data.append(bmp.reshape(-1))
        # The folder name doubles as the class label
        target.append(label)

# Turn the Python lists into ndarrays
data, target = np.array(data), np.array(target)

display(data.shape, target.shape)
(5000, 784)



(5000,)

创建分类模型

  • KNN 分类
  • 邻近数量:5,7,9,11
  • weights: uniform,distance # 权重
from sklearn.model_selection import train_test_split as split

# Reference, as listed by the author: KNeighborsClassifier signature
#   KNeighborsClassifier(
#       n_neighbors=5,
#       weights='uniform',
#       algorithm='auto',
#       leaf_size=30,
#       p=2,
#       metric='minkowski',
#       metric_params=None,
#       n_jobs=None,
#       **kwargs,
#   )
# Start with 7 neighbours; the other parameters keep their defaults
knn = KNeighborsClassifier(n_neighbors=7)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = split(data, target, test_size=0.2)

使用拆分后的训练集进行训练

knn.fit(X_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')
knn.score(X_test,y_test)
0.93
# n_neighbors=5
knn.set_params(n_neighbors=5)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.929
# n_neighbors=9
knn.set_params(n_neighbors=9)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.922
# n_neighbors=9, weights='distance'
knn.set_params(n_neighbors=9,weights='distance')
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.926
# n_neighbors=11; weights is still 'distance' from the previous set_params call
knn.set_params(n_neighbors=11)
knn.fit(X_train,y_train)
knn.score(X_test,y_test)
0.917

使用全部样本重新训练

knn.fit(data,target)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='distance')
import os

加载测试样本

test1 = plt.imread('digits/test/4.bmp')
test1.shape
(28, 28, 3)
plt.imshow(test1.mean(axis=-1))
<matplotlib.image.AxesImage at 0xada3b30>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Bd0Hrtzs-1650167688370)(output_26_1.png)]

# 加灰度 
plt.imshow(test1.mean(axis=-1),cmap='gray')
<matplotlib.image.AxesImage at 0xaddf710>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-y2Cp272r-1650167688372)(output_27_1.png)]

test_data = []
test_target = []
# os.listdir() returns entries in arbitrary order; sort them so the sample
# order (and any plot built from it) is the same on every run
for filename in sorted(os.listdir('digits/test')):
    print(filename)
    bmp_file = f'digits/test/{filename}'
    bmp = plt.imread(bmp_file)
    # A colour bitmap is 3-D: (height, width, channels). Average only the
    # first three channels (r, g, b) into one grey value per pixel, so that a
    # fourth channel, if present, cannot skew the result. An image that is
    # already 2-D (single channel) is used unchanged.
    gray = bmp[..., :3].mean(axis=-1) if bmp.ndim == 3 else bmp
    test_data.append(gray.ravel())

    # os.path.splitext() separates the file name from its extension; the name
    # part (e.g. '4' for '4.bmp') is kept as the label, as a string
    label, ext_name = os.path.splitext(filename)
    test_target.append(label)

test_data = np.array(test_data)
test_target = np.array(test_target)

display(test_data.shape, test_target.shape)
4.bmp
5.bmp
6.bmp
7.bmp
8.bmp
9.bmp



(6, 784)



(6,)
test_target
array(['4', '5', '6', '7', '8', '9'], dtype='<U1')
test_data[0].shape
(784,)
test_data
array([[255., 255., 255., ..., 255., 255., 255.],
       [255., 255., 255., ..., 255., 255., 255.],
       [255., 255., 255., ..., 255., 255., 255.],
       [255., 255., 255., ..., 255., 255., 255.],
       [255., 255., 255., ..., 255., 255., 255.],
       [255., 255., 255., ..., 255., 255., 255.]])

预测测试数据集的结果

y_ = knn.predict(test_data)
y_
array([4, 4, 5, 1, 0, 5])

可视化的方式显示预测结果

plt.figure(figsize=(10, 12))
# Draw every test image on a grid of two rows by three columns, with the
# true label and the predicted label in the title
for position, (pixels, truth, guess) in enumerate(zip(test_data, test_target, y_), start=1):
    plt.subplot(2, 3, position)
    plt.imshow(pixels.reshape(28, 28), cmap='gray')
    plt.title(f'True:{truth} Pred:{guess}', size=20)
    plt.axis('off')  # hide the axes
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-sI8fL50H-1650167688376)(output_36_0.png)]

读取手写的数字

  • filename: digits/test/4_2.bmp
    在这里插入图片描述
    在这里插入图片描述
test_4_2 = plt.imread('digits/test/4_2.bmp')
test_4_2.shape
(28, 28, 4)
# The image has 4 channels (shape (28, 28, 4)). Average only the first three
# (r, g, b) so the fourth channel (presumably alpha - confirm) does not
# distort the grey value, then reshape to a single row because predict()
# takes a 2-D array with one sample per row.
y_2 = knn.predict(test_4_2[..., :3].mean(axis=-1).reshape(1, -1))
display(y_2)
array([1])
from pandas import Series
D:\yingyong\Anaconda3\lib\importlib\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 112 from C header, got 124 from PyObject
  return f(*args, **kwds)
Series(test_4_2.ravel()).unique()
array([255,   0], dtype=uint64)
Series(bmp1.ravel()).unique()
array([255, 204,  96,   2, 205, 207,  17,   3,  18, 201,  28,  16,  22,
       198, 249, 245, 195,  31,  53, 171, 133,  92, 159,  66,  88,  65,
       141,  27, 208, 176,   0,  87,  76, 243, 180, 134, 234,  12, 217,
        90,  47, 248,  77,  15, 184, 236, 227,  60, 192,  57,  59, 179,
         9, 143, 107, 170,  25, 230, 120,  69,  32, 124,  30, 110,  82,
       169,  93,   6, 109, 226, 199,  26,  40, 125,  56, 127, 114, 218],
      dtype=uint64)

从训练集的5000张数字图片中随机抽取样本进行测试

index = np.arange(5000)
index
array([   0,    1,    2, ..., 4997, 4998, 4999])
np.random.permutation(index)# 随机打乱
array([4660, 2679, 3913, ...,  525, 3990,  146])
# 测试50个
test_index = np.random.randint(5000,size=50)
data[test_index].shape
(50, 784)
y_ = knn.predict(data[test_index])
target[test_index]
array([3, 3, 7, 8, 4, 7, 4, 3, 7, 3, 6, 5, 9, 7, 1, 8, 8, 2, 1, 5, 6, 2,
       5, 7, 9, 3, 0, 3, 9, 2, 6, 0, 0, 6, 0, 8, 6, 7, 6, 2, 1, 0, 4, 9,
       9, 8, 5, 0, 4, 5])
y_
array([3, 3, 7, 8, 4, 7, 4, 3, 7, 3, 6, 5, 9, 7, 1, 8, 8, 2, 1, 5, 6, 2,
       5, 7, 9, 3, 0, 3, 9, 2, 6, 0, 0, 6, 0, 8, 6, 7, 6, 2, 1, 0, 4, 9,
       9, 8, 5, 0, 4, 5])
test_label = target[test_index]
test_label[test_label == y_].size / test_label.size
1.0
test_index = np.random.randint(5000,size=50)
y_ = knn.predict(data[test_index])
test_label = target[test_index]
test_label[test_label == y_].size / test_label.size
1.0
test_index = np.random.randint(1000,size=50)
y_ = knn.predict(data[test_index])
test_label = target[test_index]
test_label[test_label == y_].size / test_label.size
1.0
plt.figure(figsize=(30, 20))
# y_ was computed as knn.predict(data[test_index]), so the image and the true
# label shown next to each prediction must be taken from data/target at the
# same index. Indexing X_test/y_test here would pair every prediction with an
# unrelated sample.
for i, (index, predicted) in enumerate(zip(test_index, y_), start=1):
    # 5 rows x 10 columns hold the 50 sampled images
    plt.subplot(5, 10, i)

    plt.imshow(data[index].reshape(28, 28), cmap='gray')
    plt.axis('off')
    plt.title(f'T:{target[index]} P:{predicted}', size=30)

plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-w3lOflJn-1650167688380)(output_54_0.png)]

通过mean()方式实现降维效果
a = np.array(
    [
        [
            [3,4,5],
            [5,6,7],
            [3,6,9]
        ],
        [
            [1,2,5],
            [3,5,7],
            [6,8,9]
        ]
    ]
)
a.shape   # 两个,三行三列
(2, 3, 3)
# 2 --->(3+1)/ 2       3 --->(4+2)/ 2       5 --->(5+5)/ 2    
# 4 --->(5+3)/ 2       5.5 --->(6+5)/ 2     7 --->(7+7)/ 2
# 4.5 --->(3+6)/ 2     7 --->(6+8)/ 2       9--->(9+9)/ 2
a.mean(axis=0)
array([[2. , 3. , 5. ],
       [4. , 5.5, 7. ],
       [4.5, 7. , 9. ]])
# 3.66666667 --->(3+5+3)/ 3    5.33333333  --->(4+6+6)/ 3    7.  --->(5+7+9)/ 3
# 3.33333333 --->(1+3+6)/ 3    5.  --->(2+5+8)/ 3    7.  --->(5+7+9)/ 3
a.mean(axis=1)
array([[3.66666667, 5.33333333, 7.        ],
       [3.33333333, 5.        , 7.        ]])
# Mean over the last axis: every innermost row of 3 values collapses to one number
# 4. --->(3+4+5)/ 3             6.  --->(5+6+7)/ 3      6.  --->(3+6+9)/ 3 
# 2.66666667 --->(1+2+5)/ 3     5.--->(3+5+7)/ 3        7.66666667 --->(6+8+9)/ 3 

a.mean(axis=-1)
array([[4.        , 6.        , 6.        ],
       [2.66666667, 5.        , 7.66666667]])
  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
鸢尾花数据集-数据分析

from sklearn import datasets
from sklearn import model_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the iris dataset
lris_df = datasets.load_iris()
# Input features
lris_df.data
# Target labels
lris_df.target
data_DF = pd.DataFrame(lris_df.data)
target_DF = pd.DataFrame(lris_df.target)
# Concatenate the two DataFrames column-wise
join_DF = pd.concat([data_DF, target_DF], axis=1)
# Rename the columns
join_DF.columns = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']

# Look at the data distribution (feature column 0 against column 2)
x_axis = lris_df.data[:, 0]
y_axis = lris_df.data[:, 2]
plt.scatter(x_axis, y_axis, c=lris_df.target)
plt.show()

# Histograms of the input features
join_DF.iloc[:, 0:4].hist()
plt.show()

# Box plots
join_DF.iloc[:, 0:4].plot(kind='box', subplots=True, layout=(2, 2), sharex=False, sharey=False)
plt.show()

ft_DF = join_DF.iloc[:, 0:4]
# Correlation coefficients
ft_DF.corr()
x_val = ft_DF['petal-width']
y_val = ft_DF['petal-length']
plt.scatter(x_val, y_val)

data_array = join_DF.values

# Split the dataset
X = data_array[:, 0:4]
Y = data_array[:, 4]
validation_size = 0.2
seed = 6
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# KNN classification (the original fitted twice in a row; one fit is enough)
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
print(knn.score(X_validation, Y_validation))

# K-fold cross-validation
scores = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
print(scores)

# random_state only takes effect when shuffle=True; current scikit-learn
# raises ValueError if random_state is set while shuffle is left at False.
# NOTE(review): dfold is not used by the code below, which passes cv=10.
dfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)

X = lris_df.data
Y = lris_df.target
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)  # try a different K each round
    scores = model_selection.cross_val_score(knn, X, Y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

plt.plot(k_range, k_scores)
plt.xlabel('value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

今晚务必早点睡

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值