# 实例之数字识别
import matplotlib.pyplot as plt
import scipy.ndimage as ndimage
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
# Step 1: read the training images from disk.
# Optional sanity check on a single sample (the 10th image of digit 3):
# img_arr = plt.imread('C:/Users/lenovo/Desktop/python_use/digist/3/3_10.bmp')
# print(img_arr.shape)
# plt.imshow(img_arr)
# plt.show()
feature = []  # pixel array of every image (10 digits x 500 images each)
target = []  # digit label of every image, in the same order as `feature`
# Every digit lives in its own folder, and each file is named
# "<digit>_<index>.bmp" with index running 1..500, hence the nested loop.
for digit in range(10):  # outer folder name
    for index in range(1, 501):  # numeric part of the file name
        img_path = f'C:/Users/lenovo/Desktop/python_use/digist/{digit}/{digit}_{index}.bmp'
        img_arr = plt.imread(img_path)
        feature.append(img_arr)
        target.append(digit)

# The estimator needs arrays, not Python lists.
feature = np.array(feature)
target = np.array(target)
# Inspect the dimensions: the model is trained on a 2-D (samples, features) matrix.
print(feature.shape)
print(target.shape)
# Flatten each image into one row. The sample count is taken from the array
# itself and -1 lets NumPy infer the flattened width, so this line no longer
# has to be kept in sync with the loop bounds or the image size by hand.
feature = feature.reshape(feature.shape[0], -1)
print(feature.shape)
# The samples are images, so no hand-crafted features are extracted; the raw
# pixel values are used as the feature vector. 80% of the data is kept for
# training, with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    feature, target, train_size=0.8, random_state=2525
)

# Hyper-parameter search: for every candidate k (1, 4, 7, ..., 97) take the
# mean accuracy of a 5-fold cross-validation on the training split, then keep
# the k with the highest mean score.
ks = np.arange(1, 100, 3)
scores = np.array([
    cross_val_score(
        KNeighborsClassifier(n_neighbors=candidate_k), x_train, y_train, cv=5
    ).mean()
    for candidate_k in ks
])
best_k = ks[scores.argmax()]
print('模型的最优参数:' + str(best_k))

# Optional: plot the learning curve (score against k).
# plt.plot(ks, scores)
# plt.xlabel('k')
# plt.ylabel('score')
# plt.show()
# Optional: plug the best parameter into a model and compare the predictions
# on the held-out split with the true labels.
# knn = KNeighborsClassifier(n_neighbors=1)
# knn.fit(x_train, y_train)
# print('模型识别的结果:', knn.predict(x_test))
# print('真实的结果:', y_test)
def _shrink_to_28x28(region):
    """Resample a 2-D image crop to 28x28 pixels.

    ``ndimage.zoom`` takes a per-axis scale factor equal to
    target size / current size (for example, going from (58, 50) to (5, 5)
    needs ``zoom=(5/58, 5/50)``). The factors are derived from the crop's own
    shape so that changing the crop window cannot leave stale constants behind.
    """
    rows, cols = region.shape
    return ndimage.zoom(region, zoom=(28 / rows, 28 / cols))


# Try the classifier on a photo that is not part of the data set.
test_n = plt.imread('C:/Users/lenovo/Desktop/python_use/123.jpg')
# plt.imshow(test_n)
# plt.show()
print(test_n.shape)
# Crop the regions of the photo that contain the digits 0 and 8.
zero_test = test_n[5:140, 180:290]
eight_test = test_n[450:580, 180:290]
# plt.imshow(zero_test)
# plt.show()
# plt.imshow(eight_test)
# plt.show()
# The crops do not have the 28x28 shape of the training images yet.
print(zero_test.shape, eight_test.shape)
# Shrink both crops to the training resolution.
zero_test_zoom = _shrink_to_28x28(zero_test)
eight_test_zoom = _shrink_to_28x28(eight_test)
print(zero_test_zoom.shape, eight_test_zoom.shape)
# plt.imshow(X=zero_test_zoom)
# plt.show()
# plt.imshow(X=eight_test_zoom)
# plt.show()
knn_test = KNeighborsClassifier(n_neighbors=best_k)
# Fit on the training split: that is the data best_k was tuned on. The
# original fitted on x_test/y_test, i.e. only the 20% hold-out portion.
knn_test.fit(X=x_train, y=y_train)
# A single sample must be passed as one row of 784 pixel values.
n1 = knn_test.predict(zero_test_zoom.reshape(1, 784))
print(n1)
n2 = knn_test.predict(eight_test_zoom.reshape(1, 784))
print(n1, n2)
# 结果是:
# (5000, 28, 28)
# (5000,)
# (5000, 784)
# 模型的最优参数:1
# (604, 500)
# (135, 110) (130, 110)
# (28, 28) (28, 28)
# [0]
# [0] [8]
# Process finished with exit code 0