import numpy as np
import operator
# kNN (k-nearest-neighbours) classifier
def classify0(Inx, DataSet, labels, k):
    """Classify one sample by majority vote among its nearest training samples.

    Parameters
    ----------
    Inx : array-like
        Feature vector of the sample to classify. It is broadcast against
        every row of ``DataSet``.
    DataSet : array-like, two-dimensional
        Training samples, one per row.
    labels : sequence
        ``labels[i]`` is the class of row ``i`` of ``DataSet``. Labels are
        used as dictionary keys, so they must be hashable.
    k : int
        Number of nearest neighbours that vote. Values larger than the number
        of training samples are clamped to that number.

    Returns
    -------
    The label that received the most votes. When several labels tie, the one
    whose first vote came from the closest neighbour wins.

    Raises
    ------
    IndexError
        If no neighbour can vote (``k`` < 1 or an empty training set).
    """
    samples = np.asarray(DataSet)
    sample_count = samples.shape[0]

    # Broadcasting subtracts Inx from every row without building a tiled copy.
    deltas = samples - np.asarray(Inx)
    distances = np.sum(deltas ** 2, axis=1) ** 0.5
    nearest_first = distances.argsort()

    # Clamp k so that a training set smaller than k lets every sample vote
    # instead of indexing past the end of the sorted index array.
    votes = {}
    for rank in range(min(k, sample_count)):
        neighbour_label = labels[nearest_first[rank]]
        votes[neighbour_label] = votes.get(neighbour_label, 0) + 1

    # sorted() is stable, so among equal vote counts the label that was
    # inserted first (i.e. seen at the smallest distance) stays in front.
    ranking = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]
# Convert a text file of digit bitmaps into numeric data
def text2data(path):
    """Load labelled 32x32 digit bitmaps from a text file.

    The file is a sequence of records. Each record consists of 32 lines that
    each start with 32 digit characters (the bitmap rows), followed by one
    line holding the integer class label. Blank lines are ignored, so a
    trailing newline at the end of the file is harmless.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    labels : list of int
        Class label of every record, in file order.
    dataset : numpy.ndarray, shape (n_records, 1024), float
        One flattened bitmap per row; the pixel in bitmap row ``i`` and
        column ``j`` is stored at column ``32 * i + j``.

    Raises
    ------
    ValueError
        If the file ends in the middle of a record, a bitmap row has fewer
        than 32 characters, or a character/label cannot be parsed as an int.
    """
    side = 32
    record_len = side + 1  # 32 bitmap rows plus one label line

    # The context manager guarantees the handle is closed even on errors.
    with open(path) as handle:
        lines = [text.strip() for text in handle if text.strip()]

    if len(lines) % record_len:
        raise ValueError(
            'incomplete record in %r: expected a multiple of %d non-blank '
            'lines, found %d' % (path, record_len, len(lines))
        )

    labels = []
    vectors = []
    for start in range(0, len(lines), record_len):
        pixels = []
        for row_text in lines[start:start + side]:
            if len(row_text) < side:
                raise ValueError(
                    'bitmap row shorter than %d characters: %r'
                    % (side, row_text)
                )
            # Only the first 32 characters of a row carry pixel values.
            pixels.extend(int(ch) for ch in row_text[:side])
        vectors.append(pixels)
        labels.append(int(lines[start + side]))

    # reshape keeps the (0, 1024) shape when the file holds no records.
    dataset = np.array(vectors, dtype=float).reshape(len(vectors), side * side)
    return labels, dataset
# Evaluation routine. path1: training-set path, path2: test-set path
def Test_optdigit(path1, path2, k=20):
    """Classify every test sample with kNN and return the error rate.

    Both files are loaded with ``text2data``; each test sample is classified
    against the whole training set with ``classify0`` and the prediction is
    printed next to the true label.

    Parameters
    ----------
    path1 : str
        Path of the training-set file.
    path2 : str
        Path of the test-set file.
    k : int, optional
        Number of neighbours passed to ``classify0`` (default 20).

    Returns
    -------
    float
        Fraction of test samples whose predicted label differs from the true
        label.

    Raises
    ------
    ZeroDivisionError
        If the test set contains no samples.
    """
    tra_labels, tra_dataset = text2data(path1)
    test_labels, test_dataset = text2data(path2)
    test_number = test_dataset.shape[0]

    error_number = 0
    # Iterating the 2-D array yields one sample (row) at a time.
    for sample, expected in zip(test_dataset, test_labels):
        predicted = classify0(sample, tra_dataset, tra_labels, k)
        print('the classifier came back answer is %d and the right answer is %d' % (predicted, expected))
        if predicted != expected:
            error_number += 1

    return error_number / test_number
# Run the evaluation on the author's local copies of the training and test
# sets and print the resulting error rate.
print(
    Test_optdigit(
        'F:/MachineLearning/data/optdigit.txt',
        'F:/MachineLearning/data/optdigit_test.txt',
    )
)
这是我学习完《机器学习实战》这本书后所写的用来识别0到9手写数字的代码。数据文件的扩展名为txt,是我从http://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
网站上下载的。数据格式如下,这里只例举两个样本:
00000000000000000011000000000000
00000000000000000111111100000000
00000000000000000111111100000000
00000000000000000111111100000000
00000000000000000111111100000000
00000000000000000111111100000000
00000000000000000011111110000000
00000000000000000111111110000000
00000000000000000111111100000000
00000000000000001111111000000000
00000000000000001111111000000000
00000000000000001111111000000000
00000000000011111111111000000000
00000000000111111111111000000000
00000000001111111111111000000000
00000000011111111111111000000000
00000001111111111111110000000000
00000011111111111111110000000000
00000011111111111111110000000000
00000011111111111111110000000000
00000001111111111111111000000000
00000000000000011111111000000000
00000000000000011111111000000000
00000000000000011111110000000000
00000000000000001111111000000000
00000000000000011111111000000000
00000000000000011111111000000000
00000000000000000111111100000000
00000000000000000111111110000000
00000000000000000111111111000000
00000000000000000001111110000000
00000000000000000001111100000000
1
00000000000000000000111100000000
00000000000000000011111110000000
00000000000000001111111111000000
00000000000000011111111111100000
00000000000001111111011111111000
00000000000011111100000111111100
00000000000111111100000111111100
00000000001111100000000111111000
00000000001111000000000111111000
00000000001111000000000111111000
00000000111100000000001111110000
00000000011100000000001111110000
00000001111000000000111111100000
00000001111000000000111111100000
00000001111000000001111111100000
00000001111000111111111111000000
00000001111111111111111111000000
00000011111111111111011111000000
00000011111111110000111100000000
00000001111111100000111100000000
00000000000000000000111000000000
00000000000000000001111000000000
00000000000000000001110000000000
00000000000000000011100000000000
00000000000000000011100000000000
00000000000000000011100000000000
00000000000000000111100000000000
00000000000000000111100000000000
00000000000000000111000000000000
00000000000000001111000000000000
00000000000000001111100000000000
00000000000000011100000000000000
9
首先我们在classify0函数下写出kNN算法:需要训练集数据DataSet,需要预测的数据Inx,训练集的标记labels,及选定的k值。DataSet的格式为np.array([[1,2,3,4],[1,1,1,1]]),labels格式为列表,即每个元素按顺序对应训练集中每个样本的标记。
然后text2data是把如上形式的数据改成如np.array([...])的形式。因为以上每个样本为32行,每行有32个0或1的数字,所以一条样本由1*1024的numpy数组表示。函数返回每个样本的标记labels和由所有样本组成的dataset。
最后我们以optdigit.txt为训练集,以optdigit_test.txt为测试集,测试测试集中每个样本的分类情况并计算错误率。
得到结果如下,这里只例举10条输出,最后一行是在整个测试集上的错误率:
the classifier came back answer is 3 and the right answer is 8
the classifier came back answer is 1 and the right answer is 1
the classifier came back answer is 0 and the right answer is 0
the classifier came back answer is 8 and the right answer is 8
the classifier came back answer is 6 and the right answer is 6
the classifier came back answer is 7 and the right answer is 9
the classifier came back answer is 7 and the right answer is 9
the classifier came back answer is 0 and the right answer is 0
the classifier came back answer is 1 and the right answer is 3
the classifier came back answer is 9 and the right answer is 9
...
0.04559915164369035