# 1、kNN 算法的 Python 实现
# 2、参考《Machine Learning in Action》
import numpy as np
from os import listdir
class knn():
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    Args:
        x: training samples, one sample per row (a NumPy array or anything
            ``np.asarray`` can convert to a 2-D numeric array).
        y: sequence of labels, ``y[i]`` being the label of row ``i`` of ``x``.
        k: number of nearest neighbours that take part in the vote.
    """

    def __init__(self, x, y, k=3):
        self.k = k
        self.x = x
        self.y = y

    def classify(self, inx):
        """Return the most frequent label among the k samples nearest to ``inx``.

        Args:
            inx: a single sample with as many features as a training row.

        Returns:
            The winning label, taken from ``self.y``.
        """
        # Broadcasting subtracts inx from every training row in one step.
        diff = np.asarray(self.x) - np.asarray(inx)
        distance = ((diff ** 2).sum(axis=1)) ** 0.5
        sorted_distance_index = distance.argsort()

        # Never ask for more voters than there are training samples; indexing
        # past the end of the sorted index used to raise IndexError.
        voters = min(self.k, len(sorted_distance_index))

        classcount = {}
        for i in range(voters):
            votelabel = self.y[sorted_distance_index[i]]
            classcount[votelabel] = classcount.get(votelabel, 0) + 1

        # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
        sortedclasscount = sorted(classcount.items(),
                                  key=lambda item: item[1], reverse=True)
        return sortedclasscount[0][0]

    def accuracy(self, test_x, test_y):
        """Classify every test sample, print the accuracy and return it.

        Args:
            test_x: test samples, one per row.
            test_y: expected labels, aligned with ``test_x``.

        Returns:
            The fraction of correctly classified samples as a float;
            ``0.0`` when ``test_y`` is empty.
        """
        total = len(test_y)
        num = 0
        for i in range(total):
            if self.classify(test_x[i]) == test_y[i]:
                num += 1
        # Guard the division: an empty test set used to raise ZeroDivisionError.
        ratio = 1.0 * num / total if total else 0.0
        # Parenthesised single-argument print works on Python 2 and 3 alike.
        print("testdata accuracy is %f " % ratio)
        return ratio
def createDataSet():
    """Return a four-point toy data set and its class labels.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 NumPy array of points
        and ``labels`` is the list of their class names, in the same order.
    """
    points = (
        (1.0, 1.1),
        (1.0, 1.0),
        (0.0, 0.0),
        (0.0, 0.1),
    )
    names = list('AABB')
    return np.array(points), names
def loaddata(path):
    """Load a whitespace-separated table of three numeric columns and a label.

    Args:
        path: file to read; every row holds three numbers followed by a
            label of at most 20 bytes.

    Returns:
        A ``(x, y)`` pair: ``x`` is a float64 array with one row per line and
        one column per numeric field, ``y`` is the list of labels exactly as
        ``np.loadtxt`` parsed them for the ``S20`` field.
    """
    data = np.loadtxt(path, dtype={"names": ("x1", "x2", "x3", "labels"),
                                   "formats": ("f4", "f4", "f4", "S20")})
    # loadtxt squeezes a one-line file into a 0-d record, on which len() and
    # row indexing fail; restore the row axis so every file is handled alike.
    data = np.atleast_1d(data)

    names = data.dtype.names
    # Pull each numeric field out as a whole column instead of copying the
    # table cell by cell; the last field is the label.
    x = np.column_stack([data[name] for name in names[:-1]]).astype(np.float64)
    y = list(data[names[-1]])
    return x, y
def autoNorm(x):
    """Min-max scale every column of ``x`` into the range [0, 1].

    Args:
        x: 2-D NumPy array, one sample per row.

    Returns:
        A float array of the same shape where each column has been shifted by
        its minimum and divided by its range. A column whose values are all
        equal becomes all zeros.
    """
    # Local names avoid shadowing the builtins max() and min().
    col_max = x.max(axis=0)
    col_min = x.min(axis=0)
    ranges = 1.0 * (col_max - col_min)
    # A constant column has range 0; dividing by it would fill the column
    # with NaN. Dividing by 1 instead leaves the (already zero) offsets as 0.
    ranges = np.where(ranges == 0, 1.0, ranges)
    normdata = (x - col_min) / ranges
    return normdata
def img2vector(filename):
    """Read a 32x32 grid of digit characters into a (1, 1024) row vector.

    Args:
        filename: text file whose first 32 lines each start with 32 digit
            characters.

    Returns:
        A NumPy array of shape (1, 1024); the character at line ``i``,
        column ``j`` is stored at index ``32 * i + j``.
    """
    returnVect = np.zeros((1, 1024))
    # The context manager closes the file on every path; the handle used to
    # be left open for each image that was read.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def _load_digit_dir(directory):
    """Load every image file in ``directory`` as one feature row plus a label."""
    # Only `listdir` is imported at file level, so bring os.path in locally.
    import os.path

    file_names = listdir(directory)
    features = np.zeros((len(file_names), 1024))
    labels = []
    for row, file_name in enumerate(file_names):
        # The label is the integer before the first '_' of the name, after
        # dropping everything from the first '.' onwards.
        stem = file_name.split('.')[0]
        labels.append(int(stem.split('_')[0]))
        # os.path.join uses the platform separator instead of a literal '\\'.
        features[row, :] = img2vector(os.path.join(directory, file_name))
    return features, labels


def load_handwritingdata(
        train_dir='D:\\SelfLearning\\Machine Learning\\MachineLearningInAction\\machinelearninginaction\\Ch02\\digits\\trainingDigits',
        test_dir='D:\\SelfLearning\\Machine Learning\\MachineLearningInAction\\machinelearninginaction\\Ch02\\digits\\testDigits'):
    """Load the handwritten-digit training and test sets.

    Args:
        train_dir: directory holding the training images; defaults to the
            location this script was written against.
        test_dir: directory holding the test images; same default origin.

    Returns:
        ``(train_x, train_y, test_x, test_y)``: each ``*_x`` is an array with
        one 1024-value row per file, each ``*_y`` the list of integer labels
        parsed from the file names, in ``listdir`` order.
    """
    train_x, train_y = _load_digit_dir(train_dir)
    test_x, test_y = _load_digit_dir(test_dir)
    return train_x, train_y, test_x, test_y
# Dating data set: load the samples, scale every feature column with autoNorm,
# then train on the first 70% of the rows and evaluate on the remaining rows.
x,y=loaddata("D:\\SelfLearning\\Machine Learning\\MachineLearningInAction\\machinelearninginaction\\Ch02\\datingTestSet.txt")
x=autoNorm(x)
# Row index at which the data is split into training and test parts.
num_train_vectors=int(len(x)*0.7)
# k is left at the class default.
Knn=knn(x[:num_train_vectors],y[:num_train_vectors])
# accuracy() prints the fraction of correctly classified test rows.
Knn.accuracy(x[num_train_vectors:],y[num_train_vectors:])
# Handwriting test: train on the training-digit files and evaluate on the
# test-digit files returned by load_handwritingdata().
train_x,train_y,test_x,test_y=load_handwritingdata()
handwrittingtest=knn(train_x,train_y)
handwrittingtest.accuracy(test_x,test_y)
# 实验结果：
# testdata accuracy is 0.963333  （约会数据集）
# testdata accuracy is 0.988372  （手写数字数据集）