KNN机器学习实战(代码有修改)
K近邻分类算法
优点:精度高、对异常值不敏感、无数据输入假定
缺点:计算复杂度高、空间复杂度高
适用数据范围:数值型和标称型
归一化数值
def Normalize(dataset):
    """Min-max scale every column of ``dataset`` into the range [0, 1].

    Args:
        dataset: Array-like object exposing ``min(0)`` / ``max(0)`` (for
            example a 2-D NumPy array with one sample per row).

    Returns:
        A tuple ``(norm_set, ranges, minVals)`` where ``norm_set`` is
        ``(dataset - minVals) / ranges``, ``ranges`` is the per-column
        ``max - min`` and ``minVals`` is the per-column minimum.  ``ranges``
        is returned exactly as computed (it may contain zeros).
    """
    # Imported locally so the function does not depend on import order.
    import numpy as np

    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # A constant column has a range of 0; dividing by it would fill the column
    # with NaN.  Divide by 1 there instead so the column normalises to 0.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    norm_set = (dataset - minVals) / safe_ranges
    return norm_set, ranges, minVals
代码 手写数字识别
import numpy as np
import matplotlib.pyplot as plt
import operator
import os
#inX 用于分类的输入向量
#dataset 训练样本
#labels 标签
#k 选择最近邻居的数目
def classify0(inX, dataset, labels, k):
    """Return the majority label among the ``k`` samples closest to ``inX``.

    Args:
        inX: Query vector to classify.
        dataset: Training samples, one per row (``dataset.shape[0]`` rows).
        labels: Label of each training row, indexed like ``dataset``.
        k: Number of nearest neighbours that vote (an odd value avoids ties).

    Returns:
        The label with the most votes.  On a tie, the label that was met
        first while walking the neighbours from nearest to farthest wins,
        because the descending sort is stable.
    """
    sample_count = dataset.shape[0]
    # Repeat the query once per training row so the subtraction is row-wise.
    offsets = np.tile(inX, (sample_count, 1)) - dataset
    # Euclidean distance: square, sum across each row, square root.
    distances = (offsets ** 2).sum(axis=1) ** 0.5
    nearest_first = distances.argsort()

    votes = {}
    for rank in range(k):
        neighbor_label = labels[nearest_first[rank]]
        if neighbor_label in votes:
            votes[neighbor_label] += 1
        else:
            votes[neighbor_label] = 1

    # Order the (label, count) pairs by count, highest first.
    ranked = list(votes.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]
##文本读取 可删除 与手写 无关
def file2matix(filename):
    """Parse a tab-separated text file into a feature matrix and label list.

    Each non-blank line must hold at least three numeric fields followed by a
    label in the last field.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classlabelv)``: an ``(n, 3)`` float array built
        from the first three fields of every line, and the list of last
        fields (kept as strings) in the same order.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]  # drop newline / outer blanks
    # Skip empty lines (e.g. a trailing newline) instead of crashing on them.
    rows = [row for row in stripped if row]

    returnMat = np.zeros((len(rows), 3))
    classlabelv = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classlabelv.append(listFromLine[-1])
    return returnMat, classlabelv
# Load the dating data set into module-level names.
# NOTE(review): indentation is lost in this paste, but this appears to be a
# top-level statement, so it would run on import as well as on execution and
# needs 'datingTestSet.txt' in the working directory, even when only the
# handwritten-digit code is used. Confirm whether it is still wanted.
datingMat, datalabels = file2matix('datingTestSet.txt')
# Disabled debug output of the loaded matrix and labels.
#print(datingMat,datalabels)
# Disabled scatter plot of feature columns 1 and 2, sized and coloured by label.
# NOTE(review): file2matix returns the labels as strings, so the arithmetic on
# them presumably needs numeric labels before this can be re-enabled.
#plt.scatter(datingMat[:,1],datingMat[:,2],15.0*np.array(datalabels),15.0*np.array(datalabels))
#plt.show()
def datingClassTest():
    """Hold out the first 10% of the dating data and report the error rate.

    The data set is loaded with ``file2matix``, scaled with ``Normalize``, and
    each of the first ``numTest`` rows is classified with ``classify0``
    against the remaining rows.  Every prediction and the final error rate
    are printed; nothing is returned.
    """
    Ratio = 0.1  # fraction of the rows used as the test set
    # Same file name (and letter case) as the module-level load, so the lookup
    # also works on case-sensitive file systems.
    dateMat, dataLabels = file2matix('datingTestSet.txt')
    normMat = Normalize(dateMat)[0]
    numTest = int(Ratio * normMat.shape[0])
    erro = 0.0
    for i in range(numTest):
        # NOTE(review): k is 2 here although classify0 recommends an odd k;
        # with the tie rule of classify0 a 1-1 split goes to the nearest row.
        classResult = classify0(normMat[i], normMat[numTest:], dataLabels[numTest:], 2)
        print('the came back with: %s, the real answer is %s' % (classResult, dataLabels[i]))
        if classResult != dataLabels[i]:
            erro += 1.0
    # Fewer than 10 rows gives an empty test set; report 0 instead of dividing
    # by zero.
    error_rate = erro / float(numTest) if numTest else 0.0
    print("total error is %f" % error_rate)
def classPerson():
    """Ask for one person's three features and print the predicted category.

    Reads three numbers from standard input, loads and normalises
    'datingTestSet2.txt', scales the query with the same minimum and range
    values, classifies it with ``classify0`` (k=5) and prints the matching
    entry of ``results``.  The predicted label is converted with ``int`` and
    used as a 1-based index into ``results``.
    """
    results = ['not at all','in small dose','in large doess']
    # NOTE(review): the prompt texts below look garbled; they are runtime
    # strings and are left unchanged here.
    percents = float(input('percentage of time spent playing video games'))
    ffMlies = float(input("liters filter miles earned per year"))
    iceCrems = float(input("ice cream consumed per year"))
    dataM, datals = file2matix('datingTestSet2.txt')
    normM, ranges, minVlas = Normalize(dataM)
    # NOTE(review): the feature order must match the column order of the data
    # file - confirm against the data set.
    test_person = np.array([percents, ffMlies, iceCrems])
    # The training rows are min-max scaled, so the query must be scaled the
    # same way; otherwise distances compare raw values with [0, 1] values.
    # Constant columns (range 0) are divided by 1 to avoid NaN.
    safe_ranges = np.where(ranges == 0, 1, ranges)
    scaled_person = (test_person - minVlas) / safe_ranges
    Results_P = classify0(scaled_person, normM, datals, 5)
    print("you will probably like this person:", results[int(Results_P) - 1])
##以下为手写knn数字识别 读取文本数据代码
def img2vector(filename, size=32):
    """Flatten a text image of digit characters into a single row vector.

    The file is expected to hold at least ``size`` lines with at least
    ``size`` digit characters each; only the first ``size`` characters of the
    first ``size`` lines are read.

    Args:
        filename: Path of the text image.
        size: Number of rows and columns to read (default 32).

    Returns:
        A ``(1, size * size)`` float array in row-major order, i.e. the
        character at row ``i``, column ``j`` lands at index ``size * i + j``.

    Raises:
        IndexError: If a line is shorter than ``size`` characters.
        ValueError: If a character is not a digit.
    """
    returnVector = np.zeros((1, size * size))
    # The context manager closes the file; the original left it open, which
    # matters when thousands of image files are read in a loop.
    with open(filename) as fr:
        for i in range(size):
            lineStr = fr.readline()
            for j in range(size):
                returnVector[0, size * i + j] = int(lineStr[j])
    return returnVector
def Handclassy():
    """Evaluate the k-NN classifier (k=3) on the handwritten-digit files.

    Every file in the 'trainingDigits' folder is converted with
    ``img2vector`` and used as a training row; every file in 'testDigits' is
    classified with ``classify0``.  The digit label is the part of the file
    name before the first underscore.  Each prediction, the number of errors
    and the error rate are printed; nothing is returned.
    """
    foldername = 'trainingDigits'
    filenames = os.listdir(foldername)
    triandata = np.zeros((len(filenames), 1024))
    trian_label = []
    for i, name in enumerate(filenames):
        triandata[i] = img2vector(os.path.join(foldername, name))
        # Take the label from the bare file name.  Parsing the joined path and
        # splitting on a backslash only works on Windows; elsewhere the folder
        # name stays in the label and no prediction can ever match.
        trian_label.append(name.split('.')[0].split('_')[0])

    foldername2 = 'testDigits'
    filename2s = os.listdir(foldername2)
    error = 0.0
    for name2 in filename2s:
        test_vec = img2vector(os.path.join(foldername2, name2))
        testlabel = name2.split('.')[0].split('_')[0]
        HclassResult = classify0(test_vec, triandata, trian_label, 3)
        print("the result is %s,actual is %s" % (HclassResult, testlabel))
        if HclassResult != testlabel:
            error += 1.0

    print("Totol numer of error is%f" % (error))
    # An empty test folder would otherwise divide by zero.
    total = len(filename2s)
    error_rate = error / float(total) if total else 0.0
    # The original line used typographic quotes and had no separator, so it
    # could not be parsed at all.
    print("error:%s" % error_rate)
# Run the handwritten-digit evaluation only when the file is executed as a
# script; the call must be indented to belong to the guard.
if __name__ == '__main__':
    Handclassy()
结果
Totol numer of error is10.000000
error:0.010570824524312896