**KNN算法(鸢尾花)**
话不多说了,上代码啦!
在这里插入代码片
#knn算法复习
from numpy import *
import operator
from os import listdir
#使用KNN算法分类
'''def file2matrix(filename):
with open(filename)as file:
array0Lines=file.readlines()
number0fLines=len(array0Lines)
returnMat=zeros((number0fLines,3))
classLabelVector=[]
index=0
for line in array0Lines:
line=line.split()
returnMat[index,:]=line[0:3]
classLabelVector.append(int(line[-1]))
index+=1
print(type(returnMat))
return returnMat,classLabelVector
'''
#鸢尾花数据集分类
def file2matrix(filename):
    """Load a comma-separated, iris-style data file.

    Every non-blank line is expected to hold four numeric feature values
    followed by a class label, e.g. ``5.1,3.5,1.4,0.2,Iris-setosa``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple. ``returnMat`` is an
        ``(n, 4)`` float array holding the first four fields of each record;
        ``classLabelVector`` is a list with the last field of each record
        (kept as a string), in file order.

    Raises:
        ValueError: If a record has fewer than five fields or a feature
            value is not numeric.
    """
    with open(filename) as file:
        # Blank lines carry no record; without this filter a trailing empty
        # line would reach the float conversion below and crash the load.
        records = [line.strip().split(",") for line in file if line.strip()]

    returnMat = zeros((len(records), 4))
    classLabelVector = []
    for index, fields in enumerate(records):
        if len(fields) < 5:
            # A shorter record would otherwise be broadcast across the row
            # or have a feature mistaken for its label.
            raise ValueError(
                "record %d of %s has %d fields, expected 4 features and a label"
                % (index + 1, filename, len(fields))
            )
        returnMat[index, :] = [float(value) for value in fields[0:4]]
        classLabelVector.append(fields[-1])
    return returnMat, classLabelVector
def classify(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distances are Euclidean. Among labels with the same number of votes, the
    one that was encountered first (i.e. belongs to the closer neighbour)
    wins, because the vote sort is stable.

    Args:
        inX: Feature vector to classify; a sequence of length ``d`` or an
            array of shape ``(d,)`` / ``(1, d)``.
        dataSet: Training samples as an ``(n, d)`` array.
        labels: Sequence of ``n`` labels, ``labels[i]`` belonging to
            ``dataSet[i]``.
        k: Number of neighbours that vote. Values larger than ``n`` are
            reduced to ``n``.

    Returns:
        The label with the most votes among the k nearest neighbours.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one training sample")
    if k < 1:
        raise ValueError("k must be a positive integer")
    if k > dataSetSize:
        # Asking for more neighbours than exist means "let everyone vote".
        k = dataSetSize

    # Broadcasting subtracts every training row from inX; no explicit tiling
    # of inX is required.
    diffMat = inX - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    nearestFirst = distances.argsort()  # indices ordered by ascending distance

    classCount = {}  # label -> number of votes among the k nearest
    for sampleIndex in nearestFirst[:k]:
        voteLabel = labels[sampleIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    ranked = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into the range [0, 1].

    Args:
        dataSet: Two-dimensional array with one sample per row and one
            feature per column.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple. ``normDataSet`` has the
        same shape as ``dataSet`` and holds ``(value - min) / (max - min)``
        per column; ``ranges`` is the per-column ``max - min`` and
        ``minVals`` the per-column minimum. A column whose values are all
        equal has range 0 and is mapped to 0 instead of NaN.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Divide by 1 where the range is 0 so constant columns do not produce
    # NaN; the returned ``ranges`` still reports the true (zero) range.
    divisors = ranges.copy()
    divisors[divisors == 0] = 1

    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
def _loadDatingData(filename):
    """Parse a whitespace-separated file of three features and an integer label per line."""
    with open(filename) as file:
        records = [line.split() for line in file if line.strip()]
    featureMat = zeros((len(records), 3))
    labelVector = []
    for index, fields in enumerate(records):
        featureMat[index, :] = [float(value) for value in fields[0:3]]
        labelVector.append(int(fields[-1]))
    return featureMat, labelVector


def classifyPerson():
    """Ask for three attributes of a person and print the predicted rating.

    Reads three numbers from standard input, scales them with the ranges of
    the dating data set and classifies them with a 3-nearest-neighbour vote.
    The dating file is parsed by ``_loadDatingData`` rather than by
    ``file2matrix``, which reads comma-separated records with four features
    and string labels and therefore cannot load this file.

    Returns:
        None. The predicted rating is printed.
    """
    # NOTE(review): indexing below assumes the file's labels are the
    # integers 1..3 in this order -- confirm against datingTestSet2.txt.
    resultList = ['不喜欢', '一般般', '超喜欢']
    percentTats = float(input("玩游戏所耗的时间百分比?:"))
    ffMiles = float(input('每周玩获得飞行常客里程数?:'))
    iceCream = float(input('每周消费冰淇凌的公升数?:'))

    datingDataMat, datingLabels = _loadDatingData(r"D:\learnpy\机器学习算法\datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # The query must be scaled exactly like the training data.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify((inArr - minVals) / ranges, normMat, datingLabels, 3)
    print('你喜欢人的程度:', resultList[classifierResult - 1])
#classifyPerson()
def test(filename=r"D:\learnpy\机器学习算法\iris.data", hoRatio=0.60, k=3, seed=0):
    """Hold-out evaluation of the k-NN classifier on an iris-style data file.

    The records are shuffled, the first ``hoRatio`` share is used as test
    samples and the remainder as training samples. Both sides use the
    min-max normalised features. Each prediction and the overall error rate
    are printed.

    Args:
        filename: Data file understood by ``file2matrix``.
        hoRatio: Fraction of the records used for testing.
        k: Number of neighbours passed to ``classify``.
        seed: Seed of the shuffle, so repeated runs give the same split.

    Raises:
        ValueError: If the split leaves the test set or the training set
            empty.
    """
    # Local import: the file only star-imports numpy, which does not
    # guarantee that the ``random`` submodule is in scope.
    import numpy as np

    dataSet, labels = file2matrix(filename)
    normDataSet, ranges, minVals = autoNorm(dataSet)
    m = normDataSet.shape[0]
    numTest = int(m * hoRatio)
    if not 0 < numTest < m:
        raise ValueError(
            "hoRatio=%r splits %d records into an empty test or training set"
            % (hoRatio, m)
        )

    # Shuffle before splitting: a file that is sorted by class would
    # otherwise leave whole classes out of the training slice.
    order = np.random.default_rng(seed).permutation(m)
    normDataSet = normDataSet[order]
    labels = [labels[i] for i in order]

    errorCount = 0.0
    for i in range(numTest):
        # Classify on the normalised features so every column has equal
        # weight in the distance.
        classifierResult = classify(normDataSet[i, :], normDataSet[numTest:m, :], labels[numTest:m], k)
        print('classifier result:%s,real answer:%s' % (classifierResult, labels[i]))
        if classifierResult != labels[i]:
            errorCount += 1.0
    print('the total error rate is:%f' % (errorCount / float(numTest)))
#手写数字识别
def image2vector(filename, size=32):
    """Read a text-encoded square digit image into a single row vector.

    The file is expected to contain at least ``size`` lines, each starting
    with ``size`` digit characters. Row ``i``, column ``j`` of the image is
    stored at position ``size * i + j`` of the result.

    Args:
        filename: Path of the image text file.
        size: Number of rows and columns of the image (32 by default).

    Returns:
        A float array of shape ``(1, size * size)``.

    Raises:
        ValueError: If a row is shorter than ``size`` characters or contains
            a character that is not a digit.
    """
    returnVector = zeros((1, size * size))
    with open(filename) as file:
        for i in range(size):
            line = file.readline().rstrip("\r\n")
            if len(line) < size:
                raise ValueError(
                    "%s: row %d has %d characters, expected at least %d"
                    % (filename, i + 1, len(line), size)
                )
            # Fill one whole image row per assignment.
            returnVector[0, size * i:size * (i + 1)] = [int(ch) for ch in line[:size]]
    return returnVector
#k-近邻算法
#手写数字识别的测试代码
def handWritingClassTest(trainingDir=r'D:\learnpy\机器学习算法\trainingDigits',
                         testDir=r"D:\learnpy\机器学习算法\testDigits",
                         k=3):
    """Evaluate the k-NN classifier on text-encoded handwritten digits.

    Every file in both directories must be named ``<digit>_<anything>``; the
    part before the first underscore is used as the sample's class. All
    training files are loaded with ``image2vector``, then every test file is
    classified and compared with the class taken from its name. Each
    prediction, the number of errors and the error rate are printed.

    Args:
        trainingDir: Directory holding the training images.
        testDir: Directory holding the test images.
        k: Number of neighbours passed to ``classify``.

    Raises:
        ValueError: If either directory contains no files.
    """
    # Local import: the file only imports ``listdir`` from os, and
    # os.path.join keeps the paths valid on every platform.
    import os

    trainingFileList = os.listdir(trainingDir)
    m = len(trainingFileList)
    if m == 0:
        raise ValueError("no training files found in %s" % trainingDir)

    labels = []
    dataSet = zeros((m, 1024))
    for i, fileName in enumerate(trainingFileList):
        labels.append(int(fileName.split('_')[0]))  # class is encoded in the name
        dataSet[i, :] = image2vector(os.path.join(trainingDir, fileName))

    testFileList = os.listdir(testDir)
    mTest = len(testFileList)
    if mTest == 0:
        # Without this guard the error rate below would divide by zero.
        raise ValueError("no test files found in %s" % testDir)

    error = 0.0
    for fileName in testFileList:
        number = int(fileName.split('_')[0])
        testVector = image2vector(os.path.join(testDir, fileName))
        resultNumber = classify(testVector, dataSet, labels, k)
        print("the classifier came back with: %d, the real answer is: %d" % (resultNumber, number))
        if number != resultNumber:
            error += 1.0
    print('the total number of errors is:', error)
    print('the total error rate is:', error / mTest)
#handWritingClassTest()
if __name__ == '__main__':
    # Run the iris hold-out evaluation. test() loads the data file itself,
    # so nothing needs to be read here first. To evaluate the handwritten
    # digits instead, call handWritingClassTest().
    test()
运行的时候注意文件路径,以免出错。
鸢尾花数据集可以在 UCI 机器学习数据库下载;如果需要更多数据集,链接如下:
uci数据集下载
运行结果:
仅供你们参考,如有不足,多多指点。