from numpy import *
import operator
def classify0(inX, dateSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training samples.

    Distance is the Euclidean distance between ``inX`` and each row of
    ``dateSet``.

    Args:
        inX: Feature vector to classify; must have as many entries as a row
            of ``dateSet`` has columns.
        dateSet: Training samples, one sample per row.
        labels: Class labels of the training samples, indexed by row number.
        k: Number of nearest neighbours that vote. A value larger than the
            number of training samples is clamped to that number.

    Returns:
        The label that occurs most often among the k nearest samples. On a
        tied vote, the tied label that was met first while walking the
        neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``dateSet`` has no rows.
    """
    samples = asarray(dateSet, dtype=float)
    dateSetSize = samples.shape[0]  # number of training samples (rows)
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    if dateSetSize == 0:
        raise ValueError("dateSet must contain at least one sample")
    # The module star-imports numpy, which may shadow the builtin min();
    # a conditional expression avoids depending on which one is in scope.
    neighbours = k if k < dateSetSize else dateSetSize

    # Broadcasting subtracts inX from every row without materialising copies.
    diffMat = samples - asarray(inX, dtype=float)
    sqDistances = (diffMat ** 2).sum(axis=1)  # row-wise sum of squares
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # row indices, nearest first

    # Count how often each label occurs among the nearest neighbours.
    classCount = {}
    for sampleIndex in sortedDistIndicies[:neighbours]:
        voteIlabel = labels[sampleIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable even with reverse=True, so tied labels keep the
    # order in which they were first counted.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def autoNorm(dataSet):
    """Min-max normalise every feature column of ``dataSet``.

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; its normalised
    values are set to 0 instead of dividing by zero.

    Args:
        dataSet: Two-dimensional numeric data, one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        the normalised data, ``ranges`` is the per-column ``max - min`` and
        ``minVals`` is the per-column minimum.
    """
    data = asarray(dataSet, dtype=float)
    minVals = data.min(0)
    maxVals = data.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalise to 0 rather than NaN;
    # the returned ranges still report the true (zero) spread.
    divisor = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (data - minVals) / divisor
    return normDataSet, ranges, minVals
def file2matrix(filename, numFeatures=3):
    """Load a tab-separated sample file into a feature matrix and label list.

    Each non-blank line holds the feature values followed by an integer class
    label in the last column. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.
        numFeatures: Number of leading columns to read as features
            (defaults to 3).

    Returns:
        A tuple ``(returnMat, classLabelVector)``: a float array with one row
        per sample and ``numFeatures`` columns, and the list of integer
        labels in file order.

    Raises:
        ValueError: If a field cannot be converted to a number or a line has
            fewer than ``numFeatures`` columns.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), numFeatures))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(v) for v in listFromLine[0:numFeatures]]
        classLabelVector.append(int(listFromLine[-1]))  # label is the last column
    return returnMat, classLabelVector
def dataClassTest(filename='datingTestSet2.txt', hoRatio=0.1, k=3):
    """Run a hold-out evaluation of the kNN classifier and print the results.

    The data file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``hoRatio`` fraction of the rows is used as the
    test set and the remaining rows as training data. Each prediction and the
    final error rate are printed.

    Args:
        filename: Path of the tab-separated data file.
        hoRatio: Fraction of the rows (taken from the top) held out for
            testing.
        k: Number of neighbours passed to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Too few rows for the requested ratio; the error rate would divide by zero.
        print("no test vectors: {} samples with hoRatio {}".format(m, hoRatio))
        return

    # Rows after the test slice are the training data for every prediction.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        print ("the classifier came back with: {}, the real answer is: {}".format(classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print ("the total error rate is: %f" % (errorCount / float(numTestVecs)))
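# Python 3 implementation of the k-nearest neighbours (kNN) algorithm (code from "Machine Learning in Action").
# Source page note: latest recommended article published 2023-10-09 21:55:53.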