# -*- coding: utf-8 -*-
from numpy import *
import operator
import matplotlib.pyplot as plt
from os import listdir
def createDataSet():
    """Build the four-point toy data set used to demonstrate kNN.

    Returns:
        A ``(group, labels)`` pair. ``group`` is a 4x2 array holding one
        2-D point per row, and ``labels`` is a list with the class label
        ('A' or 'B') of each row, in the same order.
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several classes collect the same number
    of votes, the class whose first vote came from the closest neighbour
    wins (the vote table keeps insertion order and the sort is stable).

    Args:
        inX: Feature vector to classify; either 1-D with one value per
            feature, or a single-row 2-D array.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels, ``labels[i]`` belonging to row
            ``i`` of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number
            of training samples are clamped to that number.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]  # number of training samples (rows)
    if dataSetSize == 0 or k < 1:
        raise ValueError('classify0 needs a non-empty dataSet and k >= 1')
    # Asking for more neighbours than there are samples used to fail with
    # an IndexError; vote with every available sample instead.
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts inX from every row, so no explicit tile() copy
    # of inX is needed.
    diffMat = asarray(inX) - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # sum squared differences per row
    distances = sqDistances ** 0.5
    sortedDistIndicies = distances.argsort()  # row indices, nearest first

    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order (label, votes) pairs by vote count, highest first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Parse a tab-separated text file into a feature matrix and labels.

    Each non-blank line must hold at least three numeric feature columns
    followed by an integer class label in its last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair. ``returnMat`` is an
        ``(n, 3)`` float array with the first three columns of every
        record, and ``classLabelVector`` is a list of the ``n`` integer
        labels in file order.

    Raises:
        ValueError: If a feature or label cannot be converted to a number.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        strippedLines = (line.strip() for line in fr)
        # Blank lines (e.g. a trailing newline) carry no record; skip them.
        records = [line.split('\t') for line in strippedLines if line]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(records):
        # Convert explicitly rather than relying on implicit str->float
        # casting during array assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every feature column to the range [0, 1] (min-max scaling).

    Args:
        dataSet: 2-D array with one sample per row and one feature per
            column.

    Returns:
        A ``(newDataSet, ranges, minVals)`` triple. ``newDataSet`` is
        ``(dataSet - minVals) / ranges``; ``ranges`` and ``minVals`` are
        the per-column scale and minimum, so new samples can be normalised
        the same way. For a column whose values are all equal, the
        returned range is 1 (instead of 0) and the normalised column is
        all zeros.
    """
    minVals = dataSet.min(0)  # axis 0 -> per-column minimum
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; dividing by it would give NaN/inf.
    # Use 1 so the column normalises to 0 and callers can reuse `ranges`.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column values to every row.
    newDataSet = (dataSet - minVals) / ranges
    return newDataSet, ranges, minVals
def datingClassTest():
    """Estimate the kNN error rate on the dating data with a 10% hold-out.

    The first 10% of the rows are held out as test samples and the
    remaining rows form the training set. Every held-out row is classified
    once with k=3, each prediction is printed next to the true label, and
    the overall error rate is printed at the end.
    """
    hoRatio = 0.1
    # NOTE(review): file2matrix parses the last column with int(); confirm
    # this file carries numeric labels (classifyPerson reads
    # 'datingTestSet2.txt').
    datingDataMat, datingLabels = file2matrix('datingTestSet.txt')
    normMat, _ranges, _minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # first 10% of the rows are test data
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        # Test only rows from the held-out slice. Drawing a random row from
        # the whole file (as before) mostly picked rows that are also in
        # the training slice, and the printed "real answer" did not belong
        # to the row that was classified.
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, 3)
        print('the classifier came back with: %s,the real answer is: %s' % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With no held-out rows there is nothing to divide by; report 0.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print('the total error rate is: %f' % errorRate)
def classifyPerson():
    """Interactively predict how much the user will like a person.

    Prompts for three numeric features, normalises them with the minimum
    and range of the reference data in 'datingTestSet2.txt', classifies
    the result with k=3 and prints the matching description.
    """
    # Index i describes class label i + 1.
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # NOTE(review): the feature order here must match the column order of
    # the data file — confirm it is miles, game time, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # The query must be scaled exactly like the training data.
    normalizedInput = (inArr - minVals) / ranges
    classifierResult = classify0(normalizedInput, normMat, datingLabels, 3)
    print('You will probably like this person: ', resultList[classifierResult - 1])
def img2vector(filename):
    """Flatten a 32x32 text image of digits into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are read; the
    character at row ``i``, column ``j`` is stored at index ``32 * i + j``.

    Args:
        filename: Path of the text image file.

    Returns:
        A ``(1, 1024)`` float array with the image values in row-major
        order.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if a line is malformed.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def handwrtingClassTest():
    """Evaluate kNN (k=3) on the handwritten-digit text images.

    Every file in 'trainingDigits' becomes one training row and every file
    in 'testDigits' is classified against them. The true digit is parsed
    from the file name: the text before the first '.' is split on '_' and
    its first part is converted to int. Each prediction is printed,
    followed by the total number of errors and the error rate.
    """
    trainingFileList = listdir('trainingDigits')  # file names in the directory
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        fileStr = fileNameStr.split('.')[0]  # drop the extension
        hwLabels.append(int(fileStr.split('_')[0]))
        # Store the flattened image as row i of the training matrix.
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    # An empty test directory used to raise ZeroDivisionError; report 0.
    errorRate = errorCount / float(mTest) if mTest else 0.0
    print("\nthe total error rate is: %f" % errorRate)
if __name__ == '__main__':
    # Script entry point. The two string literals below are disabled demos
    # kept for reference; as bare string expressions they have no effect.

    # Disabled demo 1: classify the point [0, 0] against the toy data set.
    '''
group,labels = createDataSet()
G = classify0([0,0],group,labels,3)
print(G)
'''
    # Disabled demo 2: load the dating data and scatter-plot its first two
    # columns, using a different marker for each class label.
    '''
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
print(datingDataMat)
fig = plt.figure()
ax = fig.add_subplot(111)
# ax.scatter(datingDataMat[:,0],datingDataMat[:,1], 15.0*array(datingLabels), 15.0*array(datingLabels))
plt.xlabel('每年获取的飞行常客里程数')
plt.ylabel('玩视频游戏所耗时间百分比')
datingLabels = array(datingLabels)
idx_1 = where(datingLabels == '1')
ax.scatter(datingDataMat[idx_1, 0], datingDataMat[idx_1, 1], marker='*', color='r', label='不喜欢', s=20)
idx_2 = where(datingLabels == '2')
ax.scatter(datingDataMat[idx_2, 0], datingDataMat[idx_2, 1], marker='o', color='b', label='魅力一般', s=10)
idx_3 = where(datingLabels == '3')
ax.scatter(datingDataMat[idx_3, 0], datingDataMat[idx_3, 1], marker='+', color='g', label='极具魅力', s=30)
plt.legend(loc='upper left')
plt.show()
'''
    # The other entry points are left disabled:
    # datingClassTest()
    # classifyPerson()
    # Only the handwritten-digit evaluation runs by default.
    handwrtingClassTest()
KNN近邻算法学习笔记——代码详细注释
最新推荐文章于 2022-07-08 00:58:10 发布
本文介绍了KNN近邻算法的实现过程,包括创建数据集、归一化处理、距离计算和分类。通过实例展示了如何使用KNN进行约会网站用户喜好预测和手写数字识别,代码包含详细注释。
摘要由CSDN通过智能技术生成