# Notes on the classic kNN algorithm: roughly 100 lines of code, 50 of them comments; copy the file and it runs as-is.
from numpy import *
import operator
# 数据读取
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is expected to contain three numeric feature
    columns followed by an integer class label in the last column.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``. ``returnMat`` is a float
        array of shape ``(n_rows, 3)`` built from the first three columns;
        ``classLabelVector`` is a list holding the ``int`` label of each row.

    Raises:
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    featureRows = []
    classLabelVector = []
    # Read the file in a single pass; the context manager guarantees the
    # handle is closed even if parsing fails half-way through.
    with open(filename) as fr:
        for rawLine in fr:
            # Drop surrounding whitespace (including the newline).
            line = rawLine.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no sample.
                continue
            fields = line.split('\t')
            # The first three columns are the features ...
            featureRows.append([float(value) for value in fields[0:3]])
            # ... and the last column is the class label.
            classLabelVector.append(int(fields[-1]))
    # Allocate after reading so the row count matches the parsed samples.
    returnMat = zeros((len(featureRows), 3))
    for index, row in enumerate(featureRows):
        returnMat[index, :] = row
    return returnMat, classLabelVector
# 数据集归一化
def autoNorm(dataSet):
    """Min-max normalize every column of ``dataSet`` to the range [0, 1].

    Per-column formula: ``Y = (X - Xmin) / (Xmax - Xmin)``.

    A column whose values are all identical has a range of zero; such a
    column is mapped to 0 instead of producing NaN/inf from a division
    by zero.

    Args:
        dataSet: 2-D numeric array (or array-like); rows are samples and
            columns are features.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalized data, the
        per-column range ``max - min`` (zero for constant columns) and the
        per-column minimum.
    """
    dataSet = asarray(dataSet)
    # Column-wise minimum, maximum and spread.
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 so they normalize to 0 rather than NaN.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tiling of minVals / ranges is needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
# kNN classifier: intX is the normalized input sample; dataSet is the normalized data set (3 feature columns).
def classify0(intX, dataSet, labels, k):
    """Classify ``intX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean. When several labels receive the same number of
    votes, the label met first while walking the neighbours from nearest to
    farthest wins.

    Args:
        intX: Feature vector to classify (same number of features as a row
            of ``dataSet``).
        dataSet: 2-D array of known samples, one row per sample.
        labels: Sequence with the label of each row of ``dataSet``.
        k: Number of neighbours that vote. Values larger than the number of
            samples are clamped to the number of samples.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")
    # Never ask for more neighbours than there are samples.
    neighbourCount = k if k < dataSetSize else dataSetSize

    # Euclidean distance from intX to every row; broadcasting subtracts the
    # input vector from each sample without building a tiled copy.
    diffMat = asarray(intX) - dataSet
    sqDistance = (diffMat ** 2).sum(axis=1)
    distance = sqDistance ** 0.5
    # argsort() returns the row indices ordered from nearest to farthest.
    sortedDistIndicies = distance.argsort()

    # Count how often each label occurs among the nearest neighbours.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:neighbourCount]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Order the (label, votes) pairs by vote count, highest first; the sort
    # is stable, so tied labels keep their first-seen order.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
# KNN测试
def datingClassTest(hoRatio=0.1, filename='datingTestSet2.txt', k=3):
    """Run a hold-out evaluation of the kNN classifier and print the results.

    The first ``int(m * hoRatio)`` rows of the normalized data set are used
    as test samples; the remaining rows serve as the training set. For each
    test sample the prediction and the true label are printed, followed by
    the overall error rate and error count.

    Args:
        hoRatio: Fraction of the rows held out for testing. A value of 1 or
            more leaves no training rows.
        filename: Tab-separated data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.
    """
    # Load the samples and bring every feature onto the same scale.
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]  # number of samples
    numTestVecs = int(m * hoRatio)  # number of held-out test samples
    print("numTestVecs=", numTestVecs)

    # The training slice does not change between iterations; take it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classfierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with:%d,the real answer is:%s"%(classfierResult, datingLabels[i]))
        if classfierResult != datingLabels[i]:
            errorCount += 1
    # With no test samples there is nothing to divide by; report 0.0.
    errorRate = errorCount / float(numTestVecs) if numTestVecs else 0.0
    print("total error rate is:", errorRate)
    print("errorCount:", errorCount)
# 约会网站预测函数
def classifyPerson():
    """Interactively predict how well the user will like a person.

    Asks for three numbers on standard input, normalizes them with the
    minimum and range of the data in 'datingTestSet2.txt', classifies the
    result with a 3-nearest-neighbour vote and prints the verdict. The
    predicted label ``n`` selects entry ``n - 1`` of the verdict list.
    """
    prompts = (
        "percentage of time spent playing video games?",
        "flier miles earned per year?",
        "liters of ice cream consumed per year?",
    )
    # Ask the questions in order; each answer must parse as a float.
    gameShare, flierMiles, iceCreamLiters = [float(input(text)) for text in prompts]

    samples, sampleLabels = file2matrix('datingTestSet2.txt')
    scaledSamples, spans, lows = autoNorm(samples)

    # The feature vector is ordered (flier miles, game share, ice cream).
    query = array([flierMiles, gameShare, iceCreamLiters])
    # Normalize the query with the same minimum/range as the data set.
    scaledQuery = (query - lows) / spans
    predicted = classify0(scaledQuery, scaledSamples, sampleLabels, 3)

    verdicts = ['not at all', 'in small doses', 'in large doses']
    print("you will probably like this person:", verdicts[predicted - 1])
if __name__ == '__main__':
    # Script entry point: run the hold-out evaluation. The interactive
    # predictor below is disabled; uncomment it to try a manual prediction.
    #classifyPerson()
    datingClassTest()