k-Nearest Neighbors (kNN).
It works like this: we have an existing set of example data, our training set. We have
labels for all of this data—we know what class each piece of the data should fall into.
When we're given a new piece of data without a label, we compare that new piece of
data to the existing data, every piece of existing data. We then take the most similar
pieces of data (the nearest neighbors) and look at their labels. We look at the top k
most similar pieces of data from our known dataset; this is where the k comes from. (k
is an integer and it’s usually less than 20.) Lastly, we take a majority vote from the k
most similar pieces of data, and the majority is the new class we assign to the data we
were asked to classify.
Functions used:
shape: get the dimensions of a matrix
tile(A, (x, y)): repeat matrix A x times along the rows and y times along the columns
sum(axis=1): add up the elements of each row
argsort(): sort in ascending order and return the indices of that ordering
sorted: built-in sorting function, see http://www.runoob.com/python/python-func-sorted.html
operator.itemgetter: build a callable that fetches the item(s) at the given position(s)
Python 3 implementation code:
from numpy import * # scientific computing package
import operator # for sorting
def createDataSet():
    """Return a tiny hand-made training set: four 2-D points and their labels."""
    # Two points near (1, 1) labelled 'A', two near the origin labelled 'B'.
    points = [[1.0, 1.1],
              [1.0, 1.0],
              [0, 0],
              [0, 0.1]]
    group = array(points)
    labels = ['A', 'A', 'B', 'B']
    return group, labels
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest training points.

    inX     -- 1-D feature vector to classify
    dataSet -- (m, n) array of training vectors, one per row
    labels  -- list of m class labels aligned with the rows of dataSet
    k       -- number of nearest neighbors to poll (usually < 20)
    Returns the label occurring most often among the k nearest rows.
    """
    # Euclidean distance from inX to every training row; broadcasting
    # replaces the tile() copy of the original and yields the same values.
    deltas = dataSet - inX
    dists = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from nearest to farthest.
    nearestFirst = dists.argsort()
    # Tally the labels of the k closest rows.
    votes = {}
    for idx in nearestFirst[:k]:
        lbl = labels[idx]
        votes[lbl] = votes.get(lbl, 0) + 1
    # max() returns the first label reaching the top count, which matches
    # the stable reverse sort used by the original implementation.
    return max(votes.items(), key=lambda pair: pair[1])[0]
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each line must contain three numeric feature columns followed by an
    integer class label, separated by tabs.

    filename -- path of the text file to read
    Returns (returnMat, classLabelVector): an (m, 3) float matrix and a
    list of m int labels, where m is the number of lines in the file.
    Raises ValueError if a label column is not an integer, plus the usual
    OSError family if the file cannot be opened.
    """
    # Read the file once inside a context manager: the original opened the
    # file twice (once to count lines, once to parse) and never closed
    # either handle.
    with open(filename) as fr:
        lines = fr.readlines()
    returnMat = zeros((len(lines), 3))
    classLabelVector = []
    for index, line in enumerate(lines):
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]  # numpy coerces the strings to float
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Scale every feature (column) of dataSet linearly into [0, 1].

    Uses min-max normalization, (value - min) / (max - min), column-wise.

    dataSet -- (m, n) numeric array
    Returns (normDataSet, ranges, minVals): the normalized array of the
    same shape, the per-column (max - min), and the per-column minimum.
    The latter two are needed to normalize new samples the same way.

    NOTE(review): a constant column (range 0) triggers a divide-by-zero
    here, exactly as in the original; callers must ensure every feature
    actually varies.
    """
    minVals = dataSet.min(0)  # axis 0: column-wise minima
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]
    # The original pre-allocated a zeros() array that was immediately
    # overwritten by the subtraction below; that dead allocation is removed.
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals
def datingClassTest():
    """Hold out the first 10% of the dating data and print the kNN error rate."""
    hoRatio = 0.10  # fraction of rows held out as the test set
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # Rows [0, numTestVecs) are tested; the remaining rows are the training set.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" %
              (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
def classifyPerson():
    """Interactively classify a person from three features typed by the user."""
    resultList = ['not at all', 'in small doses', 'in large doses']
    # Prompt for the three features in the same order as before.
    percentTats = float(input("percentage of time spent on video games:"))
    ffMiles = float(input("frequent flier miles earned per year:"))
    iceCream = float(input("liters of ice cream consumed per year:"))
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    normMat, ranges, minVals = autoNorm(datingDataMat)
    inArr = array([ffMiles, percentTats, iceCream])
    # Normalize the query point exactly like the training data.
    scaled = (inArr - minVals) / ranges
    classifierResult = classify0(scaled, normMat, datingLabels, 3)
    # Labels are 1-based while resultList is 0-based.
    print("You will probably like this person", resultList[classifierResult - 1])