'''
kNN.py
'''
import operator

from numpy import *
def createDataSet():
    """Return a tiny hard-coded training set for trying out the classifier.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 numpy array with one
        sample per row, and ``labels`` is a list of four class labels where
        ``labels[i]`` belongs to row ``i`` of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
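'''
classify0 parameters:
intX: input vector to classify; its length must equal the number of columns of dataSet
k: number of nearest neighbours that vote
'''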
def classify0(intX, dataSet, labels, k):
    """Classify ``intX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean and are computed between ``intX`` and every row
    of ``dataSet``.

    Args:
        intX: Feature vector to classify; it must have as many elements as
            ``dataSet`` has columns.
        dataSet: 2-D numpy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that vote. If ``k`` exceeds the
            number of rows, every row votes.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: If ``k`` is smaller than 1.
        IndexError: If ``dataSet`` has no rows (there is nothing to vote).
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    # Broadcasting subtracts intX from every row, so no explicit tiling of
    # the input vector is needed.
    diffMat = asarray(intX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # argsort yields row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # Count how often each label occurs among the k nearest rows.
    classCount = {}
    for rowIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[rowIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    # sorted() is stable, so labels with equal counts keep dict order.
    sortedClassCount = sorted(classCount.items(),
                              key=lambda item: item[1],
                              reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line is split on tabs. The first three fields are parsed
    as floats and become one row of the matrix; the last field is parsed as
    an int and becomes that row's class label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an
        ``(n, 3)`` numpy float array and ``classLabelVector`` is a list of
        ``n`` ints, where ``n`` is the number of non-blank lines.

    Raises:
        ValueError: If a field cannot be parsed as a number.
    """
    # The with-block guarantees the file is closed even if reading fails.
    with open(filename) as fr:
        strippedLines = [line.strip() for line in fr]
    arrayOfLines = [line for line in strippedLines if line]

    returnMat = zeros((len(arrayOfLines), 3))  # one 3-feature row per line
    classLabelVector = []
    for index, line in enumerate(arrayOfLines):
        listFromLine = line.split('\t')
        # Convert explicitly instead of relying on numpy to cast strings.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # Index -1 selects the last column, which holds the class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
'''
归一化
newValue = (oldValue - minValue)/(maxValue - minValue)
'''
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` linearly into the range [0, 1].

    Applies ``(value - columnMin) / (columnMax - columnMin)`` column by
    column. A column whose values are all equal has a range of 0; it is
    mapped to 0 instead of producing a division by zero.

    Args:
        dataSet: 2-D numpy array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised matrix,
        the per-column ``max - min`` values (a constant column is reported
        as 0) and the per-column minimums.
    """
    minVals = dataSet.min(0)  # axis 0: minimum of every column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals

    # Divide constant columns by 1 so they normalise to 0 rather than NaN.
    divisors = where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row; the data
    # itself (not a zero matrix) must be shifted and scaled.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals
'''
Test
'''
def datingDataTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Run a hold-out evaluation of the kNN classifier and print the results.

    The data file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``int(m * hoRatio)`` rows (``m`` = number of
    rows) are used as test vectors; each one is classified with
    ``classify0`` against the remaining rows. Every prediction is printed
    together with the true label, followed by the overall error rate.

    Args:
        hoRatio: Fraction of the rows held out as test vectors.
        filename: Path of the data file to evaluate on.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        None. All results are printed.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]  # ranges/minimums are not needed
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Avoid dividing by zero below when the hold-out set is empty.
        print("no test vectors selected; nothing to evaluate")
        return

    # Rows after the hold-out slice form the training set.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
'''
classify
'''
def classifyPerson():
    """Interactively classify one person from three typed-in features.

    Prompts for three numeric values on standard input, normalises them
    with the minimums and ranges that ``autoNorm`` derives from
    'datingTestSet2.txt', classifies the resulting vector with ``classify0``
    (k = 3) and prints the matching entry of ``resultList``.

    Returns:
        None. The prediction is printed.

    Raises:
        ValueError: If an answer cannot be parsed as a float.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']

    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    percentTats = float(readLine(
        "percentage of time spent playing video games?"))
    ffMiles = float(readLine(
        "frequent flier miles earned per year?"))
    iceCream = float(readLine(
        "liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # NOTE(review): the feature order here must match the column order of
    # the data file -- confirm that the file really stores game time first
    # and flier miles second.
    inArr = array([percentTats, ffMiles, iceCream])

    # The query is scaled with the training minimums and ranges so that it
    # lives in the same normalised space as normMat.
    classifierResult = classify0((inArr - minVals) / ranges,
                                 normMat,
                                 datingLabels,
                                 3)
    # Labels are used as 1-based positions in resultList, hence the "- 1".
    print("You will probably like this person: ",
          resultList[classifierResult - 1])
# Machine Learning in Action: ch02-1
# (web page residue) latest recommended article published 2024-07-28 15:46:11