约会网站:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from numpy import *
import operator
import matplotlib.pyplot as plt
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours (kNN).

    Args:
        inX: feature vector to classify; it must broadcast against the rows
            of ``dataSet``.
        dataSet: 2-D array holding one training sample per row.
        labels: sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label occurring most often among the k nearest training samples.

    Raises:
        IndexError: if ``k`` is larger than the number of training samples,
            or if ``k`` is smaller than 1 (there is no vote to pick from).
    """
    # Euclidean distance from inX to every training row; broadcasting
    # stretches inX across all rows of dataSet.
    diffMat = inX - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1 sums along each row
    distances = sqDistances ** 0.5
    # argsort returns the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):  # tally the labels of the k nearest rows
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Sort the (label, votes) pairs by vote count, largest first.
    # items() is used instead of iteritems() because it exists on both
    # Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Each non-blank line supplies one sample: the first three tab-separated
    fields are the features and the last field is the integer class label.
    Blank lines (for example a trailing newline at the end of the file) are
    skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array with one row of three features per sample and
        ``classLabelVector`` is the list of integer labels in file order.

    Raises:
        ValueError: if a field cannot be converted to a number or a line
            has fewer than three feature fields.
    """
    # Read the file once and let the context manager close the handle.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]  # drop surrounding \r\n
    rows = [row for row in stripped if row]

    returnMat = zeros((len(rows), 3))  # matrix to return
    classLabelVector = []              # labels to return
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` into the range 0..1.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.
    A column whose values are all equal has ``max - min == 0``; it is mapped
    to 0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D numeric array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the normalised data, the
        per-column ``max - min`` and the per-column minimum.
    """
    minVals = dataSet.min(0)  # column-wise minimum
    maxVals = dataSet.max(0)  # column-wise maximum
    ranges = maxVals - minVals
    # Divide by 1 where the range is zero, and force a float divisor so
    # integer input is not floor-divided on Python 2.
    safeRanges = where(ranges == 0, 1, ranges).astype(float)
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def datingClassTest():
    """Measure the kNN error rate on a hold-out slice of the dating data.

    Loads 'datingTestSet2.txt', normalises it, then classifies the first 10%
    of the rows against the remaining 90% with k=3. Prints every prediction
    next to the real label, followed by the error rate and the error count.
    When the file has too few rows to hold out even one sample, a message is
    printed instead of dividing by zero.
    """
    hoRatio = 0.10  # hold out 10%
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load data set from file
    normMat = autoNorm(datingDataMat)[0]  # only the normalised matrix is needed here
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        print("the data set has too few rows (%d) to hold out a test split" % m)
        return

    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test samples; the rest is training data.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
def createPlot():
    """Show a scatter plot of columns 1 and 2 of 'datingTestSet2.txt'.

    Marker size and marker colour are both driven by 15.0 times the class
    label of each sample.
    """
    figure = plt.figure()
    # 111 is a single-cell grid; for comparison, 349 would split the canvas
    # into 3 rows x 4 columns and draw in the 9th cell, counting left to
    # right and top to bottom.
    axes = figure.add_subplot(111)
    features, classes = file2matrix('datingTestSet2.txt')
    scaledLabels = 15.0 * array(classes)
    xValues = features[:, 1]
    yValues = features[:, 2]
    axes.scatter(xValues, yValues, s=scaledLabels, c=scaledLabels)
    axes.axis([-2, 25, -0.2, 2.0])
    plt.xlabel('Percentage of Time Spent Playing Video Games')
    plt.ylabel('Liters of Ice Cream Consumed Per Week')
    plt.show()
if __name__ == "__main__":
    # Script entry point: show the scatter plot, then run the hold-out test.
    createPlot()
    datingClassTest()
datingTestSet2.txt (sample rows; file2matrix expects the fields to be tab-separated):
40920 8.326976 0.953952 3
14488 7.153469 1.673904 2
26052 1.441871 0.805124 1
75136 13.147394 0.428964 1
35948 6.830792 1.213192 3
手写字识别:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from numpy import *
import operator
from os import listdir
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours (kNN).

    Args:
        inX: feature vector to classify; it must broadcast against the rows
            of ``dataSet``.
        dataSet: 2-D array holding one training sample per row.
        labels: sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label occurring most often among the k nearest training samples.

    Raises:
        IndexError: if ``k`` is larger than the number of training samples,
            or if ``k`` is smaller than 1 (there is no vote to pick from).
    """
    # Euclidean distance from inX to every training row; broadcasting
    # stretches inX across all rows of dataSet.
    diffMat = inX - dataSet
    sqDistances = (diffMat ** 2).sum(axis=1)  # axis=1 sums along each row
    distances = sqDistances ** 0.5
    # argsort returns the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    classCount = {}
    for i in range(k):  # tally the labels of the k nearest rows
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Sort the (label, votes) pairs by vote count, largest first.
    # items() is used instead of iteritems() because it exists on both
    # Python 2 and Python 3.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row, so the character at line ``i``, column
    ``j`` ends up at position ``32 * i + j``.

    Args:
        filename: path of the text image to read.

    Returns:
        A float array of shape (1, 1024).

    Raises:
        IndexError: if a line holds fewer than 32 characters.
        ValueError: if a character is not a decimal digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect
def _digitFromFileName(fileNameStr):
    """Return the class digit encoded in a file name, e.g. '6_43.txt' -> 6."""
    fileStr = fileNameStr.split('.')[0]  # 6_43.txt without .txt = 6_43
    return int(fileStr.split('_')[0])    # 6_43 without _43 = 6


def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in 'digits/trainingDigits' becomes one training row, labelled
    with the digit taken from its file name. Every file in
    'digits/testDigits' is then classified with k=3 and compared with the
    digit in its own file name. Prints each prediction, the number of errors
    and the error rate. When the test directory is empty the error rate is
    reported as undefined instead of dividing by zero.
    """
    hwLabels = []
    trainingFileList = listdir('digits/trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitFromFileName(fileNameStr))
        trainingMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    testFileList = listdir('digits/testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0

    print("\nthe total number of errors is: %d" % errorCount)
    if mTest == 0:
        print("\nno test files found; the error rate is undefined")
        return
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))
if __name__ == "__main__":
    # Script entry point: run the handwritten-digit evaluation.
    handwritingClassTest()
digits/trainingDigits/0_0.txt:
00000000000001111000000000000000
00000000000011111110000000000000
00000000001111111111000000000000
00000001111111111111100000000000
00000001111111011111100000000000
00000011111110000011110000000000
00000011111110000000111000000000
00000011111110000000111100000000
00000011111110000000011100000000
00000011111110000000011100000000
00000011111100000000011110000000
00000011111100000000001110000000
00000011111100000000001110000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000001111110000000000111000000
00000011111110000000001111000000
00000011110110000000001111000000
00000011110000000000011110000000
00000001111000000000001111000000
00000001111000000000011111000000
00000001111000000000111110000000
00000001111000000001111100000000
00000000111000000111111000000000
00000000111100011111110000000000
00000000111111111111110000000000
00000000011111111111110000000000
00000000011111111111100000000000
00000000001111111110000000000000
00000000000111110000000000000000
00000000000011000000000000000000