创建一个数组,并得到这个数组中每一列的最小值和最大值(min(0)、max(0)中的参数0表示沿第0轴,即按列计算)。
>>> import numpy as no
>>> dataSet = no.array([[0.8,400,0.5],[12,134000,0.9],[0,20000,1.1],[67,32000,0.1]])
>>> dataSet
array([[ 8.00000000e-01, 4.00000000e+02, 5.00000000e-01],
[ 1.20000000e+01, 1.34000000e+05, 9.00000000e-01],
[ 0.00000000e+00, 2.00000000e+04, 1.10000000e+00],
[ 6.70000000e+01, 3.20000000e+04, 1.00000000e-01]])
>>> dataSet.min(0)
array([ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01])
>>> minVals = dataSet.min(0)
>>> maxVals = dataSet.max(0)
>>> ranges = maxVals - minVals
>>> ranges
array([ 6.70000000e+01, 1.33600000e+05, 1.00000000e+00])
>>> normDataSet = no.zeros(shape(dataSet))
创建一个和上面的dataSet一样的数组,并用0填充
>>> normDataSet = no.zeros(no.shape(dataSet))
>>> normDataSet
array([[ 0., 0., 0.],
[ 0., 0., 0.],
[ 0., 0., 0.],
[ 0., 0., 0.]])
取出normDataSet中的行数和列数
m = dataSet.shape[0]#行数4
n = dataSet.shape[1]#列数3
>>> dataSet.min(0)  # 取出每一列中的最小值
array([ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01])
>>> minVals = dataSet.min(0)  # 取出每一列中的最小值
>>> maxVals = dataSet.max(0)  # 取出每一列中的最大值
>>> ranges = maxVals - minVals
>>> ranges
array([ 6.70000000e+01, 1.33600000e+05, 1.00000000e+00])
>>> bbbb = no.tile(minVals,(m,1))#minVals的值上面有,m的值为4,创建m行个和minVals一样的数组
>>> bbbb
array([[ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01],
[ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01],
[ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01],
[ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01]])
>>> minVals
array([ 0.00000000e+00, 4.00000000e+02, 1.00000000e-01])
利用库matplotlib画图,并显示
>>> import matplotlib
>>> import matplotlib.pyplot as plt
(下面用到的datingDataMat和datingLabels,是用后面knn.py中的file2matrix('datingTestSet2.txt')读出来的)
>>> fig = plt.figure()#创建一个图
>>> ax = fig.add_subplot(111)#前俩1是比例,后一个1是位置,可以改成222,或223查看效果,一目了然
>>> ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*no.array(datingLabels),15.0*no.array(datingLabels))
<matplotlib.collections.PathCollection object at 0x0226EEB0>
>>> plt.show()
另附:机器学习那本书上的约会代码,创建knn.py文件,放入下面代码:
下面代码的作用是,利用knn近邻算法构建一个约会网站系统
之后对这个网站系统的错误率进行计算,并给定一个人的信息,对这个网站进行测试,看看是不是当事人(这里是海伦)喜欢的类型
海伦通过三个参数判断是不是自己喜欢的类型。先通过一个datingTestSet2.txt文件中的1000个人的信息中的900个作为训练,再用剩下的100个测试
这三个参数是ffMiles、percentTats、iceCream,每年飞行的里程数、玩视频游戏所消耗的百分比、每周消费冰淇淋的公升数
# -*- coding: cp936 -*-
#构造完整的k-临近算法之前,我们先编写一些基本的通用函数
from numpy import *
import operator
#下面的createDataSet函数只提供一组简单的训练数据,用来做简单的测试;矩阵很小,容易理解,和后面的约会网站代码基本没有关系,后面也基本用不到。
#它的主要用法是:
#    import knn
#    group,labels = knn.createDataSet()
#    knn.classify0([0,0],group,labels,3)
#就可以看到结果了:group中的每一行和[0,0]计算欧式距离,取距离最小的k个(这里k=3),按多数投票决定类别。
def createDataSet():
    """Return a tiny hand-made training set for trying out ``classify0``.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of sample
        points and ``labels`` is a list with one class label ('A' or 'B')
        per row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
#k--近邻算法
#inX:每一行和它做欧式距离。将最小的找出来
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest training rows.

    Args:
        inX: Feature vector to classify; it is subtracted from every row of
            ``dataSet``, so it must have one value per column.
        dataSet: 2-D array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the ``k`` rows of ``dataSet``
        closest to ``inX`` by Euclidean distance.

    Raises:
        IndexError: If ``k`` is 0 or larger than the number of rows.
    """
    # Broadcasting subtracts the query vector from every row, so there is no
    # need to materialise a tiled copy of inX first.
    diffMat = asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndicies = distances.argsort()

    # Count the labels of the k closest rows.
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # items() exists on both Python 2 and Python 3; iteritems() is Python 2 only.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
#用来处理输入格式的问题
def file2matrix(filename):
    """Parse a tab-separated data file into a feature matrix and a label list.

    Each non-blank line supplies one sample: its first three tab-separated
    fields are the features and its last field is the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` is a list of ``n``
        ints, ``n`` being the number of non-blank lines.

    Raises:
        ValueError: If a feature field is not a valid float or the label
            field is not a valid int.
    """
    # The context manager closes the file even when parsing fails.
    with open(filename) as fr:
        # Blank lines (for example a trailing empty line) carry no sample and
        # would otherwise break the numeric conversion below.
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        # Convert explicitly rather than relying on implicit string casting.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
#归一化特征值
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalisation.

    Each value becomes ``(value - column_min) / column_range``.

    Args:
        dataSet: 2-D numeric array, one sample per row.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is the
        rescaled array, ``ranges`` is the per-column divisor that was applied
        (``max - min``, or 1 for a column whose values are all equal) and
        ``minVals`` is the per-column minimum. Applying
        ``(x - minVals) / ranges`` to a new sample reproduces the transform.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; dividing by 1 instead maps it to 0
    # rather than producing NaN.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column minimum and range to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
#为了测试分类效果,该函数是自包含的,可以在任何时候在Python运行环境中使用该函数测试分类器效果
#下面用它来测试约会网站的测试代码
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate on a hold-out slice of the data.

    The first ``int(m * hoRatio)`` rows of the normalised data are used as
    test samples; each one is classified against the remaining rows.

    Args:
        hoRatio: Fraction of the rows held out for testing.
        filename: Data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify0``.

    Returns:
        float: Misclassified test samples divided by the number of test
        samples, or 0.0 when the hold-out slice is empty.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    errorCount = 0.0
    for i in range(numTestVecs):
        # normMat[i, :] is the sample under test; rows numTestVecs..m-1 and
        # their labels serve as the training set it is compared against.
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print('lsp: ', classifierResult)
        print('the classifier came back with: %d ,the real answer is: %d' % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
        # Running error rate, printed after every test sample.
        print('the totle error rate is : %f' % (errorCount / float(numTestVecs)))
    if numTestVecs == 0:
        # Nothing was tested, so there is nothing to divide by.
        return 0.0
    return errorCount / float(numTestVecs)
#约会网站预测函数,输入一个人的信息,看海伦对他的兴趣度
def classifyPerson():
    """Ask for one person's three features and print the predicted interest level.

    Reads three numbers from standard input, normalises them with the minimum
    and range values returned by ``autoNorm`` for 'datingTestSet2.txt',
    classifies the result with ``classify0`` (k=3) and prints the matching
    entry of ``resultList``.
    """
    resultList = ['not at all', 'in a small doses', 'in large doses']
    # raw_input exists only on Python 2; Python 3 renamed it to input.
    try:
        readLine = raw_input
    except NameError:
        readLine = input
    percentTats = float(readLine("percentage of time spent playing video games?"))
    ffMiles = float(readLine("frequent flier miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # Feature order here is ffMiles, percentTats, iceCream.
    inArr = array([ffMiles, percentTats, iceCream])
    # The query gets the same min/range scaling as the training rows.
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # Labels are used as 1-based positions into resultList.
    print('you will probably like this person:', resultList[classifierResult - 1])
>>> knn.datingClassTest()
('lsp: ', 3)
the classifier came back with: 3 ,the real answer is: 3
the totle error rate is : 0.000000
('lsp: ', 2)
the classifier came back with: 2 ,the real answer is: 2
the totle error rate is : 0.000000
('lsp: ', 1)
the classifier came back with: 1 ,the real answer is: 1
the totle error rate is : 0.000000
.
.
.
the classifier came back with: 2 ,the real answer is: 2
the totle error rate is : 0.040000
('lsp: ', 1)
the classifier came back with: 1 ,the real answer is: 1
the totle error rate is : 0.040000
('lsp: ', 3)
the classifier came back with: 3 ,the real answer is: 1
the totle error rate is : 0.050000
>>>
错误率5%(即上面输出的0.050000),还不错
现在给我们一个人,让我们来看看这个人是否是海伦喜欢的类型:
测试代码如下
>>> knn.classifyPerson()
percentage of time spent playing video games?10
frequent flier miles earned per year?10000
liters of ice cream consumed per year?0.5
('you will probably like this person:', 'in a small doses')#小兴趣
>>> knn.classifyPerson()
percentage of time spent playing video games?10
frequent flier miles earned per year?30000
liters of ice cream consumed per year?0.5
('you will probably like this person:', 'in large doses')#特别有兴趣
这里面会用到一个datingTestSet2.txt文件,可以去官网上下载,当然也可以向我要。不知道怎么传上来,就不传了。