def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and label list.

    Every non-blank line is stripped and split on tabs. Fields 1..4
    (0-based: the second through fifth columns) become one row of the
    feature matrix, and the last field, converted to ``int``, becomes that
    row's class label. The first field is not used.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: a float array with one
        four-column row per sample, and the list of integer labels in the
        same order.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If a feature field is not numeric, the feature slice
            cannot be stored in a four-column row, or the last field is
            not an integer.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename) as fr:
        rows = []
        for line in fr:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # sample and would otherwise break the row assignment below.
            if stripped:
                rows.append(stripped.split('\t'))

    # One row per sample, four feature columns per row.
    returnMat = zeros((len(rows), 4))
    classLabelVector = []
    for index, fields in enumerate(rows):
        # NOTE(review): on a five-column line this slice also takes in the
        # last column, which is the label -- confirm the file layout.
        returnMat[index, :] = [float(value) for value in fields[1:5]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# Test-data preprocessing
def file3matrix(filename):
    """Load the feature columns of a tab-separated sample file as a matrix.

    Every non-blank line is stripped and split on tabs; fields 1..4
    (0-based: the second through fifth columns) become one row of the
    returned matrix. No label is read, so the file does not need an integer
    label column. The parsing mirrors ``file2matrix`` minus the labels.

    Args:
        filename: Path of the text file to read.

    Returns:
        A float array with one four-column row per non-blank line.

    Raises:
        IOError: If the file cannot be opened.
        ValueError: If a feature field is not numeric or the feature slice
            cannot be stored in a four-column row.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename) as fr:
        rows = []
        for line in fr:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # sample and would otherwise break the row assignment below.
            if stripped:
                rows.append(stripped.split('\t'))

    # One row per sample, four feature columns per row.
    returnMat = zeros((len(rows), 4))
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[1:5]]
    return returnMat
# Plot the data
def pictureshow(datingDataMat, labels=None):
    """Show a scatter plot of columns 1 and 2 of the sample matrix.

    Marker size and marker colour are both derived from the class labels
    (30x and 15x the label value respectively), so samples with different
    labels are drawn differently.

    Args:
        datingDataMat: 2-D array of samples; column 1 is plotted on the x
            axis and column 2 on the y axis.
        labels: Optional sequence of numeric class labels, one per row of
            ``datingDataMat``. When omitted, the module-level
            ``datingLabels`` is used, which is what this function always
            relied on before the parameter existed.

    Raises:
        NameError: If ``labels`` is omitted and no module-level
            ``datingLabels`` has been defined.
    """
    if labels is None:
        # Backward compatibility: existing callers pass only the matrix and
        # expect the labels to come from the module namespace.
        labels = datingLabels
    labelArray = array(labels)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               s=30.0 * labelArray, c=15.0 * labelArray)
    # Fixed viewport: [xmin, xmax, ymin, ymax].
    ax.axis([-2, 25, -0.2, 2.0])
    plt.xlabel('Percentage of Time Spent Playing Video Games')
    plt.ylabel('Liters of Ice Cream Consumed Per Week')
    plt.show()
def datingClassTest(
        filename=r'D:\MachingLearning\KNN\machingKNN1.0\draining1.txt',
        hoRatio=0.8,
        k=4):
    """Run a hold-out evaluation of the kNN classifier and print the result.

    The samples are loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``hoRatio`` fraction of the rows is used as the
    test set and the remaining rows as the training set passed to
    ``classify0``. The function prints the loaded matrix, the number of
    test vectors, the accuracy (``1 - errors / tests``) and the raw error
    count.

    Args:
        filename: Path of the labelled sample file. Defaults to the path
            that used to be hard-coded here.
        hoRatio: Fraction of rows, taken from the start of the file, that
            is classified as test data. With the default of 0.8, 80% of the
            rows are tested against the remaining 20%.
        k: Neighbour count forwarded to ``classify0``.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    print(datingDataMat)
    # autoNorm is defined elsewhere; only its first return value is used.
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)

    # The training slices do not change between iterations; build them once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    print(numTestVecs)
    if numTestVecs:
        # The value printed is 1 - error rate, i.e. the accuracy; the label
        # used to call it the error rate.
        print("the total accuracy is: %f"
              % (1 - errorCount / float(numTestVecs)))
    else:
        # Avoid a ZeroDivisionError when the hold-out set is empty.
        print("no test vectors: accuracy is undefined")
    print(errorCount)
# Classify the input data
def classifyPerson(
        trainFile=r'D:\MachingLearning\KNN\machingKNN1.0\draining1.txt',
        testFile=r'D:\MachingLearning\KNN\machingKNN1.0\dest1.txt',
        k=5):
    """Classify every sample of a test file and tally the predictions.

    The labelled samples from ``trainFile`` are normalised with
    ``autoNorm``. Each row of ``testFile`` is scaled with the training
    set's minima and ranges and passed to ``classify0``. The function
    prints the training matrix, the number of test samples and one count
    per category.

    Args:
        trainFile: Path of the labelled training file (read with
            ``file2matrix``). Defaults to the previously hard-coded path.
        testFile: Path of the file to classify (read with ``file3matrix``).
            Defaults to the previously hard-coded path.
        k: Neighbour count forwarded to ``classify0``.

    Returns:
        The number of test samples classified as ``'pai'``.
    """
    resultList = ['not', 'pai', 'pay']

    datingDataMat, datingLabels = file2matrix(trainFile)
    print(datingDataMat)
    # autoNorm is defined elsewhere; from how its results are used here it
    # is assumed to return (normalised matrix, per-column ranges,
    # per-column minima) -- TODO confirm.
    normMat, ranges, minVals = autoNorm(datingDataMat)

    testMat = file3matrix(testFile)
    print(len(testMat))

    counts = dict.fromkeys(resultList, 0)
    for sample in testMat:
        # Scale the raw sample once, with the training statistics, so it
        # lands on the same scale as normMat. Previously the test matrix was
        # first normalised by its own statistics and then rescaled again
        # with the training minVals/ranges.
        classifierResult = classify0((sample - minVals) / ranges,
                                     normMat, datingLabels, k)
        # NOTE(review): label n maps to resultList[n - 1]; a label of 0
        # would wrap around to 'pay' -- confirm the labels are 1..3.
        counts[resultList[classifierResult - 1]] += 1

    print('pai= %d' % counts['pai'])
    print('pay= %d' % counts['pay'])
    print('not= %d' % counts['not'])
    return counts['pai']
# Function calls (script entry point)
if __name__ == "__main__":
    # Both names are bound at module level on purpose: pictureshow() reads
    # the module-level ``datingLabels`` when it is not given labels.
    datingDataMat, datingLabels = file2matrix(
        r'D:\MachingLearning\KNN\machingKNN1.0\draining.txt'
    )

    # Visualise the loaded samples, then classify the test file.
    pictureshow(datingDataMat)
    # autoNorm(datingDataMat) and datingClassTest() were left disabled here.
    classifyPerson()