ml的学习笔记。
之前学的知识都较为零散,现在需要系统地学习各种算法和思想,目前主要练习内容基于《机器学习实战》。
在学习过程中,除了学习算法本身之外,对Python库的运用和矩阵运算的代码风格是另外一个学习重心:C风格的编程习惯如果放到Python里,效率远不如矩阵运算和内置函数高。学习过程中的一些记录在代码里以注释方式表示。
kNN(k近邻)为有监督算法(需要带标签的训练数据),原理即对每个测试数据向量,计算它与训练数据中每个点的距离,选择距离最小的前k个点,统计这k个点的标签,给该测试数据打上其中出现频率最高的标签。
kNN.py
from numpy import *
import operator
from os import listdir
#创建一个简单的测试数据矩阵和标签向量
def createDataSet():
    """Build a tiny hand-made data set for trying out the kNN classifier.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 NumPy array of
        sample points and ``labels`` is a list holding the class label of
        each row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
#分类器
def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean and computed against every row of ``dataSet``.

    Args:
        inX: Feature vector to classify (array-like, one value per feature).
        dataSet: Training samples, one row per sample (array-like).
        labels: Class labels of the training rows; ``labels[i]`` belongs to
            row ``i`` of ``dataSet``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label that occurs most often among the k nearest training rows.
        When several labels share the top count, the one whose first vote
        came from the closer neighbour is returned.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number of
            training rows.
    """
    dataSet = asarray(dataSet, dtype=float)
    dataSetSize = dataSet.shape[0]
    if not 1 <= k <= dataSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples (%d), got %r"
            % (dataSetSize, k))

    # Broadcasting subtracts inX from every row; no tiled copy is required.
    diffMat = dataSet - asarray(inX, dtype=float)
    distances = (diffMat ** 2).sum(axis=1) ** 0.5  # axis=1: sum along each row
    sortedDistIndicies = distances.argsort()  # original row indices, nearest first

    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # sorted() is stable, so labels with equal counts keep their insertion
    # order, i.e. the order in which the neighbours were visited.
    sortedClassCount = sorted(classCount.items(), key=lambda d: d[1], reverse=True)
    return sortedClassCount[0][0]
#数据归一化:newValue = (oldValue - min)/(max - min)
def autoNorm(dataSet):
    """Rescale every feature column with min-max normalisation.

    Each value is mapped with ``newValue = (oldValue - min) / (max - min)``,
    where min and max are taken per column. A column whose values are all
    equal has a range of 0; its normalised values are set to 0 instead of
    dividing by zero.

    Args:
        dataSet: 2-D array-like, one row per sample and one column per
            feature.

    Returns:
        tuple: ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        the rescaled float array, ``ranges`` is the per-column ``max - min``
        (0 for constant columns) and ``minVals`` is the per-column minimum.
    """
    dataSet = asarray(dataSet, dtype=float)
    minVals = dataSet.min(0)  # axis 0: column-wise minimum
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Use a divisor of 1 for constant columns: their numerator is already 0,
    # so the result is 0 rather than NaN from 0/0.
    safeRanges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
#将文件中存储好的数据转换为内存中的矩阵
def file2matrix(filename, numFeatures=3):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least ``numFeatures`` numeric fields
    followed by an integer class label in its last field. Blank lines are
    skipped.

    Args:
        filename: Path of the text file to read.
        numFeatures: Number of leading fields per line stored as features.
            Defaults to 3.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is a
        float array of shape ``(rows, numFeatures)`` and
        ``classLabelVector`` is the list of integer labels, one per row.

    Raises:
        ValueError: If a line has fewer than ``numFeatures`` feature fields,
            or a field cannot be converted to a number.
    """
    featureRows = []
    classLabelVector = []
    # The with-block closes the file even when a line fails to parse.
    with open(filename) as fr:
        for lineNumber, line in enumerate(fr, 1):
            line = line.strip()  # drop the trailing newline and outer whitespace
            if not line:
                continue
            listFromLine = line.split('\t')
            features = listFromLine[0:numFeatures]
            if len(features) != numFeatures:
                raise ValueError(
                    "line %d of %s has %d field(s), expected at least %d"
                    % (lineNumber, filename, len(features), numFeatures))
            featureRows.append([float(value) for value in features])
            classLabelVector.append(int(listFromLine[-1]))  # -1: last field is the label
    # reshape keeps the (0, numFeatures) shape when the file has no data lines.
    returnMat = array(featureRows, dtype=float).reshape(len(featureRows), numFeatures)
    return returnMat, classLabelVector
def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Measure the kNN error rate on a data file with a hold-out split.

    The file is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``hoRatio`` fraction of the rows is used as test samples and
    the remaining rows as training data for ``classify0``. Every prediction
    and the final error rate are printed.

    Args:
        filename: Data file to evaluate. Defaults to 'datingTestSet2.txt'.
        hoRatio: Fraction of rows held out for testing. Defaults to 0.10.
        k: Number of neighbours passed to ``classify0``. Defaults to 3.

    Returns:
        float: Share of held-out samples that were misclassified.

    Raises:
        ValueError: If the split leaves no test samples or no training
            samples.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, _, _ = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if not 0 < numTestVecs < m:
        raise ValueError(
            "hoRatio=%r leaves %d test and %d training sample(s) out of %d"
            % (hoRatio, numTestVecs, m - numTestVecs, m))

    # The training part is the same for every test sample; slice it once.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with :%d,the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount = errorCount + 1.0
    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is : %f" % errorRate)
    return errorRate
# Run the hold-out evaluation only when this file is executed as a script,
# so that importing the module for its functions does not trigger it.
if __name__ == "__main__":
    datingClassTest()
'''
#测试kNN分类器
group,labels = createDataSet()
print(classify0([0,0], group, labels, 3))
'''
'''
data=array([[1,1,3],[1,2,3],[1,0,3]])
data1=array([[3,2,1],[3,2,1],[1,0,3]])
diffmat = data - data1
print(diffmat)
print(diffmat**2)
sumdiffmat = diffmat**2
sumvec = sumdiffmat.sum(axis=1)#矩阵每行相加
print(sumvec)
print(sumvec.argsort())
'''
'''
#字典的运用
classCount= {}
a='A'
classCount[a] = 1
classCount[a] = classCount[a] + 1
classCount[a] = classCount.get(a,0) + 1
classCount['b']=2
classCount['c']=1
#python2
#sortedDscendCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
#python3
sortedDscendCount = sorted(classCount.items(), key=lambda d:d[1], reverse = True)
print(sortedDscendCount)
sortedAscendCount = sorted(classCount.items(), key=lambda d:d[1], reverse = False)
print(sortedAscendCount)
'''
'''
group,labels = createDataSet()
print(group)
print(labels)
'''
plot.py 测试Matplotlib创建散点图
import matplotlib
import matplotlib.pyplot as plt
import ch1.kNN as knn
# Load the samples; only the feature matrix is plotted below.
returnMat, classLabelVector = knn.file2matrix("datingTestSet2.txt")

# Scatter the second feature column against the third one.
xValues = returnMat[:, 1]
yValues = returnMat[:, 2]

# A figure with a single axes covering the whole canvas.
fig, ax = plt.subplots()
ax.scatter(xValues, yValues)
plt.show()
test.py 学习的一些库函数的知识
#在python 3.2.3中 input和raw_input 整合了,没有了raw_input,input返回字符串型
'''
整数字符串转换为对应的整数
int('12')
小数字符串转换为对应小数
float('12.34')
数字转换为字符串
str(123.45)
ASCII码转换为相应字符
chr(97)
字符转换为相应ASCII码
ord('a')
'''
'''
#测试listdir,可得到文件夹下一层所有文件和文件夹的名字
from os import listdir
filelist = listdir("D:/testdata")
m = len(filelist)
print(m)
for i in filelist:
print(i)
fileStr = i.split('.')
print(fileStr[0])
#print(fileStr[1])
'''
'''
a = input("hello :")
print(a)
c=2+int(a)
print(c)
'''