KNN 又名K近邻算法
用于:客户流失预测、欺诈检测等。
算法思想:近朱者赤近墨者黑!
步骤:
1. 算距离、计算新数据和训练数据之间的距离
2. 去排序、对算出来的距离进行排序
3. 找邻居、确定最近的K个训练对象
4. 做分类、统计K个近邻对象所属的类别,把出现次数最多的类别作为测试对象的分类
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 3 09:53:27 2017
@author: steve
"""
import sys
sys.path.append("C:\\Users\\Administrator.PC-201707110905\\Desktop")
from numpy import *
import operator
# 导入数据
def createDataSet():
    """Return a tiny hand-made training set for trying out the classifier.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 numpy array of
        sample points and ``labels`` is a list holding the class label
        ('A' or 'B') of each row of ``group``.
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels
# KNN算法
def classify0(Input, training_data, labels, k):
    """Classify ``Input`` by majority vote among its k nearest neighbours.

    Args:
        Input: the sample to classify, a 1-D sequence of feature values.
        training_data: 2-D array-like with one training sample per row.
        labels: sequence holding the class label of each training row.
        k: number of nearest neighbours that vote. Values larger than the
            number of training rows are clamped to that number.

    Returns:
        The label that occurs most often among the k nearest training rows
        (Euclidean distance). On a tie, the tied label whose first vote came
        from the closest neighbour wins.

    Raises:
        ValueError: if ``k`` is smaller than 1 or there is no training data.
    """
    training_data = asarray(training_data, dtype=float)
    training_data_size = training_data.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if training_data_size == 0:
        raise ValueError("training_data must contain at least one sample")
    # Asking for more neighbours than exist would index past the end of the
    # sorted distances; use every available sample instead.
    k = min(k, training_data_size)

    # Broadcasting subtracts Input from every training row at once.
    differences = training_data - asarray(Input, dtype=float)
    distances = (differences ** 2).sum(axis=1) ** 0.5
    sorted_distances_indices = distances.argsort()

    class_count = {}
    for neighbour_index in sorted_distances_indices[:k]:
        vote = labels[neighbour_index]
        class_count[vote] = class_count.get(vote, 0) + 1

    # max() returns the first maximal item in insertion order, so ties are
    # resolved in favour of the label seen earliest (closest neighbour).
    return max(class_count.items(), key=operator.itemgetter(1))[0]
# 完成这个上面的函数,就可以试试1,2, 分类到哪个啦。
# >>> import knn
# >>> group, labels = knn.createDataSet()
# >>> knn.classify0([1,2], group, labels, 3)
# 'A'
# 读取数据
def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and label list.

    Each non-blank line is split on tabs. The first three fields become one
    row of the feature matrix and the last field is parsed as an integer
    class label. Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: an ``(n, 3)`` float numpy
        array and a list of ``n`` integer labels, where ``n`` is the number
        of non-blank lines in the file.

    Raises:
        ValueError: if a feature field is not a number or the last field is
            not an integer.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, fields in enumerate(rows):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector
# 完成上面这个函数,就可以把数据转化成需要的格式啦
# >>> datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
# >>> datingDataMat
# array([[ 4.09200000e+04, 8.32697600e+00, 9.53952000e-01],
# [ 1.44880000e+04, 7.15346900e+00, 1.67390400e+00],
# [ 2.60520000e+04, 1.44187100e+00, 8.05124000e-01],
# ...,
# [ 2.65750000e+04, 1.06501020e+01, 8.66627000e-01],
# [ 4.81110000e+04, 9.13452800e+00, 7.28045000e-01],
# [ 4.37570000e+04, 7.88260100e+00, 1.33244600e+00]])
# >>> datingLabels[1]
# 2
# 归一化特征值:因为数值差值较大的属性对计算结果影响较大
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / column_range`` so that
    features with large raw magnitudes do not dominate distance calculations.

    Args:
        dataSet: 2-D numpy array with one sample per row.

    Returns:
        A ``(normDataSet, ranges, minVals)`` tuple: the normalised array, the
        per-column divisor that was used, and the per-column minimum. A column
        whose values are all equal has a raw range of 0; its divisor is
        reported as 1 so that the column normalises to 0 instead of NaN and
        callers can reuse ``ranges`` without dividing by zero.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Guard constant columns: 0 / 0 would otherwise produce NaN.
    ranges = where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column offset and scale to every row.
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals
# 完成这个上面的函数,就可以进行归一化特征啦
# 特征归一原因是:数字差值较大的属性对结果影响较大
# 特征归一的手段:将所有数据转化成0-1的数值
# >>> import knn
# >>> datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
# >>> normMat,ranges, minVals= knn.autoNorm(datingDataMat)
# >>> normMat
# array([[ 0.44832535, 0.39805139, 0.56233353],
# [ 0.15873259, 0.34195467, 0.98724416],
# [ 0.28542943, 0.06892523, 0.47449629],
# ...,
# [ 0.29115949, 0.50910294, 0.51079493],
# [ 0.52711097, 0.43665451, 0.4290048 ],
# [ 0.47940793, 0.3768091 , 0.78571804]])
# >>> ranges
# array([ 9.12730000e+04, 2.09193490e+01, 1.69436100e+00])
# >>> minVals
# array([ 0. , 0. , 0.001156])
# >>>
# 分类器针对约会网站的测试代码
def datingClassTest(hoRatio=0.10, filename='datingTestSet2.txt', k=3):
    """Measure the classifier's error rate with a hold-out test.

    The data file is loaded and normalised, the first ``hoRatio`` fraction of
    the rows is used as test samples, and the remaining rows are used as
    training data. Each prediction and the final error rate are printed.

    Args:
        hoRatio: fraction of the rows held out for testing.
        filename: data file passed to ``file2matrix``. The default carries a
            trailing '2' that the original book listing does not have.
        k: number of neighbours passed to ``classify0``.

    Returns:
        The predicted label of the last test sample.

    Raises:
        ValueError: if the hold-out ratio leaves no rows to test with.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs < 1:
        # Without this guard the error-rate division below would divide by 0.
        raise ValueError(
            "hold-out ratio %r leaves no test samples out of %d rows"
            % (hoRatio, m))

    # The training split is the same for every test sample; slice it once.
    trainingMat = normMat[numTestVecs:m, :]
    trainingLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainingMat,
                                     trainingLabels, k)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
    return classifierResult
# 完成这个上面的函数,就可以进行测试啦
# 测试一般拿90%做训练数据,用10% 做测试数据
# >>> import knn
# >>> knn.datingClassTest()
# the classifier came back with: 3, the real answer is: 3
# the classifier came back with: 2, the real answer is: 2
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 1, the real answer is: 1
# .
# .
# .
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 3, the real answer is: 3
# the classifier came back with: 3, the real answer is: 3
# the classifier came back with: 2, the real answer is: 2
# the classifier came back with: 1, the real answer is: 1
# the classifier came back with: 3, the real answer is: 1
# the total error rate is: 0.050000
# 5.0
# 3
# >>>
# 构建完整可用系统
def classifyPerson():
    """Interactively predict how much the user will like a person.

    Prompts for three feature values, normalises them with the minimum and
    range taken from 'datingTestSet2.txt', classifies the result with the
    3 nearest neighbours and prints the matching description.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input("percentage of time spent playing video games?"))
    ffMiles = float(input("frequent flier miles earned per years?"))
    iceCream = float(input("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)

    # The feature order differs from the prompt order: miles come first.
    inArr = array([ffMiles, percentTats, iceCream])
    # Scale the new sample exactly as the training data was scaled.
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Subtracting 1 turns the predicted label into an index into resultList.
    print("you will probably like this person:",
          resultList[int(classifierResult) - 1])
# 完成这个上面的函数 之后就可以进行预测啦!!!
# >>> import knn
# >>> knn.classifyPerson()
# percentage of time spent playing video games?5
# frequent flier miles earned per years?5000
# liters of ice cream consumed per year?0.2
# you will probably like this person: in small doses
有关数据的格式咱们来看看
# 对原始数据画出散点图
import matplotlib
import matplotlib.pyplot as plt
from numpy import *
# Scatter plot of the second feature column against the third; marker size
# and colour are both scaled by the class label so the classes stand apart.
fig = plt.figure()  # figure must be called to create the Figure object
ax = fig.add_subplot(111)
# file2matrix is called directly because it is defined in this file; from a
# separate session use `import knn` and `knn.file2matrix(...)` instead.
datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
           15.0 * array(datingLabels), 15.0 * array(datingLabels))
plt.show()
手写字识别系统
步骤:
- 收集数据、提供文本文件
- 准备数据、编写函数img2vector(),将图像格式转换为分类器使用的向量格式
- 分析数据、python命令行中检查数据,检测可行性
- 测试算法、使用部分数据作为测试样本,给出错误率
数据:trainingDigits 2000例子,每个数字有200左右样本。
testDigits 900例子(测试样本)。
稍等啊兄弟姐妹们。。。目前先做另一个项目,那个要的太急。
转载和疑问声明
如果你有什么疑问或者想要转载,没有允许是不能转载的哈
赞赏一下能不能转?哈哈,联系我啊,我告诉你呢 ~~
欢迎联系我哈,我会给大家慢慢解答啦~~~怎么联系我? 笨啊~ ~~ 你留言也行
如果有同学也喜欢的话,欢迎大家加入QQ群交流,群号:533256556;
请备注:csdn
你关注微信公众号1.机器学习算法工程师:2.或者扫那个二维码,后台发送 “我要找朕”,联系我也行啦!
(爱心.gif) 么么哒 ~么么哒 ~么么哒
码字不易啊啊啊,如果你觉得本文有帮助,三毛也是爱!