#coding:utf-8
import json
import os
import random
import re
import sys
import time

try:
    import urllib2  # Python 2 module name
except ImportError:
    # Python 3 folded urllib2 into urllib.request; alias it so code written
    # against the old name keeps working.
    import urllib.request as urllib2

import numpy as np
import requests

# Switch the interpreter to UTF-8 mode (Python 2 only).
# reload() is required because the site module removes
# sys.setdefaultencoding() from the sys namespace at start-up.  Python 3 has
# neither name and already defaults to UTF-8, so the step is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding("utf-8")

# Read a text file and build a two-dimensional array.
def readDataFile(filename, format=','):
    """Parse a delimited text file into a list of sample rows.

    Every non-blank line is split on ``format``.  All fields except the last
    are converted to ``float``; the last field is kept as a string (the
    class label).

    Args:
        filename: Path of the data file.  Leading and trailing whitespace is
            stripped before the path is used.
        format: Field delimiter.  Any falsy value (``None``, ``''``) falls
            back to a comma.

    Returns:
        A list of rows shaped ``[feature_1, ..., feature_n, label]``.  The
        list is empty when the file does not exist; in that case a notice is
        printed instead of raising.

    Raises:
        ValueError: If a feature field cannot be converted to ``float``.
    """
    delimiter = format or ','
    rows = []
    # Tolerate stray whitespace around the path.
    filename = filename.strip()
    # Bail out early when the data file is missing.
    if not os.path.isfile(filename):
        print("%s is not exists" % (filename,))
        return rows
    # Text mode keeps lines as ``str`` on Python 3 as well; the context
    # manager closes the handle even when a row fails to parse.
    with open(filename, 'r') as data_file:
        for line in data_file:
            line = line.strip()
            if not line:
                # A blank line carries no sample; skipping it avoids emitting
                # a row that holds nothing but an empty label.
                continue
            fields = line.split(delimiter)
            row = [float(value) for value in fields[:-1]]
            row.append(fields[-1])
            rows.append(row)
    return rows

# Read the text data, split the raw rows into features and labels, and
# return both the feature values and the label values.
def createData(filename, format=','):
    """Load a data file and separate feature vectors from class labels.

    Args:
        filename: Path of the delimited data file.
        format: Field delimiter handed through to ``readDataFile``.

    Returns:
        A ``(data, label)`` tuple of parallel lists: ``data[i]`` holds every
        column of row ``i`` except the last, and ``label[i]`` holds the last
        column.  Both lists are empty when no rows could be read, so callers
        can always unpack the result.

    Example:
        Rows ``[[1, 100, 123, 'A'], [100, 1, 12, 'B']]`` yield
        ``([[1, 100, 123], [100, 1, 12]], ['A', 'B'])``.
    """
    data_label = readDataFile(filename, format)
    # Comprehensions over an empty row list naturally give two empty lists,
    # which keeps the return shape identical for the "no data" case.
    data = [row[:-1] for row in data_label]
    label = [row[-1] for row in data_label]
    return data, label

# Classify an input sample against the labelled data.
def calculateDistance(input, data, label, k):
    """Classify ``input`` by majority vote among its k nearest samples.

    The Euclidean distance from ``input`` to every row of ``data`` is
    computed, the ``k`` closest rows are selected, and the label that occurs
    most often among them is returned.

    Args:
        input: Feature vector to classify; must have as many values as
            ``data[0]``.
        data: List of feature vectors (the labelled samples).
        label: Labels parallel to ``data`` (``label[i]`` belongs to
            ``data[i]``).
        k: Number of neighbours that vote; must satisfy
            ``1 <= k <= len(data)``.

    Returns:
        The winning label.  When the arguments are rejected a message is
        printed and the string ``'Error'`` is returned instead.  Ties are
        resolved deterministically: equidistant samples keep their order in
        ``data``, and among equally frequent labels the one whose nearest
        sample is closest wins.
    """
    classes = 'Error'
    # Guard clauses: reject unusable arguments before doing any arithmetic.
    # ``not data`` is checked first so an empty data set never reaches
    # ``data[0]``.
    if not data or len(data[0]) == 0 or len(label) == 0:
        print('data or label is null')
        return classes
    if k < 1 or k > len(data):
        print("k : %s is out of bounds" % (k,))
        return classes
    if len(input) != len(data[0]):
        print("特征变量值不够,输入变量特征个数为:%s,训练特征变量个数为:%s" % (len(input), len(data[0])))
        return classes

    width = len(input)
    # Euclidean distance: square root of the summed squared differences.
    # Rows are indexed (not zipped) so a row shorter than ``input`` still
    # raises instead of being silently truncated.
    distances = [
        pow(sum(pow(input[j] - row[j], 2) for j in range(width)), 0.5)
        for row in data
    ]

    # Indices of the k closest samples, nearest first.  sorted() is stable,
    # so samples at equal distance keep their original order.
    nearest = sorted(range(len(distances)), key=distances.__getitem__)[:k]

    # Count how often each label occurs among the k neighbours, remembering
    # the order in which labels were first met (nearest first).
    votes = {}
    first_seen = []
    for index in nearest:
        vote_label = label[index]
        if vote_label not in votes:
            first_seen.append(vote_label)
        votes[vote_label] = votes.get(vote_label, 0) + 1

    # max() returns the first maximal item, so a tie goes to the label that
    # was encountered first, independent of dict ordering.
    return max(first_seen, key=votes.get)
# Demo run executed at module level: load the data file at the hard-coded
# path, classify one sample with k = 4, and print the sample with its label.
filename = '/home/shutong/jim/crawl/data.csv'
data, label = createData(filename)
# NOTE(review): this module-level name shadows the builtin input(); it is
# kept so the module's existing global names stay unchanged.
input = [1, 20]
k = 4
result = calculateDistance(input, data, label, k)
# Produces the same text as the Python 2 statement ``print input, result``
# while also being valid on Python 3.
print("%s %s" % (input, result))