说明
- 未采用 sklearn 自带的 KNN 算法进行建模(sklearn 版本当时得分 96.800%),而是使用下面手写的 KNN 实现
- 在大神代码(原得分 96.400%)的基础上进行改进,将得分提高到 96.886%
代码
from numpy import *
import operator
import csv
def toInt(array):
    """Convert a table of integer-like values into a 2-D float array.

    Parameters
    ----------
    array : array-like
        A scalar, a 1-D sequence or a 2-D table whose items can be parsed as
        integers (for example the string cells produced by ``csv.reader``).
        Input with fewer than two dimensions is promoted to a single row, so
        a sequence of ``n`` items yields shape ``(1, n)``.

    Returns
    -------
    ndarray
        ``float64`` array of shape ``(m, n)`` holding the integer values.

    Raises
    ------
    ValueError
        If the input has more than two dimensions, or an item cannot be
        parsed as an integer.
    """
    # NOTE: the parameter name hides numpy's star-imported ``array`` inside
    # this function; it is kept so keyword callers keep working.
    table = atleast_2d(asarray(array))
    if table.ndim > 2:
        raise ValueError('toInt expects at most 2-dimensional input')
    # One vectorised cast replaces the per-cell Python loop (which relied on
    # the Python-2-only ``xrange`` and on ``mat``, removed in NumPy 2.0).
    # Going through int64 keeps the integer parsing/truncation step; the final
    # float64 cast keeps the dtype the previous zeros()-based version returned.
    return table.astype(int64).astype(float64)
def loadTrainData(filename='train.csv'):
    """Read the labelled training set from a CSV file.

    The first line of the file is treated as a header and skipped. In every
    remaining row the first column is the label and the other columns are
    the features.

    Parameters
    ----------
    filename : str, optional
        Path of the CSV file; defaults to ``'train.csv'`` in the current
        working directory.

    Returns
    -------
    tuple
        ``(data, label)``, both converted with ``toInt``: ``data`` has one
        row per sample, and ``label`` is built from the 1-D label column,
        which ``toInt`` promotes to a single row.

    Raises
    ------
    ValueError
        If the file contains no data rows after the header.
    """
    with open(filename) as csvFile:
        reader = csv.reader(csvFile)
        # Drop the header by position instead of list.remove(), which
        # searches by value and fails with IndexError on an empty file.
        next(reader, None)
        rows = list(reader)
    if not rows:
        raise ValueError('no data rows found in %s' % filename)
    table = array(rows)
    return toInt(table[:, 1:]), toInt(table[:, 0])
def loadTestData(filename='test.csv'):
    """Read the unlabelled test set from a CSV file.

    The first line of the file is treated as a header and skipped; every
    remaining row is one sample made of feature columns only.

    Parameters
    ----------
    filename : str, optional
        Path of the CSV file; defaults to ``'test.csv'`` in the current
        working directory.

    Returns
    -------
    ndarray
        The samples converted with ``toInt``, one row per sample.

    Raises
    ------
    ValueError
        If the file contains no data rows after the header.
    """
    with open(filename) as csvFile:
        reader = csv.reader(csvFile)
        # Drop the header by position instead of list.remove(), which
        # searches by value and fails with IndexError on an empty file.
        next(reader, None)
        rows = list(reader)
    if not rows:
        raise ValueError('no data rows found in %s' % filename)
    return toInt(array(rows))
def classify(inX, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Parameters
    ----------
    inX : array-like
        The sample to classify; flattened to a 1-D feature vector.
    dataSet : array-like
        Training samples, one per row, with as many columns as ``inX`` has
        features. A 1-D input is treated as a single sample.
    labels : array-like
        One label per training sample. A column vector, a row vector or a
        flat sequence are all accepted.
    k : int
        Number of nearest neighbours that vote.

    Returns
    -------
    The label with the most votes among the ``k`` training samples closest
    to ``inX`` by Euclidean distance. Among equally voted labels the stable
    sort keeps the dictionary's iteration order.

    Raises
    ------
    ValueError
        If the shapes of the inputs disagree or ``k`` is not in
        ``1..len(dataSet)``.
    """
    # Cast to float64 so that integer inputs (e.g. uint8 pixels) cannot wrap
    # around when subtracted.
    query = asarray(inX, dtype=float64).ravel()
    samples = atleast_2d(asarray(dataSet, dtype=float64))
    classes = asarray(labels).ravel()

    sampleCount = samples.shape[0]
    if query.shape[0] != samples.shape[1]:
        raise ValueError('inX has %d features but dataSet has %d columns'
                         % (query.shape[0], samples.shape[1]))
    if classes.shape[0] != sampleCount:
        raise ValueError('expected %d labels, got %d'
                         % (sampleCount, classes.shape[0]))
    if not 1 <= k <= sampleCount:
        raise ValueError('k must be between 1 and %d, got %r'
                         % (sampleCount, k))

    # Broadcasting subtracts the query from every row without materialising
    # a tiled copy of it. The square root is skipped because it does not
    # change the distance ordering.
    sqDistances = ((samples - query) ** 2).sum(axis=1)
    nearest = sqDistances.argsort()[:k]

    votes = {}
    for idx in nearest:
        neighbourLabel = classes[idx]
        votes[neighbourLabel] = votes.get(neighbourLabel, 0) + 1
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def saveResult(result, filename='result.csv'):
    """Write the predictions to a CSV file, one value per row.

    Parameters
    ----------
    result : iterable
        The values to write; each one becomes a single-column row.
    filename : str, optional
        Output path; defaults to ``'result.csv'`` in the current working
        directory. An existing file is overwritten.
    """
    import sys

    # csv.writer needs a binary file on Python 2 but a text file opened with
    # newline='' on Python 3; the previous unconditional 'wb' raised
    # TypeError on Python 3.
    if sys.version_info[0] >= 3:
        outFile = open(filename, 'w', newline='')
    else:
        outFile = open(filename, 'wb')
    with outFile:
        csv.writer(outFile).writerows([value] for value in result)
def handwritingClassTest(k=5):
    """Classify every test sample with k-NN and save the predictions.

    Loads the training and test sets with ``loadTrainData`` and
    ``loadTestData``, classifies each test row with ``classify`` and hands
    the list of predictions to ``saveResult``.

    Parameters
    ----------
    k : int, optional
        Number of neighbours passed to ``classify``; defaults to 5, the
        value that used to be hard-coded here.
    """
    trainData, trainLabel = loadTrainData()
    testData = loadTestData()
    # classify() indexes labels as a column, so transpose once up front
    # instead of on every iteration.
    labelColumn = trainLabel.transpose()
    resultList = [classify(sample, trainData, labelColumn, k)
                  for sample in testData]
    saveResult(resultList)
# Module-level call: runs the whole load/classify/save pipeline as soon as
# this file is executed (or imported).
handwritingClassTest()
7月12日
- 除将 k 值由 5 改为 3 外,源代码未变,准确率提高到了 96.929%