# 机器学习-Kaggle竞赛-Digit recognizer

train.csv
train.csv里面是42000*785的数据

test.csv
test.csv里面是28000*784的数据

## KNN

def csv2vector(filename, index=0):
    """Parse a Kaggle digit CSV into a binarized pixel matrix and label list.

    filename -- path to the CSV file; the first line is a header and is skipped
    index    -- column where pixel data starts: 1 for train.csv (column 0 is
                the digit label), 0 for test.csv (no label column)
    Returns (returnVect, labellist): an (n, 784) matrix of 0/1 floats and a
    list of label strings. NOTE(review): when index == 0 the "label" stored is
    just the first pixel value -- callers appear to ignore it; confirm.
    """
    with open(filename) as fr:  # with-block guarantees the file is closed
        filelines = fr.readlines()
    del filelines[0]  # drop the CSV header row
    lenlines = len(filelines)
    returnVect = zeros((lenlines, 784))
    labellist = [0] * lenlines
    for i in range(lenlines):
        # Strip the trailing newline before splitting: otherwise the last
        # pixel arrives as '0\n', fails the != '0' test, and is wrongly set
        # to 1 for every row.
        linearr = filelines[i].strip().split(',')
        if len(linearr) < 784:
            continue  # malformed/short row: leave its zeros and label 0
        labellist[i] = linearr[0]  # label kept as a string, e.g. '5'
        for j in range(index, len(linearr)):
            # Binarize: any non-zero pixel value becomes 1.
            if linearr[j] != '0':
                returnVect[i, j - index] = 1
    return returnVect, labellist

>>> trainSet,trainlabel = knn.csv2vector("train.csv",1)
>>> knn.testknn(trainSet[20000:30000],trainlabel[20000:30000],trainSet[0:20000],trainlabel[0:20000])

def testknn(testSet, testlabel, trainSet, trainlabel):
    """Evaluate the k-NN classifier (k=3) and print the error statistics.

    testSet/testlabel   -- samples to classify and their true labels
    trainSet/trainlabel -- reference samples used by classify0
    Prints each misclassification, the error count, the error rate, and the
    elapsed time; returns nothing.
    """
    # time.clock() was removed in Python 3.8; perf_counter is the replacement.
    start = time.perf_counter()
    num = 0
    for i in range(len(testSet)):
        tmp = classify0(testSet[i], trainSet, trainlabel, 3)
        if tmp != testlabel[i]:
            num = num + 1
            print(tmp, ",", testlabel[i])

    print("error:", num)
    print("error percent:%f" % (float(num) / len(testSet)))
    end = time.perf_counter()
    print("time cost: %f s" % (end - start))
    print("end")

def classify0(inx, dataSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    inx     -- the sample to classify (1-D numpy array)
    dataSet -- training matrix, one sample per row
    labels  -- training labels, parallel to dataSet's rows
    k       -- number of neighbours to vote
    Returns the label with the most votes among the k closest rows.
    """
    datanum = dataSet.shape[0]
    # Euclidean distance from inx to every training row (matrix subtraction).
    inxtemp = tile(inx, (datanum, 1)) - dataSet
    sqdistance = (inxtemp ** 2).sum(axis=1)
    distance = sqdistance ** 0.5
    sortdist = distance.argsort()
    # Vote among the k nearest neighbours.
    classset = {}
    for i in range(k):
        labeltemp = labels[sortdist[i]]
        classset[labeltemp] = classset.get(labeltemp, 0) + 1
    # dict.iteritems() no longer exists in Python 3 -- use items(); max()
    # resolves ties the same way the old stable reverse sort did.
    return max(classset.items(), key=lambda kv: kv[1])[0]

## Bayes

def trainNB0(trainMatrix, trainclass):
    """Train a naive-Bayes model with add-one (Laplace) smoothing.

    trainMatrix -- matrix of binarized samples, one row per record
    trainclass  -- class label for each row, parallel to trainMatrix
    Returns (pvecdic, pDic): per-class log-likelihood vectors and the
    per-class prior probabilities.
    """
    numpics = len(trainMatrix)     # number of training records
    numpix = len(trainMatrix[0])   # number of pixel features per record

    # Class priors: fraction of records belonging to each class.
    pDic = {}
    for cls in trainclass:
        pDic[cls] = pDic.get(cls, 0) + 1
    for cls in pDic:
        pDic[cls] = pDic[cls] / float(numpics)

    # Per-class pixel counts start at 1 (numerator smoothing) and the
    # per-class totals start at 2 (matching denominator smoothing).
    pnumdic = {cls: ones(numpix) for cls in pDic}
    psumdic = {}
    for i in range(numpics):
        cls = trainclass[i]
        pnumdic[cls] += trainMatrix[i]
        psumdic[cls] = psumdic.get(cls, 2) + sum(trainMatrix[i])

    # Log-space likelihoods to avoid underflow when many pixels multiply.
    pvecdic = {cls: log(counts / float(psumdic[cls]))
               for cls, counts in pnumdic.items()}
    return pvecdic, pDic

def classifyNB(vec2class, pvecdic, pDic):
    """Classify one binarized sample with a trained naive-Bayes model.

    vec2class -- the sample's 0/1 feature vector
    pvecdic   -- per-class log-likelihood vectors (from trainNB0)
    pDic      -- per-class prior probabilities (from trainNB0)
    Returns the class key with the highest log-posterior score, or "" if
    every score is -inf.
    """
    # log P(class) plus the summed log-likelihoods of the active pixels.
    presult = {cls: sum(vec2class * pvecdic[cls]) + log(pDic[cls])
               for cls in pDic}
    best = ""
    best_score = float("-inf")
    for cls, score in presult.items():
        if score > best_score:
            best_score = score
            best = cls
    return best

def testNB():
    """End-to-end naive-Bayes run: train on train.csv, predict test.csv,
    and save the Kaggle submission file result_NB.csv.

    Relies on module-level csv2vector/trainNB0/classifyNB/savefile.
    """
    print("load train data...")
    trainSet, trainlabel = csv2vector("train.csv", 1)
    print("load test data...")
    testSet, testlabel = csv2vector("test.csv")
    print("start train...")
    pvecdic, pDic = trainNB0(trainSet, trainlabel)
    # time.clock() was removed in Python 3.8; perf_counter is the replacement.
    start = time.perf_counter()
    print("start test...")
    lines = ["ImageId,Label\n"]
    for i in range(len(testSet)):
        tmp = classifyNB(testSet[i], pvecdic, pDic)
        lines.append(str(i + 1) + "," + tmp + "\n")
    # join once instead of repeated string += (quadratic for 28000 rows)
    savefile("".join(lines), "result_NB.csv")
    end = time.perf_counter()
    print("time cost: %f s" % (end - start))

## Logistic Regression

def train(data, labels):
    """Train 10 one-vs-rest logistic-regression classifiers, one thread per
    digit, and return the list of trained weight vectors (g_we_list).

    data/labels -- training matrix and its string labels ('0'..'9')
    NOTE(review): CPython's GIL serializes the numpy Python loops in these
    threads, so this is concurrency for structure, not speed -- confirm.
    """
    print("train start")
    # time.clock() was removed in Python 3.8; perf_counter is the replacement.
    start = time.perf_counter()
    threads = []
    for digit in range(10):  # one binary classifier per digit 0-9
        t = threading.Thread(target=trainweight, args=(data, labels, digit))
        threads.append(t)
    for i, t in enumerate(threads):
        t.start()
        print("thread", i, " start")
    for i, t in enumerate(threads):
        t.join()
        print("thread", i, " end")
    print("train end")
    end = time.perf_counter()
    print("train time cost: %f s" % (end - start))
    return g_we_list
# Shared result slots: slot i receives the trained weights for digit i,
# filled in by the trainweight() worker threads.
g_we_list = [0] * 10


def trainweight(data, labels, tag):
    """Thread worker: train one-vs-rest weights for digit `tag` (an int)
    and store them in the shared g_we_list slot."""
    g_we_list[tag] = getweight(data, labels, str(tag))

def getlabels(labels, tag):
    """Binarize a label sequence for one-vs-rest training.

    labels -- any sequence of label values
    tag    -- the positive class
    Returns a new list with 1 where the label equals `tag` and 0 elsewhere;
    the input sequence is not modified.
    """
    return [1 if lab == tag else 0 for lab in labels]
def getweight(testdata, testlabels, tag):
    """Train one-vs-rest logistic-regression weights for class `tag`.

    testdata   -- training matrix
    testlabels -- string labels, binarized against `tag` before training
    Returns the trained weights as a numpy column matrix so that a sample
    row times the weights yields a scalar score.
    """
    binary_labels = getlabels(testlabels, tag)
    weights = stocalcgrand1(testdata, binary_labels)
    return mat(weights).transpose()

def stocalcgrand1(dataMatin, labelMatin, numiter=100):
    """Stochastic gradient ascent for logistic regression.

    dataMatin  -- training matrix, one sample per row
    labelMatin -- 0/1 labels, parallel to dataMatin's rows
    numiter    -- number of full passes over the data
    Returns the learned weight vector (length = number of features).
    Each pass visits every sample exactly once in random order.
    """
    m, n = shape(dataMatin)
    weight = ones(n)
    for i in range(numiter):
        # range() is immutable in Python 3; we need a mutable pool to delete
        # visited indices from, so materialize it as a list.
        dataIndex = list(range(m))
        for j in range(m):
            # Learning rate decays across passes but never reaches zero.
            alpha = 0.005 / (1.0 + i) + 0.005
            randIndex = int(random.uniform(0, len(dataIndex)))
            sample = dataIndex[randIndex]
            # BUG FIX: the original computed h from dataMatin[randIndex] but
            # took the label from labelMatin[dataIndex[randIndex]], pairing a
            # sample with another sample's label; use one index for both.
            h = sigmod(sum(dataMatin[sample] * weight))
            error = labelMatin[sample] - h
            weight = weight + alpha * error * dataMatin[sample]
            del dataIndex[randIndex]
    return weight

def test(testData, testlabel, we_list):
    """Report the error count and rate of the one-vs-rest LR classifiers.

    testData  -- samples to classify
    testlabel -- true labels as strings ('0'..'9')
    we_list   -- per-digit weight matrices (from train)
    Prints each misclassification and the totals; returns nothing.
    """
    error = 0
    for i in range(len(testData)):
        rec = classfy(testData[i], we_list)
        # classfy returns an int digit; labels are strings, so compare via str
        if testlabel[i] != str(rec):
            error = error + 1
            print(testlabel[i], ",", rec)

    print("error=", error)
    print("error percent:%f" % (float(error) / len(testData)))
def classfy(testData, we_list):
    """Pick the digit whose one-vs-rest sigmoid score is highest.

    testData -- one sample (row) of pixel features
    we_list  -- per-digit weight column matrices (index == digit)
    Returns the winning digit index, or -1 if every score is <= 0.
    """
    best_score = 0
    best_label = -1
    for digit, weights in enumerate(we_list):
        score = sigmod(testData * weights)  # sigmoid of the linear score
        if score > best_score:
            best_score = score
            best_label = digit
    return best_label


用3万条数据来训练回归系数。效果也一般，最终也没提交~（没信心了~~ -.-）

## svm

scikit-learn是Python的一套机器学习库，包含了很多现成的机器学习算法。官网有很多例子~用起来很方便。

import numpy as np
from sklearn.svm import SVC

def train(testdata, testlabels):
    """Fit an sklearn SVC (default RBF kernel) and return the fitted model.

    testdata   -- training matrix, one sample per row
    testlabels -- training labels, parallel to testdata's rows
    """
    clf = SVC()
    clf.fit(testdata, testlabels)
    print("svm train success.")  # message typo fixed: was "svn"
    return clf

def test(clf,test,label):
testsize=len(test)
num=0
for i in range(testsize):
tmp = clf.predict(test[i])
if label[i] != tmp[0]:
num = num +1
print label[i],",",tmp[0]
print "error:",num
print "error percent:%f" % (float(num)/testsize)