作为经典的机器学习方法,网上有很多调用sklearn库的SVM接口进行手写数字识别的教程,本文主要采用Numpy从零开始实现线性SVM,以帮助读者了解SVM的实现原理。本文不涉及到太多理论性的内容,纯粹只是代码的实现,注释也不多,后面如果时间允许的话写一篇SVM的原理和代码实现的结合,帮助初学者更快地入门机器学习。
一、SVM简介
1.1 定义
支持向量机(Support Vector Machine, SVM)是一类按监督学习(supervised learning)方式对数据进行二元分类的广义线性分类器(generalized linear classifier),其决策边界是对学习样本求解的最大边距超平面(maximum-margin hyperplane)。
1.2 求解目标
对于给定的训练数据,对其类别的判定主要依据以下两个公式:若 $w \cdot x + b > 0$ 则判为正类($y=+1$),若 $w \cdot x + b < 0$ 则判为负类($y=-1$)。
在规范化(即令 $\min_i\, y_i(w \cdot x_i + b) = 1$)下,超平面的几何间隔为:$\gamma = \dfrac{1}{\|w\|}$。
于是,找最大几何间隔的超平面表述成如下的最优化问题:$\min_{w,b}\ \dfrac{1}{2}\|w\|^2,\quad \text{s.t.}\ y_i(w \cdot x_i + b) \ge 1,\ i=1,\dots,N$。
二、代码
2.1 引入依赖包
import numpy as np
import random
import struct
2.2 定义数据集读取函数
#读取数据
trainimage_path="../data/MNIST/raw/train-images-idx3-ubyte"
trainlabel_path="../data/MNIST/raw/train-labels-idx1-ubyte"
testimage_path="../data/MNIST/raw/t10k-images-idx3-ubyte"
testlabel_path="../data/MNIST/raw/t10k-labels-idx1-ubyte"
def getimage(filepath):#将二进制文件转换成像素特征的数据
readfile= open(filepath, 'rb') #以二进制方式打开文件
file= readfile.read()
readfile.close()
index = 0
nummagic,numimgs,numrows,numcols=struct.unpack_from(">iiii",file,index)
index += struct.calcsize("iiii")
images = []
for i in range(numimgs):
imgval = struct.unpack_from(">784B", file, index)
index += struct.calcsize("784B")
imgval = list(imgval)
for j in range(len(imgval)):
if imgval[j] > 1:
imgval[j] = 1
images.append(imgval)
return np.array(images)
def getlabel(filepath):
readfile = open(filepath, 'rb')
file = readfile.read()
readfile.close()
index = 0
magic, numitems = struct.unpack_from(">ii", file, index)
index += struct.calcsize("ii")
labels = []
for x in range(numitems):
im = struct.unpack_from(">1B", file, index)
index += struct.calcsize("1B")
labels.append(im[0])
return np.array(labels)
trainimage=getimage(trainimage_path)
trainlabel=getlabel(trainlabel_path)
testimage=getimage(testimage_path)
testlabel=getlabel(testlabel_path)
#采用两两分类的方式,共有9*10/2=45个分类器
num_1 = 9
num_2 = num_1 + 1
由于SVM本身是二分类模型,要完成0~9共10个数字的多分类识别,通常有两类组合方案:
- 构建10个分类器,每个分类器实现对单个数字的识别,以识别0为例,此时正样本为包含0的所有样本,负样本则为1~9数字的样本;
- 构建10*9/2=45个分类器,每个分类器实现两类数字的识别,比如第一个分类器实现0与1的识别,0为正样本,1为负样本。
本文采用第二种方案。
2.3 定义SVM分类器SVC
class Svc(object):
def __init__(self,xdata,ydata,c=100,b=0):#初始化函数
self.c=c
self.b=b
self.xdata=xdata
self.ydata=ydata
self.alpha=np.ones(len(ydata))
def kernelmat(self):#核矩阵
kmat = np.dot(self.xdata, self.xdata.T)
return kmat
def ui(self,i,kmat): # 求ui
a = np.sum(self.alpha*self.ydata*kmat[:,i])+self.b
return a
def Ei(self,i,kmat): # 求Ei=ui-yi
a = self.ui(i,kmat) - self.ydata[i]
return a
def alpha2(self,i, tflist,kmat): # 找第二个alpha2在alpha向量中的位置,通过max|Ei-Ej|
ei = self.Ei(i,kmat)
temp = np.abs(np.sum(self.alpha*self.ydata*kmat,1) + self.b - self.ydata - ei)
temp = temp * tflist
return np.argmax(temp)
def eta(self, i, j,kmat): # 求分母eta
a = kmat[i,i]+kmat[j,j]-2*kmat[i,j]
return a
def alpha2new(self,i, j,kmat): # 求alpha2new,这里直接做约束
a = self.alpha[j] + self.ydata[j] * (self.Ei(i,kmat) - self.Ei(j,kmat)) / self.eta(i, j,kmat)
if self.ydata[i] == self.ydata[j]:
L = np.max([0, self.alpha[i] + self.alpha[j] - self.c])
H = np.min([self.c, self.alpha[i] + self.alpha[j]])
if a > H:
return H
elif a < L:
return L
else:
return a
else:
L = np.max([0, self.alpha[j] - self.alpha[i]])
H = np.min([self.c, self.c + self.alpha[j] - self.alpha[i]])
if a > H:
return H
elif a < L:
return L
else:
return a
def alpha1new(self,i, j,kmat): # 把alpha2new带进去求alpha1new
a = self.alpha[i] + self.ydata[i] * self.ydata[j] * (self.alpha[j] - self.alpha2new(i,j,kmat))
return a
def bnew(self,i,j,kmat): # 更新b
ei = self.Ei(i,kmat)
ej = self.Ei(j,kmat)
yi = self.ydata[i]
yj = self.ydata[j]
alphai = self.alpha1new(i,j,kmat)
alphaj = self.alpha2new(i,j,kmat)
b1 = self.b - ei - yi * (alphai - self.alpha[i]) * kmat[i,i]- yj * (alphaj - self.alpha[j]) * kmat[i,j]
b2 = self.b - ej - yi * (alphai - self.alpha[i]) * kmat[i,j] - yj * (alphaj - self.alpha[j]) * kmat[j,j]
if alphai > 0 and alphai < self.c:
return b1
elif alphaj > 0 and alphaj < self.c:
return b2
else:
return (b1 + b2) / 2
def sign(self,x): # 符号函数
if x > 0:
return 1
elif x < 0:
return -1
else:
return 0
def acc(self,kmat): # 计算正确率,判断函数
y_hat = np.sum(self.alpha*self.ydata*kmat,1)+self.b
y_pre = np.zeros_like(y_hat)
y_pre[y_hat > 0] = 1
y_pre[y_hat < 0] = -1
a = np.sum(y_pre == self.ydata)
return a / len(self.xdata)
def al(self,kmat): # 训练函数输出alpha,b
alphav = self.alpha.copy()
while self.acc(kmat) < 0.90:
tflist = np.zeros(len(self.alpha))
temp = self.ydata * (np.sum(self.alpha*self.ydata*kmat,1)+self.b)
tflist[temp<1 | ((temp==1) & (self.alpha == 0)) | ((temp > 1) & (self.alpha != 0))] = True
for i in range(len(self.alpha)):
if tflist[i] == True:
j = self.alpha2(i,tflist,kmat)
t = self.alpha2new(i,j,kmat)
alphav[j] = t
alphav[i] = self.alpha1new(i,j,kmat)
self.b = self.bnew(i,j,kmat)
self.alpha = alphav
if (self.acc(kmat) < 0.90)==False:
return self.alpha, self.b
return self.alpha,self.b
2.4 定义训练与测试函数
def xydata(xdata,ydata,i,j):#提取i和j类的数据
x_pos = xdata[ydata == i,:]
y_pos = np.ones_like(ydata[ydata == i])
x_neg = xdata[ydata == j,:]
y_neg = np.ones_like(ydata[ydata == j]) * -1
return np.concatenate((x_pos,x_neg),axis=0),np.concatenate((y_pos,y_neg))
def train(xdata,ydata):
xdatas=[]
ydatas=[]
names=[]#储存ij标签
alphas=[]#储存alpha
bs=[]#储存b
for i in range(num_1):
for j in range(i+1,num_2):
x,y=xydata(xdata,ydata,i,j)
xdatas.append(x)
ydatas.append(y)
s=Svc(xdata,ydata)
s.xdata = x
s.ydata = y
kmat=s.kernelmat()
s.alpha=np.ones(len(s.xdata))
aa,bb=s.al(kmat)
names.append([i, j])
alphas.append(aa)
bs.append(bb)
print([i,j],len(x))
return xdatas,ydatas,names,alphas,bs
def test(xdatas, ydatas, xdata, ydata, names, alphas, bs):
a=np.zeros((len(xdata),int(num_1*num_2/2)),dtype=np.int32)
for j in range(int(num_1*num_2/2)):
b = np.dot(np.array(xdatas[j]),np.array(xdata).T)
b = np.multiply(b,np.expand_dims(np.array(ydatas[j]),0).T)
b = np.multiply(b,np.expand_dims(np.array(alphas[j]),0).T)
b = np.sum(b,0) + bs[j]
a[b.T > 0, names[j][0]] += 1
a[b.T <=0, names[j][1]] += 1
n=(a.argmax(1) ==ydata)
return n.sum()/len(xdata)
2.5、主程序
由于SVM求解时间较长,因此仅取前4000个样本进行超平面的求解,测试精度大概77%。
def main():
xdatas,ydatas, names, alphas, bs=train(trainimage[:4000,:],trainlabel[:4000])
testresult=test(xdatas, ydatas, testimage,testlabel,names,alphas,bs)
print(testresult)
if __name__=="__main__":
main()