SVM Implementation
SVM (Support Vector Machine) is a widely used classifier; I introduced the general formulation and motivation of the method in the SVM post. Many machine learning courses also derive the SVM equations, but how do you actually implement one? Here I walk through the SVM implementation in this git repo: SVM code. First, let me put the SVM formulas below.
The classic (hard-margin) SVM formulation:

$$\min_{w,b} \ \frac{1}{2}\lVert w \rVert^2 \quad \text{s.t.} \quad y_i(w^\top x_i + b) \ge 1, \quad i = 1, \dots, n$$
The relaxed (soft-margin) SVM, which no longer requires every point to lie beyond the margin and is therefore less sensitive to outliers:

$$\min_{w,b,\xi} \ \frac{1}{2}\lVert w \rVert^2 + C \sum_{i=1}^{n} \xi_i \quad \text{s.t.} \quad y_i(w^\top x_i + b) \ge 1 - \xi_i, \quad \xi_i \ge 0$$

Its dual, which is the problem the code below actually solves, is a maximization over the Lagrange multipliers $\alpha$:

$$\max_{\alpha} \ \sum_{i=1}^{n} \alpha_i - \frac{1}{2} \sum_{i=1}^{n} \sum_{j=1}^{n} \alpha_i \alpha_j y_i y_j K(x_i, x_j) \quad \text{s.t.} \quad \sum_{i=1}^{n} \alpha_i y_i = 0, \quad 0 \le \alpha_i \le C$$
The SVM implementation is as follows:
from __future__ import division, print_function
import numpy as np
import cvxopt
from mlfromscratch.utils import train_test_split, normalize, accuracy_score
from mlfromscratch.utils.kernels import *
from mlfromscratch.utils import Plot

# Hide cvxopt output
cvxopt.solvers.options['show_progress'] = False


class SupportVectorMachine(object):
    """The Support Vector Machine classifier.
    Uses cvxopt to solve the quadratic optimization problem.

    Parameters:
    -----------
    C: float
        Penalty term.
    kernel: function
        Kernel function. Can be either polynomial, rbf or linear.
    power: int
        The degree of the polynomial kernel. Will be ignored by the other
        kernel functions.
    gamma: float
        Used in the rbf kernel function.
    coef: float
        Bias term used in the polynomial kernel function.
    """
    def __init__(self, C=1, kernel=rbf_kernel, power=4, gamma=None, coef=4):
        self.C = C
        self.kernel = kernel
        self.power = power
        self.gamma = gamma
        self.coef = coef
        self.lagr_multipliers = None
        self.support_vectors = None
        self.support_vector_labels = None
        self.intercept = None

    def fit(self, X, y):
        n_samples, n_features = np.shape(X)

        # Set gamma to 1/n_features by default
        if not self.gamma:
            self.gamma = 1 / n_features

        # Initialize kernel method with parameters
        self.kernel = self.kernel(
            power=self.power,
            gamma=self.gamma,
            coef=self.coef)

        # Calculate kernel matrix
        kernel_matrix = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                kernel_matrix[i, j] = self.kernel(X[i], X[j])

        # Define the quadratic optimization problem
        P = cvxopt.matrix(np.outer(y, y) * kernel_matrix, tc='d')
        q = cvxopt.matrix(np.ones(n_samples) * -1)
        A = cvxopt.matrix(y, (1, n_samples), tc='d')
        b = cvxopt.matrix(0, tc='d')

        if not self.C:
            G = cvxopt.matrix(np.identity(n_samples) * -1)
            h = cvxopt.matrix(np.zeros(n_samples))
        else:
            G_max = np.identity(n_samples) * -1
            G_min = np.identity(n_samples)
            G = cvxopt.matrix(np.vstack((G_max, G_min)))
            h_max = cvxopt.matrix(np.zeros(n_samples))
            h_min = cvxopt.matrix(np.ones(n_samples) * self.C)
            h = cvxopt.matrix(np.vstack((h_max, h_min)))

        # Solve the quadratic optimization problem using cvxopt
        minimization = cvxopt.solvers.qp(P, q, G, h, A, b)

        # Lagrange multipliers
        lagr_mult = np.ravel(minimization['x'])

        # Extract support vectors
        # Get indexes of non-zero lagr. multipliers
        idx = lagr_mult > 1e-7
        # Get the corresponding lagr. multipliers
        self.lagr_multipliers = lagr_mult[idx]
        # Get the samples that will act as support vectors
        self.support_vectors = X[idx]
        # Get the corresponding labels
        self.support_vector_labels = y[idx]

        # Calculate intercept with first support vector
        self.intercept = self.support_vector_labels[0]
        for i in range(len(self.lagr_multipliers)):
            self.intercept -= self.lagr_multipliers[i] * self.support_vector_labels[
                i] * self.kernel(self.support_vectors[i], self.support_vectors[0])

    def predict(self, X):
        y_pred = []
        # Iterate through list of samples and make predictions
        for sample in X:
            prediction = 0
            # Determine the label of the sample by the support vectors
            for i in range(len(self.lagr_multipliers)):
                prediction += self.lagr_multipliers[i] * self.support_vector_labels[
                    i] * self.kernel(self.support_vectors[i], sample)
            prediction += self.intercept
            y_pred.append(np.sign(prediction))
        return np.array(y_pred)
Reading the code from fit onward, the first step is to construct the kernel matrix:
# Calculate kernel matrix
kernel_matrix = np.zeros((n_samples, n_samples))
for i in range(n_samples):
    for j in range(n_samples):
        kernel_matrix[i, j] = self.kernel(X[i], X[j])
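Note that the kernel functions imported from mlfromscratch.utils.kernels are factories: they take the hyperparameters as keyword arguments and return a function of two samples, which is why fit can call self.kernel(power=..., gamma=..., coef=...) once and afterwards use the result as self.kernel(X[i], X[j]). A minimal sketch of what two such factories might look like (the repo's exact code may differ):

import numpy as np

def rbf_kernel(gamma, **kwargs):
    # Gaussian (RBF) kernel: k(x1, x2) = exp(-gamma * ||x1 - x2||^2)
    def f(x1, x2):
        distance = np.linalg.norm(x1 - x2) ** 2
        return np.exp(-gamma * distance)
    return f

def polynomial_kernel(power, coef, **kwargs):
    # Polynomial kernel: k(x1, x2) = (x1 . x2 + coef)^power
    def f(x1, x2):
        return (np.inner(x1, x2) + coef) ** power
    return f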
Then, starting from the dual objective above, we can see that this is a maximization problem over the coefficients $\alpha$. Since the problem is a quadratic program, it can be solved directly. The Python package cvxopt solves quadratic programs in the standard form:

$$\min_{x} \ \frac{1}{2} x^\top P x + q^\top x \quad \text{s.t.} \quad Gx \le h, \quad Ax = b$$
We therefore need to construct P, q, G, h, A, and b. For the SVM dual objective (the unknown being solved for is the vector of Lagrange multipliers $\alpha$), these matrices take the following values:
# Define the quadratic optimization problem
P = cvxopt.matrix(np.outer(y, y) * kernel_matrix, tc='d')
q = cvxopt.matrix(np.ones(n_samples) * -1)
A = cvxopt.matrix(y, (1, n_samples), tc='d')
b = cvxopt.matrix(0, tc='d')

if not self.C:
    G = cvxopt.matrix(np.identity(n_samples) * -1)
    h = cvxopt.matrix(np.zeros(n_samples))
else:
    G_max = np.identity(n_samples) * -1
    G_min = np.identity(n_samples)
    G = cvxopt.matrix(np.vstack((G_max, G_min)))
    h_max = cvxopt.matrix(np.zeros(n_samples))
    h_min = cvxopt.matrix(np.ones(n_samples) * self.C)
    h = cvxopt.matrix(np.vstack((h_max, h_min)))
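Concretely, with the decision variable $x = \alpha$ and the dual objective negated so that maximization becomes minimization, the correspondence to the standard form is:

$$P_{ij} = y_i y_j K(x_i, x_j), \qquad q = -\mathbf{1}, \qquad A = y^\top, \qquad b = 0$$

When C is set, G stacks $-I$ on top of $I$ and h stacks $\mathbf{0}$ on top of $C\mathbf{1}$, which together encode $0 \le \alpha_i \le C$; when C is None, only $-\alpha_i \le 0$ is imposed, recovering the hard-margin dual.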
You can check that this matches the dual objective given at the top. Moreover, by the KKT conditions the multiplier $\alpha$ of every non-support vector equals zero, so we locate the non-zero $\alpha$ (that is, the Lagrange multipliers corresponding to the support vectors):
# Extract support vectors
# Get indexes of non-zero lagr. multipliers
idx = lagr_mult > 1e-7
# Get the corresponding lagr. multipliers
self.lagr_multipliers = lagr_mult[idx]
# Get the samples that will act as support vectors
self.support_vectors = X[idx]
# Get the corresponding labels
self.support_vector_labels = y[idx]
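For reference, the complementary-slackness condition from the KKT system is what forces those multipliers to zero:

$$\alpha_i \left[\, y_i(w^\top x_i + b) - 1 + \xi_i \,\right] = 0$$

Any point strictly outside the margin has $y_i(w^\top x_i + b) > 1$ with $\xi_i = 0$, so its $\alpha_i$ must vanish; only points on or inside the margin can carry $\alpha_i > 0$. Because the numerical solver returns tiny floating-point values rather than exact zeros, the code uses the threshold 1e-7 instead of testing for zero.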
A (margin) support vector has functional margin exactly 1 to the separating hyperplane: in the post's original figure, the blue points on one dashed line satisfy $w^\top x + b = 1$ while the green points on the other dashed line satisfy $w^\top x + b = -1$, so taking one point of each kind and adding the two equations lets us solve for b.
Alternatively, instead of solving that pair of equations for b, we can compare function values against those of a support vector directly: any sample whose value $w^\top x + b$ falls below the support vector's should receive label −1, and any sample above it label +1. The SVM decision function can be written in terms of the dual variables (see the SVM post for details):

$$f(x) = \sum_{i=1}^{n} \alpha_i y_i K(x_i, x) + b$$

Since the Lagrange multipliers of all non-support vectors are zero, evaluating f(x) only requires the kernel products between the input point x and the support vector set. In particular, a margin support vector $(x_s, y_s)$ satisfies $y_s f(x_s) = 1$, and because $y_s = \pm 1$ this yields $b = y_s - \sum_i \alpha_i y_i K(x_i, x_s)$. The code applies this to the first support vector (strictly, the identity requires $0 < \alpha_s < C$; using the first support vector regardless is a simplification the implementation accepts):
# Calculate intercept with first support vector
self.intercept = self.support_vector_labels[0]
for i in range(len(self.lagr_multipliers)):
    self.intercept -= self.lagr_multipliers[i] * self.support_vector_labels[
        i] * self.kernel(self.support_vectors[i], self.support_vectors[0])
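Prediction then evaluates the sign of this same decision function for each new sample: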
def predict(self, X):
    y_pred = []
    # Iterate through list of samples and make predictions
    for sample in X:
        prediction = 0
        # Determine the label of the sample by the support vectors
        for i in range(len(self.lagr_multipliers)):
            prediction += self.lagr_multipliers[i] * self.support_vector_labels[
                i] * self.kernel(self.support_vectors[i], sample)
        prediction += self.intercept
        y_pred.append(np.sign(prediction))
    return np.array(y_pred)
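Finally, a minimal usage sketch. This is not from the repo; the synthetic data and hyperparameter values are my own, and it assumes SupportVectorMachine and an rbf_kernel factory like the one sketched above are in scope:

import numpy as np

# Hypothetical demo data: two Gaussian blobs labeled -1/+1
np.random.seed(0)
X = np.vstack((np.random.randn(20, 2) + 2,    # positive class near (2, 2)
               np.random.randn(20, 2) - 2))   # negative class near (-2, -2)
y = np.hstack((np.ones(20), -np.ones(20)))

# Train a soft-margin SVM with the RBF kernel
clf = SupportVectorMachine(C=1, kernel=rbf_kernel, gamma=0.5)
clf.fit(X, y)

# On this well-separated toy data the training accuracy should be high
y_pred = clf.predict(X)
print("training accuracy:", np.mean(y_pred == y))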