Python 3.7
所用数据集链接(ex6data1.mat,ex6data2.mat,ex6data3.mat),提取码:c3yy
Support Vector Machine
题目:本次将练习SVM算法的使用。所用的数据集一共有三个,其中一个是线性核数据集,另外两个是高斯核数据集。通过练习,希望读者可以学会运用SVM算法解决实际问题并对其有更加深刻的理解。
Linear kernel
首先是线性核的SVM算法
1.0 Package
引入相应包:
import numpy as np
# 矩阵处理
import matplotlib.pyplot as plt
# 绘图
import pandas as pd
# 数据处理
from scipy.io import loadmat
# 读取矩阵文件
from sklearn import svm
# 引入SVM模型
1.1 Load data
读取数据:
def load_data(path):
    """Read a MATLAB .mat file and return (raw dict, X matrix, y labels)."""
    raw = loadmat(path)
    # The exercise files store features under 'X' and labels under 'y'.
    return raw, raw['X'], raw['y']
# Load the linear-kernel training set; the shapes noted below were
# observed at runtime for ex6data1.mat.
data,x,y=load_data('ex6data1.mat')
print(x.shape) #(51,2)
print(y.shape) #(51,1)
1.2 Visualization data
可视化数据:
def view_data(x, y):
    """Scatter-plot the 2-D samples, coloring each point by its 0/1 label."""
    _, axis = plt.subplots(figsize=(6, 6))
    # The 'rainbow' colormap gives the two label values distinct colors.
    axis.scatter(x[:, 0], x[:, 1], c=y.flatten(), cmap='rainbow')
    axis.set_xlabel('x1')
    axis.set_ylabel('x2')
    axis.set_title('The train examples')
    plt.show()

view_data(x, y)
输出如下:
1.3 Train model
下面训练模型:
def train_model():
    """Fit two linear-kernel SVMs on the module-level (x, y) data.

    Returns (clf1, clf2, model1, model2). Note that sklearn's fit()
    returns the estimator itself, so each model is the same object as
    its classifier.
    """
    labels = y.flatten()
    clf1 = svm.SVC(1, kernel='linear')    # soft penalty: C = 1
    clf2 = svm.SVC(100, kernel='linear')  # strict penalty: C = 100
    model1 = clf1.fit(x, labels)
    model2 = clf2.fit(x, labels)
    return clf1, clf2, model1, model2

clf1, clf2, model1, model2 = train_model()
1.4 Decision boundary
训练完后我们需要确定决策边界:
def decision_boundary(model, x, c):
    """Draw the decision boundary of *model* over the range covered by *x*.

    Parameters
    ----------
    model : fitted classifier exposing .predict
    x     : (m, 2) array of points used to size the plotting window
    c     : contour color passed to plt.contour
    """
    # Bug fix: the original multiplied min/max by 1.2 directly, which
    # *shrinks* the window whenever a minimum is positive (and whenever a
    # maximum is negative). Pad each axis by 20% of its data range instead.
    x1_lo, x1_hi = x[:, 0].min(), x[:, 0].max()
    x2_lo, x2_hi = x[:, 1].min(), x[:, 1].max()
    pad1 = 0.2 * (x1_hi - x1_lo)
    pad2 = 0.2 * (x2_hi - x2_lo)
    x1, x2 = np.meshgrid(
        np.linspace(x1_lo - pad1, x1_hi + pad1, 200),
        np.linspace(x2_lo - pad2, x2_hi + pad2, 200),
    )
    # Predict on every grid node; np.c_ stacks the flattened grids into
    # the (N, 2) array shape the model expects.
    z = model.predict(np.c_[x1.flatten(), x2.flatten()]).reshape(x1.shape)
    plt.contour(x1, x2, z, colors=c)
# 绘制等高线
1.5 Visualization result
可视化结果:
def view_result(x, y):
    """Show the data with the C=1 (orange) and C=100 (black) boundaries."""
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(x[:, 0], x[:, 1], c=y.flatten(), cmap='rainbow')
    plt.title('SVM with C=1(orange) and 100(black)')
    # Overlay both trained models, distinguished by color.
    for fitted, color in ((model1, 'orange'), (model2, 'black')):
        decision_boundary(fitted, x, c=color)
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')
    plt.show()

view_result(x, y)
输出如下:
可以看出,C较小时,对于错误容忍度较高,此时Bias较高,Variance较低。当C较大时,对于错误容忍度较低,此时Bias较低,Variance较高,容易过拟合,同时模型鲁棒性较差。
Nonlinear kernel
非线性核
2.0 Package
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm
2.1 Load data
# Load the first Gaussian-kernel (nonlinearly separable) data set.
data2,x2,y2=load_data('ex6data2.mat')
2.2 View data
# Visualize the raw samples before training.
view_data(x2,y2)
输出如下:
2.3 Train model
显然这里需要采用高斯核进行训练:
def train_nonlinear_model():
    """Fit an RBF-kernel SVM on (x2, y2) with kernel width sigma = 0.1."""
    sigma = 0.1
    # RBF parameterization: gamma = 1 / (2 * sigma^2). A small sigma
    # (large gamma) narrows the kernel, lowering bias but raising variance.
    gamma = np.power(sigma, -2) / 2
    clf = svm.SVC(C=1, gamma=gamma)
    # fit() returns the estimator itself, so clf and the model coincide.
    return clf, clf.fit(x2, y2.flatten())

clf, model = train_nonlinear_model()
2.4 Decision boundary
决策边界:
# Draw the boundary of the sigma=0.1 RBF model over the x2 data range.
decision_boundary(model,x2,c='black')
2.5 Visualization result
可视化结果:
# Overlay the learned decision boundary on the training scatter plot.
fig,ax=plt.subplots(figsize=(6,6))
ax.scatter(x2[:,0],x2[:,1],c=y2.flatten(),cmap='rainbow')
decision_boundary(model,x2,c='black')
plt.show()
输出如下:
效果非常不错。
Another example
再来看一个例子,其中仍然是非线性核,但是数据集中增加了交叉验证集,可以对模型进行评估。
3.0 Package
# 所用到的库和前面相同
3.1 Load data
# Load the third data set, which also ships a cross-validation split.
data3,_,_=load_data('ex6data3.mat')
x3,y3=data3['X'],data3['y']
# Cross-validation features/labels, used only for model selection.
x_cv,y_cv=data3['Xval'],data3['yval']
3.2 Visualization data
# Visualize the third training set before the grid search.
view_data(x3,y3)
输出如下:
3.3 Train model
# Candidate values tried for both the penalty C and the kernel width sigma.
clist = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 50]
sigmalist = clist
best_pair, best_score = (0, 0), 0

def train_nonlinear_model2():
    """Grid-search (C, sigma) for an RBF SVM, scored on the CV split.

    Side effect: updates the module-level best_pair / best_score.
    Returns (best_score, best_pair).
    """
    global best_pair
    global best_score
    labels = y3.flatten()
    for c in clist:
        for sigma in sigmalist:
            fitted = svm.SVC(C=c, kernel='rbf',
                             gamma=np.power(sigma, -2.) / 2).fit(x3, labels)
            # Selection must use the held-out CV set, not the training data.
            cv_score = fitted.score(x_cv, y_cv)
            if cv_score > best_score:
                best_score, best_pair = cv_score, (c, sigma)
    return best_score, best_pair

best_score, best_pair = train_nonlinear_model2()
print('best_pair={},best_score={}'.format(best_pair, best_score))
输出如下:
3.4 Decision boundary
# NOTE(review): at this point `model` is still the section-2 model fit on
# x2; the best-(C, sigma) model for x3 is only refit in section 3.5 below.
decision_boundary(model,x3,c='black')
3.5 Visualization result
# Refit the SVM with the best (C, sigma) found by the grid search and
# plot its decision boundary over the training data.
clf=svm.SVC(C=best_pair[0],kernel='rbf',gamma=np.power(best_pair[1],-2.)/2)
model=clf.fit(x3,y3.flatten())
fig,ax=plt.subplots(figsize=(6,6))
ax.scatter(x3[:,0],x3[:,1],c=y3.flatten(),cmap='rainbow')
decision_boundary(model,x3,c='black')
plt.show()
输出如下:
All
给出完整代码:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.io import loadmat
from sklearn import svm
import seaborn as sn
def load_data(path):
    """Read a MATLAB .mat file; return the raw dict plus the 'X'/'y' arrays."""
    raw = loadmat(path)
    return raw, raw['X'], raw['y']

data, x, y = load_data('ex6data1.mat')
print(x.shape)
print(y.shape)
def view_data(x, y):
    """Scatter-plot the 2-D training samples, colored by their 0/1 label."""
    _, axis = plt.subplots(figsize=(6, 6))
    axis.scatter(x[:, 0], x[:, 1], c=y.flatten(), cmap='rainbow')
    axis.set_xlabel('x1')
    axis.set_ylabel('x2')
    axis.set_title('The train examples')
    plt.show()

#view_data(x,y)
def train_model():
    """Fit two linear-kernel SVMs (C=1 and C=100) on the global (x, y)."""
    labels = y.flatten()
    clf1 = svm.SVC(1, kernel='linear')
    clf2 = svm.SVC(100, kernel='linear')
    # fit() returns the estimator, so modelN aliases clfN.
    model1 = clf1.fit(x, labels)
    model2 = clf2.fit(x, labels)
    return clf1, clf2, model1, model2

clf1, clf2, model1, model2 = train_model()
def decision_boundary(model, x, c):
    """Draw the decision boundary of *model* over the range covered by *x*.

    Parameters
    ----------
    model : fitted classifier exposing .predict
    x     : (m, 2) array of points used to size the plotting window
    c     : contour color passed to plt.contour
    """
    # Bug fix: the original window was asymmetric (min unscaled, max*1.2),
    # which distorts the view and clips it when max < 0. Pad each axis by
    # 20% of its data range instead.
    x1_lo, x1_hi = x[:, 0].min(), x[:, 0].max()
    x2_lo, x2_hi = x[:, 1].min(), x[:, 1].max()
    pad1 = 0.2 * (x1_hi - x1_lo)
    pad2 = 0.2 * (x2_hi - x2_lo)
    x1, x2 = np.meshgrid(
        np.linspace(x1_lo - pad1, x1_hi + pad1, 200),
        np.linspace(x2_lo - pad2, x2_hi + pad2, 200),
    )
    # np.c_ stacks the flattened grids into the (N, 2) shape predict expects.
    z = model.predict(np.c_[x1.flatten(), x2.flatten()]).reshape(x1.shape)
    plt.contour(x1, x2, z, colors=c)
def view_result(x, y):
    """Show the data with the C=1 (orange) and C=100 (black) boundaries."""
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(x[:, 0], x[:, 1], c=y.flatten(), cmap='rainbow')
    plt.title('SVM with C=1(orange) and 100(black)')
    for fitted, color in ((model1, 'orange'), (model2, 'black')):
        decision_boundary(fitted, x, c=color)
    ax.set_xlabel('x1')
    ax.set_ylabel('x2')
    plt.show()

#view_result(x,y)
# Load the first Gaussian-kernel data set.
data2,x2,y2=load_data('ex6data2.mat')
#view_data(x2,y2)
#def gausskernel(x1,x2,sigma):
# res=np.exp(-sum((x1-x2)**2)/(2*sigma**2))
# return res
# Hand-rolled Gaussian (RBF) kernel, kept commented out for reference.
#print(gausskernel(np.array([1,2,1]),np.array([0,4,-1]),2))
def train_nonlinear_model():
    """Fit an RBF-kernel SVM on (x2, y2) with kernel width sigma = 0.1."""
    sigma = 0.1
    # gamma = 1 / (2 * sigma^2): a small sigma gives a narrow kernel,
    # i.e. low bias and high variance.
    gamma = np.power(sigma, -2) / 2
    clf = svm.SVC(C=1, gamma=gamma)
    return clf, clf.fit(x2, y2.flatten())

clf, model = train_nonlinear_model()

# Overlay the learned boundary on the training scatter.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x2[:, 0], x2[:, 1], c=y2.flatten(), cmap='rainbow')
decision_boundary(model, x2, c='black')
ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.set_title('SVM')
plt.show()
# Third data set: includes a cross-validation split for model selection.
data3,_,_=load_data('ex6data3.mat')
x3,y3=data3['X'],data3['y']
# Held-out CV features/labels.
x_cv,y_cv=data3['Xval'],data3['yval']
#view_data(x3,y3)
# Candidate values for the penalty C and the kernel width sigma.
clist=[0.01,0.03,0.1,0.3,1,3,10,30,50]
sigmalist=clist
best_pair,best_score=(0,0),0
def train_nonlinear_model2():
    """Grid-search (C, sigma) for an RBF SVM, scored on the CV split.

    Side effect: updates the module-level best_pair / best_score.
    Returns (best_score, best_pair).
    """
    global best_pair
    global best_score
    labels = y3.flatten()
    for c in clist:
        for sigma in sigmalist:
            fitted = svm.SVC(C=c, kernel='rbf',
                             gamma=np.power(sigma, -2.) / 2).fit(x3, labels)
            # Score on the held-out CV set, never the training data.
            cv_score = fitted.score(x_cv, y_cv)
            if cv_score > best_score:
                best_score, best_pair = cv_score, (c, sigma)
    return best_score, best_pair

best_score, best_pair = train_nonlinear_model2()
print('best_pair={},best_score={}'.format(best_pair,best_score))
# Refit the SVM with the best (C, sigma) from the grid search and plot
# its decision boundary over the training data.
clf=svm.SVC(C=best_pair[0],kernel='rbf',gamma=np.power(best_pair[1],-2.)/2)
model=clf.fit(x3,y3.flatten())
fig,ax=plt.subplots(figsize=(6,6))
ax.scatter(x3[:,0],x3[:,1],c=y3.flatten(),cmap='rainbow')
decision_boundary(model,x3,c='black')
plt.show()
多加练习。
未经允许,请勿转载。
欢迎交流。