支持向量机
在本练习中,我们将使用高斯核函数的支持向量机(SVM)来构建垃圾邮件分类器。
数据集1
先在2D数据集上进行实验。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
# Read the MATLAB file for the first dataset.
raw_data = loadmat('ex6data1.mat')
# Put the 'X' entry into a DataFrame with feature columns named X1 and X2.
data=pd.DataFrame(raw_data.get('X'),columns=['X1','X2'])
# Attach the 'y' entry as the label column (the plotting code below treats 1 as positive, 0 as negative).
data['y'] = raw_data.get('y')
# Preview the first rows (rendered as the cell output in a notebook).
data.head()
 | X1 | X2 | y |
---|---|---|---|
0 | 1.9643 | 4.5957 | 1 |
1 | 2.2753 | 3.8589 | 1 |
2 | 2.9781 | 4.5651 | 1 |
3 | 2.9320 | 3.5519 | 1 |
4 | 3.5772 | 2.8560 | 1 |
# Visualisation
def plot_init_data(data, fig, ax):
    """Scatter-plot the positive and negative samples of ``data`` on ``ax``.

    Args:
        data: Table with feature columns ``'X1'`` and ``'X2'`` and a label
            column ``'y'``. Rows with ``y == 1`` are drawn as positive and
            rows with ``y == 0`` as negative; rows with any other label are
            not drawn.
        fig: Unused; accepted so existing callers that pass the figure
            together with the axes keep working.
        ax: Object whose ``scatter`` method is called once per class
            (positives first, then negatives).

    Returns:
        None. The only effect is the two ``ax.scatter`` calls.
    """
    positive = data[data['y'] == 1]
    negative = data[data['y'] == 0]
    # Red circles for positives and blue crosses for negatives; the labels
    # are what a later ``ax.legend()`` call will display.
    ax.scatter(positive['X1'], positive['X2'], s=50, marker='o', c='r', label='positive')
    ax.scatter(negative['X1'], negative['X2'], s=50, marker='x', c='b', label='negative')
# Create a 12x8 figure with a single axes object.
fig,ax = plt.subplots(figsize=(12,8))
# Draw the positive/negative samples of the dataset on that axes.
plot_init_data(data,fig,ax)
# Show the legend built from the labels passed to the scatter calls.
ax.legend()
plt.show()
可以看出左上角有一个异常点,但是整体上两类样本仍然近似线性可分,所以可以调用线性支持向量机来学习类边界。
令C=1
from sklearn import svm
# Linear SVM with hinge loss, regularisation parameter C=1 and at most 1000 iterations.
svc = svm.LinearSVC(C=1,loss='hinge',max_iter=1000)
# Bare expression: in a notebook this displays the estimator and its parameters.
svc
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
penalty='l2', random_state=None, tol=0.0001, verbose=0)
# Fit the classifier on the two feature columns against the label column.
svc.fit(data[['X1','X2']],data['y'])
# Score on the same data used for fitting, i.e. this is training-set performance
# (the notebook shows the returned value as the cell output).
svc.score(data[['X1','X2']],data['y'])
0.9803921568627451
# 可视化分类边界
def find_decision_boundary(svc,x1min,x2min,x1max,x2max,diff):
x1 = np.linspace(x1min,x1max,1000)
x2 = np.linspace(x2min,x2max,1000)
cordinates = [(x,y)