简介
本文是吴恩达《机器学习》课程习题六的python解答。习题六主题是支持向量机SVM。在练习中会在二维数据集中应用线性SVC和非线性SVC。最后会使用SVM来判断垃圾邮件。
线性SVC
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
# Load exercise data set 1 (MATLAB .mat format): loadmat returns a dict
# whose 'X' entry is the (51, 2) feature matrix and 'y' the labels.
raw_data = loadmat('../data_sets/ex6data1.mat')
data = pd.DataFrame(raw_data.get('X'), columns=['X1', 'X2'])
data['y'] = raw_data['y']
# Sanity check of the combined frame: 51 examples, 2 features + 1 label.
data.shape
# (51, 3)
# plot original data to visualize classification
def plot_init_data(data, fig, ax):
    """Scatter-plot the raw examples, marking the two classes.

    Parameters
    ----------
    data : pandas.DataFrame with feature columns 'X1', 'X2' and a
        binary label column 'y' (1 = positive, 0 = negative).
    fig : matplotlib Figure; unused, kept for interface compatibility.
    ax : matplotlib Axes to draw on.
    """
    positive = data[data['y'].isin([1])]
    negative = data[data['y'].isin([0])]
    # Fixed legend-label typos: 'postive' -> 'positive', 'negetive' -> 'negative'.
    ax.scatter(positive['X1'], positive['X2'], s=50, marker='x', label='positive')
    ax.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='negative')
# Visualize the raw training data before fitting any model.
fig,ax = plt.subplots(figsize=(9,6))
plot_init_data(data,fig,ax)
ax.legend()
plt.show()
from sklearn import svm
# Linear SVM with hinge loss. C=1 keeps a soft margin, so the single
# outlier in the upper-left is allowed to stay misclassified.
svc = svm.LinearSVC(C=1,loss='hinge')
svc.fit(data[['X1','X2']],data['y'])
# Training accuracy: 50 of 51 examples classified correctly.
svc.score(data[['X1','X2']],data['y'])
# 0.9803921568627451
# decision boundary
def find_decision_boundary(svc, x1min, x1max, x2min, x2max, diff, n=1000):
    """Approximate a fitted classifier's decision boundary by brute force.

    Evaluates ``svc.decision_function`` on an ``n``-by-``n`` grid over the
    rectangle [x1min, x1max] x [x2min, x2max] and keeps the grid points
    whose decision value lies within ``diff`` of zero, i.e. points close
    to the boundary.

    Parameters
    ----------
    svc : fitted classifier exposing ``decision_function`` over a
        DataFrame with columns 'X1' and 'X2'.
    x1min, x1max, x2min, x2max : float
        Bounds of the search rectangle.
    diff : float
        Small threshold selecting near-boundary points.
    n : int, optional
        Grid resolution per axis (default 1000, matching the original).

    Returns
    -------
    (pandas.Series, pandas.Series)
        X1 and X2 coordinates of the near-boundary grid points.
    """
    # Build the full n*n grid with meshgrid instead of a Python-level
    # nested tuple comprehension — same points in the same order
    # (X1 varies slowest), but vectorized, so far faster for n=1000.
    g1, g2 = np.meshgrid(np.linspace(x1min, x1max, n),
                         np.linspace(x2min, x2max, n), indexing='ij')
    c_val = pd.DataFrame({'X1': g1.ravel(), 'X2': g2.ravel()})
    # Signed distance to the decision surface; ~0 means "on the boundary".
    c_val['cval'] = svc.decision_function(c_val[['X1', 'X2']])
    decision = c_val[np.abs(c_val['cval']) < diff]
    return decision.X1, decision.X2
# Sample near-boundary points over the data range and overlay them
# on the raw scatter to visualize the C=1 decision boundary.
x1, x2 = find_decision_boundary(svc, 0, 4, 1.5, 5, 2 * 10**-3)
fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(x1, x2, s=10, c='r',label='Boundary')
plot_init_data(data, fig, ax)
ax.set_title('SVM (C=1) Decision Boundary')
ax.legend()
plt.show()
可以看到这里SVC给出了很好的结果。模型中的参数C是正则项权重的一种衡量。我们调整下参数C来看一下会有什么变化。
# C trades off margin width against training error: too small -> high
# bias (underfit), too large -> high variance (overfit).
svc = svm.LinearSVC(C=10000,loss='hinge')
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
# 1.0 — a perfect training score here is a symptom of overfitting
x1, x2 = find_decision_boundary(svc, 0, 4, 1.5, 5, 2 * 10**-3)
fig, ax = plt.subplots(figsize=(9,6))
ax.scatter(x1, x2, s=10, c='r',label='Boundary')
plot_init_data(data, fig, ax)
# Fixed title: the model above was trained with C=10000, not C=100.
ax.set_title('SVM (C=10000) Decision Boundary')
ax.legend()
plt.show()
这次模型把左上角的异常值也划分进去了,SVC的好处就是会留出margin,但是这个模型并没有合理的margin,典型的过拟合。
非线性SVC
非线性SVC需要使用到核函数,常用的是高斯核函数。
# Load exercise data set 2, which is not linearly separable,
# and visualize it before fitting a kernelized SVM.
raw_data = loadmat('../data_sets/ex6data2.mat')
data = pd.DataFrame(raw_data['X'], columns=['X1', 'X2'])
data['y'] = raw_data['y']
fig, ax = plt.subplots(figsize=(12,8))
plot_init_data(data, fig, ax)
ax.legend()
plt.show()
# Nonlinear SVM: Gaussian (RBF) kernel handles the non-separable data.
svc = svm.SVC(C=100,kernel='rbf')
svc.fit(data[['X1','X2']],data['y'])
svc.score(data[['X1','X2']],data['y'])
# 0.9675550405561993
X1,X2 = find_decision_boundary(svc, 0, 1, 0.4, 1, 0.01)
fig,ax = plt.subplots(figsize=(12,8))
plot_init_data(data,fig,ax)
ax.scatter(X1,X2,s=5,label='decision boundary')
ax.legend()
# Bug fix: `plt.show` was missing its call parentheses, so the figure
# was never actually displayed when run as a script.
plt.show()
这里用的rbf就是高斯核函数,此外还有linear,poly,sigmoid和precomputed核函数。详细介绍参考官网。
垃圾邮件
将垃圾邮件中是否出现特定单词作为度量来判断是否是垃圾邮件。
# Spam classification: features are binary indicators for the presence
# of each of 1899 vocabulary words in an email.
spam_train = loadmat('../data_sets/spamTrain.mat')
spam_test = loadmat('../data_sets/spamTest.mat')
X = spam_train['X']
Xtest = spam_test['Xtest']
# ravel() flattens the (n, 1) label matrices to 1-D vectors as sklearn expects.
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()
X.shape, y.shape, Xtest.shape, ytest.shape
# ((4000, 1899), (4000,), (1000, 1899), (1000,))
# Default SVC (RBF kernel, C=1) is enough for strong accuracy here.
svc = svm.SVC()
svc.fit(X, y)
print('Training accuracy = {0}%'.format(np.round(svc.score(X, y) * 100, 2)))
print('Test accuracy = {0}%'.format(np.round(svc.score(Xtest, ytest) * 100, 2)))
# Training accuracy = 99.32%
# Test accuracy = 98.7%
数据集
链接: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg 提取码: 78ah