吴恩达机器学习(6)SVM

一、线性SVM

1 导入数据

(1)导入库

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt

(2)导入数据

data = sio.loadmat('./data/ex6data1.mat')

data.keys()

X,y = data['X'],data['y']
X.shape,y.shape

(3)可视化

def plot_data():
    """Scatter-plot the two columns of the global ``X``, colored by the global ``y``.

    Draws on the current matplotlib figure and labels the axes; returns nothing.
    """
    point_labels = y.flatten()
    first_column, second_column = X[:, 0], X[:, 1]
    plt.scatter(first_column, second_column, c=point_labels, cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')

plot_data()

2 SVM模型

(1)导入库

from sklearn.svm import SVC

(2)构建模型

svc1 = SVC(C=1,kernel='linear')  # instantiate the classifier: C is the error-term penalty coefficient, and a linear kernel is selected
svc1.fit(X,y.flatten())

(3)预测

svc1.predict(X)

svc1.score(X,y.flatten())

(4)分界函数

def plot_boundary(model, x_range=(-0.5, 4.5), y_range=(1.3, 5), resolution=500):
    """Draw the decision boundary of a fitted classifier as contour lines.

    The model is evaluated on a regular 2-D grid and the contours of its
    predicted labels are drawn on the current matplotlib axes.

    Args:
        model: Object exposing ``predict(points)``; it is called with an array
            of two columns (one grid point per row) and must return one value
            per row.
        x_range: ``(min, max)`` extent of the grid along the horizontal axis.
        y_range: ``(min, max)`` extent of the grid along the vertical axis.
        resolution: Number of grid samples along each axis.

    The defaults reproduce the previously hard-coded plotting window, so
    ``plot_boundary(model)`` behaves exactly as before.
    """
    x_min, x_max = x_range
    y_min, y_max = y_range
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # One row per grid point: column 0 is the x coordinate, column 1 the y coordinate.
    grid_points = np.c_[xx.flatten(), yy.flatten()]
    z = model.predict(grid_points)

    # Back to the grid shape so contour() can locate where the prediction changes.
    zz = z.reshape(xx.shape)
    plt.contour(xx, yy, zz)

plot_boundary(svc1)
plot_data()

(5)调整误差项惩罚系数

svc100 = SVC(C=100,kernel='linear')
svc100.fit(X,y.flatten())

svc100.predict(X)

svc100.score(X,y.flatten())

plot_boundary(svc100)
plot_data()

误差项惩罚系数C越大,容错率越低,越易过拟合。

二、非线性SVM

1 导入数据

(1)导入库

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC

(2)导入数据

data = sio.loadmat('./data/ex6data2.mat')

data.keys()

X,y = data['X'],data['y']
X.shape,y.shape

(3)可视化

def plot_data():
    """Scatter-plot the two columns of the global ``X``, colored by the global ``y``.

    Draws on the current matplotlib figure and labels the axes; returns nothing.
    """
    point_labels = y.flatten()
    first_column, second_column = X[:, 0], X[:, 1]
    plt.scatter(first_column, second_column, c=point_labels, cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')

plot_data()

2 SVM模型

(1)构建模型

svc1 = SVC(C=1,kernel='rbf',gamma=1000)  # instantiate the classifier: C is the error-term penalty coefficient, and the Gaussian (RBF) kernel is selected
svc1.fit(X,y.flatten())

(2)预测准确率

svc1.score(X,y.flatten())

(3)分界函数

def plot_boundary(model, x_range=(0, 1), y_range=(0.4, 1), resolution=500):
    """Draw the decision boundary of a fitted classifier as contour lines.

    The model is evaluated on a regular 2-D grid and the contours of its
    predicted labels are drawn on the current matplotlib axes.

    Args:
        model: Object exposing ``predict(points)``; it is called with an array
            of two columns (one grid point per row) and must return one value
            per row.
        x_range: ``(min, max)`` extent of the grid along the horizontal axis.
        y_range: ``(min, max)`` extent of the grid along the vertical axis.
        resolution: Number of grid samples along each axis.

    The defaults reproduce the previously hard-coded plotting window, so
    ``plot_boundary(model)`` behaves exactly as before.
    """
    x_min, x_max = x_range
    y_min, y_max = y_range
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # One row per grid point: column 0 is the x coordinate, column 1 the y coordinate.
    grid_points = np.c_[xx.flatten(), yy.flatten()]
    z = model.predict(grid_points)

    # Back to the grid shape so contour() can locate where the prediction changes.
    zz = z.reshape(xx.shape)
    plt.contour(xx, yy, zz)

plot_boundary(svc1)
plot_data()

gamma 值越大(对应的 σ 越小,sklearn 中 gamma 相当于 1/(2σ²)),模型复杂度越高,同时也越易过拟合;
gamma 值越小(对应的 σ 越大),模型复杂度越低,同时也越易欠拟合。

三、寻找最优参数 C 和 gamma

1 导入数据

(1)导入库

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from sklearn.svm import SVC

(2)导入数据

mat = sio.loadmat('data/ex6data3.mat')
X, y = mat['X'], mat['y']
Xval, yval = mat['Xval'], mat['yval']

(3)可视化

def plot_data():
    """Scatter-plot the two columns of the global ``X``, colored by the global ``y``.

    Draws on the current matplotlib figure and labels the axes; returns nothing.
    """
    plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='jet')
    plt.xlabel('x1')
    plt.ylabel('y1')

# The call must start at column 0; a stray leading space here is an IndentationError.
plot_data()

2 寻优过程

(1)待选参数

Cvalues = [3, 10, 30, 100,0.01, 0.03, 0.1, 0.3,1 ]  # 9 candidate values for the penalty coefficient C
gammas =  [1 ,3, 10, 30, 100,0.01, 0.03, 0.1, 0.3]  # 9 candidate values for the RBF kernel's gamma

(2)参数寻优

# Exhaustive search over every (C, gamma) pair: fit on the training data and
# keep the pair with the highest accuracy on the validation data. Because the
# comparison is strict, ties keep the pair that was tried first.
best_score, best_params = 0, (0, 0)
train_labels = y.flatten()
val_labels = yval.flatten()

for c in Cvalues:
    for gamma in gammas:
        svc = SVC(C=c, kernel='rbf', gamma=gamma)
        svc.fit(X, train_labels)
        score = svc.score(Xval, val_labels)
        if score > best_score:
            best_score, best_params = score, (c, gamma)

print(best_score, best_params)

(3)模型构建

# NOTE(review): C=0.3 and gamma=100 are hard-coded here; presumably they are the
# best pair printed by the parameter search — confirm before reusing with other data.
svc2 = SVC(C=0.3,kernel='rbf',gamma=100)
svc2.fit(X,y.flatten())

(4)分界函数

def plot_boundary(model, x_range=(-0.6, 0.4), y_range=(-0.7, 0.6), resolution=500):
    """Draw the decision boundary of a fitted classifier as contour lines.

    The model is evaluated on a regular 2-D grid and the contours of its
    predicted labels are drawn on the current matplotlib axes.

    Args:
        model: Object exposing ``predict(points)``; it is called with an array
            of two columns (one grid point per row) and must return one value
            per row.
        x_range: ``(min, max)`` extent of the grid along the horizontal axis.
        y_range: ``(min, max)`` extent of the grid along the vertical axis.
        resolution: Number of grid samples along each axis.

    The defaults reproduce the previously hard-coded plotting window, so
    ``plot_boundary(model)`` behaves exactly as before.
    """
    x_min, x_max = x_range
    y_min, y_max = y_range
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))

    # One row per grid point: column 0 is the x coordinate, column 1 the y coordinate.
    grid_points = np.c_[xx.flatten(), yy.flatten()]
    z = model.predict(grid_points)

    # Back to the grid shape so contour() can locate where the prediction changes.
    zz = z.reshape(xx.shape)
    plt.contour(xx, yy, zz)

plot_boundary(svc2)
plot_data()

四、例题:垃圾邮件识别

1 数据预处理

(1)导入数据

# Read the whole sample email into memory, then echo it for inspection.
with open('data/emailSample1.txt', 'r') as sample_file:
    sampe_email = sample_file.read()
print(sampe_email)

(2)数据预处理函数定义

'''
预处理主要包括以下9个部分:
  1. 将大小写统一成小写字母;
  2. 移除所有HTML标签,只保留内容。
  3. 将所有的网址替换为字符串 “httpaddr”.
  4. 将所有的邮箱地址替换为 “emailaddr”
  5. 将所有dollar符号($)替换为“dollar”.
  6. 将所有数字替换为“number”
  7. 将所有单词还原为词源,词干提取
  8. 移除所有非文字类型
  9.去除空字符串‘’
'''

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn import svm
import nltk.stem as ns
import re

def preprocessing(email):
    """Normalize a raw email body and return its stemmed word tokens.

    The text is lower-cased, HTML tags are removed, and URLs, email addresses,
    dollar signs and digit runs are replaced by the placeholder words
    ``httpaddr``, ``emailaddr``, ``dollar`` and ``number``. The result is split
    on whitespace and punctuation, each token is stripped of any remaining
    non-alphanumeric characters, and the non-empty tokens are stemmed.

    Args:
        email: Raw email text as a string.

    Returns:
        List of stemmed tokens in order of appearance, with no empty strings.
    """
    # 1. Lower-case everything.
    email = email.lower()

    # 2. Strip HTML tags. The ``+`` is required: without it only tags with a
    #    single character between the angle brackets would be removed.
    email = re.sub(r'<[^<>]+>', ' ', email)

    # 3. Replace URLs with "httpaddr".
    email = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email)

    # 4. Replace email addresses with "emailaddr".
    email = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email)

    # 5. Replace runs of dollar signs with "dollar".
    email = re.sub(r'[\$]+', 'dollar', email)

    # 6. Replace every run of digits with "number".
    email = re.sub(r'[0-9]+', 'number', email)

    # 7. Tokenize. ``\s`` is part of the delimiter set so that words separated
    #    only by a newline or tab are not glued together.
    tokens = re.split(r'[\s@$/#.\-:&*+=\[\]?!(){},\'">_<;%]', email)

    stemmer = ns.SnowballStemmer('english')
    tokenlist = []
    for token in tokens:
        # 8. Remove non-alphanumeric characters from the token itself
        #    (not from the whole email) before it is stemmed.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)

        # 9. Skip tokens that are empty after cleaning.
        if not token:
            continue
        tokenlist.append(stemmer.stem(token))

    return tokenlist

(3)实例化

email = preprocessing(sampe_email)

(3)单词索引

def email2VocabIndices(email, vocab):
    """Return the vocabulary indices of the words that occur in *email*.

    Args:
        email: Raw email text; it is tokenized with ``preprocessing``.
        vocab: Vocabulary words as a sequence or array. It is flattened first,
            so a single-column 2-D array is accepted as well.

    Returns:
        List of 0-based positions in the flattened vocabulary, in ascending
        order, whose word appears among the email's tokens. These are indices
        into the vocabulary, not positions in the token list.
    """
    token = preprocessing(email)
    print(token)

    # Set membership keeps the scan over the vocabulary linear.
    present = set(token)
    words = np.ravel(vocab)
    index = [i for i, word in enumerate(words) if word in present]
    return index

(4)词向量函数定义

def email2FeatureVector(email):
    """Convert an email into a 0/1 vector with one entry per vocabulary row.

    The vocabulary is read from ``data/vocab.txt``. Every index returned by
    ``email2VocabIndices`` is set to 1 and all other entries stay 0. The index
    list is also printed.
    """
    vocab = pd.read_table('data/vocab.txt', names=['words']).values  # ndarray of vocabulary rows
    hit_indices = email2VocabIndices(email, vocab)
    print(hit_indices)

    vector = np.zeros(len(vocab))
    # Mark every reported index.
    for position in hit_indices:
        vector[position] = 1
    return vector

(5)实例化

import pandas as pd
vector = email2FeatureVector(sampe_email)
print('length of vector = {}\nnum of non-zero = {}'.format(len(vector), int(vector.sum())))

vector.shape

2 垃圾邮件识别

(1)导入库

import scipy.io as sio
from sklearn.svm import SVC

(2)导入数据

# training data
data1 = sio.loadmat('data/spamTrain.mat')
X, y = data1['X'], data1['y']
 
# Testing data
data2 = sio.loadmat('data/spamTest.mat')
Xtest, ytest = data2['Xtest'], data2['ytest']

(3)参数寻优

# Candidate penalty coefficients for the linear SVM.
Cvalues = [3, 10, 30, 100, 0.01, 0.03, 0.1, 0.3, 1]

# NOTE(review): candidates are ranked by their accuracy on Xtest/ytest, so the
# best score reported here is measured on the same data used to pick C.
best_score, best_param = 0, 0
train_labels = y.flatten()
test_labels = ytest.flatten()

for c in Cvalues:
    svc = SVC(C=c, kernel='linear')
    svc.fit(X, train_labels)
    score = svc.score(Xtest, test_labels)
    # Strict comparison: on ties the earlier candidate is kept.
    if score > best_score:
        best_score, best_param = score, c

print(best_score, best_param)

(4)模型构建

# Final linear model. C is passed by keyword because SVC's constructor
# parameters are keyword-only in current scikit-learn releases; a positional
# 0.03 is rejected there.
svc = SVC(C=0.03, kernel='linear')
svc.fit(X, y.flatten())

# Accuracy on the training data and on the test data.
score_train = svc.score(X, y.flatten())
score_test = svc.score(Xtest, ytest.flatten())

(5)预测结果

print(score_train,score_test)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值