SVM(支持向量机)——预测乳腺癌

 

SVC类是用来进行分类的任务, SVR 类是用来进行数值回归任务的

SVM使用的核函数由参数kernel指定:
线性核函数(kernel='linear'):指定参数C,表示对不符合最大间距规则的样本的惩罚力度
多项式核函数(kernel='poly'):指定参数C、degree(阶数)
高斯核函数(kernel='rbf'):指定参数C、gamma
 

#画出分隔超平面
import numpy as np
def plot_hyperplane(clf, X, y, h=0.02, draw_sv=True, title='hyperplan'):
    """Plot the decision regions of a fitted classifier over 2-D samples.

    Everything is drawn on the current matplotlib axes through the
    module-level ``plt``; nothing is returned.

    Args:
        clf: Fitted classifier exposing ``predict``. It must also expose
            ``support_vectors_`` when ``draw_sv`` is true.
        X: Sample array; columns 0 and 1 are used as the two plot axes.
        y: Class label of each row of ``X``.
        h: Step of the mesh grid on which ``clf.predict`` is evaluated.
        draw_sv: Whether to overlay the support vectors as yellow crosses.
        title: Title of the plot.
    """
    # Build a mesh that covers the samples with a margin of 1 on every side.
    # np.arange(start, stop, step) accepts a fractional step.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    plt.title(title)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())  # hide x-axis ticks
    plt.yticks(())  # hide y-axis ticks

    # ravel() flattens each grid to 1-D and np.c_ stacks the two flattened
    # arrays as columns, giving one (x, y) row per mesh point to classify.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Back to the grid shape so the predictions can be drawn as filled
    # contours (contourf fills the regions, contour would only draw lines).
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap='hot', alpha=0.5)

    markers = ['o', 's', '^']
    colors = ['b', 'r', 'c']
    # Pick the style by the position of the label among the sorted unique
    # labels, cycling when there are more classes than styles. Indexing the
    # lists with the label value itself fails for labels outside 0..2 or for
    # non-integer labels.
    for idx, label in enumerate(np.unique(y)):
        mask = y == label
        plt.scatter(X[mask, 0], X[mask, 1],
                    c=colors[idx % len(colors)],
                    marker=markers[idx % len(markers)])

    # Overlay the support vectors.
    if draw_sv:
        sv = clf.support_vectors_
        plt.scatter(sv[:, 0], sv[:, 1], c='y', marker='x')

先用线性核函数: 

from sklearn import svm
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Use the SimHei font so that the Chinese plot title can be rendered.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Generate a toy data set with make_blobs:
#   n_samples    - number of samples
#   centers      - number of cluster centres, i.e. the number of label classes
#   random_state - random seed, which fixes the generated data
#   cluster_std  - standard deviation of each cluster
#   (n_features, not passed here, is the number of features per sample)
X, y = make_blobs(
    n_samples=100,
    centers=2,
    random_state=0,
    cluster_std=0.3,
)

# Fit a linear-kernel SVM and draw its decision regions.
clf = svm.SVC(C=1.0, kernel='linear')
clf.fit(X, y)
plt.figure(figsize=(12, 4), dpi=144)
plot_hyperplane(clf, X, y, h=0.1, title="分隔超平面分类算法")

 比较线性核函数、多项式核函数、高斯核函数

from sklearn import svm
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt


# Three-class toy data set used to compare the kernels.
X, y = make_blobs(n_samples=100, centers=3, random_state=0, cluster_std=0.8)

clf_linear = svm.SVC(C=1.0, kernel='linear')
clf_poly = svm.SVC(C=1.0, kernel='poly', degree=3)
clf_rbf = svm.SVC(C=1.0, kernel='rbf', gamma=0.5)
clf_rbf2 = svm.SVC(C=1.0, kernel='rbf', gamma=0.1)

plt.figure(figsize=(10, 10), dpi=144)

clfs = [clf_linear, clf_poly, clf_rbf, clf_rbf2]
titles = [
    'Linear Kernel',
    'Polynomial Kernel with Degree=3',
    'Gaussian Kernel with gamma=0.5',
    'Gaussian Kernel with gamma=0.1',
]

# Fit each classifier and draw it in its own cell of a 2x2 grid.
# subplot indices are 1-based, hence i + 1.
for i, (clf, title) in enumerate(zip(clfs, titles)):
    clf.fit(X, y)
    plt.subplot(2, 2, i + 1)
    plot_hyperplane(clf, X, y, h=0.1, title=title)

带x标记的点是支持向量,为什么离分割超平面最近的点是支持向量,离很远的点也是支持向量呢?

因为高斯核函数把输入特征向量映射到无限维的向量空间里;这些点在原始二维空间中看似离分隔边界很远,但在高维空间中它们落在间隔边界上或间隔之内,因此也是支持向量

正题——预测乳腺癌

from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast-cancer data set: feature matrix X and target labels y.
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Report the data shape and how many samples carry label 1 / label 0.
positive_shape = y[y == 1].shape
negative_shape = y[y == 0].shape
print("data shape:{0},no. positive:{1},no. negative:{2}".format(X.shape, positive_shape, negative_shape))

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

 猜测:数据集小,高斯核函数效果应该不是很好,验证如下:

from sklearn.svm import SVC
# Fit an RBF-kernel SVM on the breast-cancer training split.
clf_rbf_cancer = SVC(C=1.0, kernel='rbf', gamma=0.1)
clf_rbf_cancer.fit(X_train, y_train)

# Score the model that was just fitted. The earlier code scored `clf_rbf`,
# the classifier trained on the 2-feature blob demo, which is the wrong
# model and does not match the feature count of this data set.
rbf_cancer_train_score = clf_rbf_cancer.score(X_train, y_train)
rbf_cancer_test_score = clf_rbf_cancer.score(X_test, y_test)
print('rbf : train_score:{0},test_score:{1}'.format(rbf_cancer_train_score, rbf_cancer_test_score))

 网格搜索最优参数:

from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate gamma values: 30 evenly spaced points from 0 to 0.0003.
# NOTE(review): the grid includes gamma=0, which looks degenerate for an RBF
# kernel and may be rejected by some scikit-learn versions - confirm.
gammas = np.linspace(0, 0.0003, 30)
param_grid = dict(gamma=gammas)

# 5-fold cross-validated search over gamma on the full data set; the train
# scores are kept so that they can be plotted afterwards.
grid_clf = GridSearchCV(
    SVC(),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid_clf.fit(X, y)
print("best param:{0} best score:{1}".format(grid_clf.best_params_, grid_clf.best_score_))

 画出不同参数对应的得分:

def plot_curve(train_sizes, cv_results, xlabel):
    """Plot mean train and cross-validation scores with +/- one std bands.

    Args:
        train_sizes: Values for the x axis (the caller in this file passes
            the searched parameter values).
        cv_results: Mapping with the keys ``mean_train_score``,
            ``std_train_score``, ``mean_test_score`` and ``std_test_score``.
        xlabel: Label of the x axis.
    """
    train_mean, train_std = cv_results['mean_train_score'], cv_results['std_train_score']
    test_mean, test_std = cv_results['mean_test_score'], cv_results['std_test_score']

    plt.figure(figsize=(10, 6), dpi=144)
    plt.title('parameters turning')
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel('score')

    # Shaded bands: mean +/- one standard deviation (train in red, CV in green).
    for mean, std, color in ((train_mean, train_std, "r"),
                             (test_mean, test_std, "g")):
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=color)

    # Mean curves on top of the bands.
    for mean, fmt, color, label in ((train_mean, '.--', "r", "Training score"),
                                    (test_mean, '.-', "g", "Cross-validation score")):
        plt.plot(train_sizes, mean, fmt, color=color, label=label)

    plt.legend(loc="best")

# Plot the grid-search train / cross-validation scores against gamma.
plot_curve(gammas,grid_clf.cv_results_,xlabel='rbf gammas')

    # Notes on plt.fill_between(x, 0, y, facecolor='green', alpha=0.3):
    # x: the x coordinates of the region to fill; passing x fills across all of x
    # 0: the lower bound of the filled region
    # y: the upper bound of the filled region is the curve y
    # facecolor: colour of the filled region
    # alpha: opacity of the filled region in [0, 1]; larger means more opaque

 画出高斯核函数,gamma=0.01的学习曲线

#画学习曲线
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

# Use the SimHei font so that Chinese text can be rendered in plots.
plt.rcParams['font.sans-serif'] = ['SimHei']

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot the learning curve of ``estimator`` on ``(X, y)``.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the figure.
        X: Feature data handed to ``learning_curve``.
        y: Target data handed to ``learning_curve``.
        ylim: Optional ``(low, high)`` pair for the y-axis limits.
        cv: Cross-validation setting forwarded to ``learning_curve``.
        n_jobs: Forwarded to ``learning_curve``.
        train_sizes: Training-set sizes forwarded to ``learning_curve``.

    Returns:
        The module-level ``plt`` object.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Mean and standard deviation of the scores along axis 1.
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    test_mean, test_std = np.mean(test_scores, axis=1), np.std(test_scores, axis=1)

    plt.grid()  # draw the grid

    # Shaded bands: mean +/- one standard deviation.
    for mean, std, color in ((train_mean, train_std, "r"),
                             (test_mean, test_std, "g")):
        plt.fill_between(train_sizes, mean - std, mean + std,
                         alpha=0.1, color=color)

    # Mean curves on top of the bands.
    for mean, color, label in ((train_mean, "r", "Training score"),
                               (test_mean, "g", "Cross-validation score")):
        plt.plot(train_sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")  # add the legend
    return plt

from sklearn.model_selection import ShuffleSplit
# 10 random shuffle splits, each holding out 20% of the samples for testing.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)

rbf_estimator = SVC(C=1.0, kernel='rbf', gamma=0.01)
plot_learning_curve(rbf_estimator, "rbf 学习曲线", X, y, cv=cv)

高斯核函数效果不好,尝试用多项式核函数:

# SVM with a second-degree polynomial kernel
from sklearn.svm import SVC
clf_poly2 = SVC(kernel='poly', degree=2, C=1.0)
clf_poly2.fit(X_train, y_train)

# Accuracy on the training split and on the held-out test split.
poly2_train_score = clf_poly2.score(X_train, y_train)
poly2_test_score = clf_poly2.score(X_test, y_test)
print('poly : train_score:{0},test_score:{1}'.format(poly2_train_score, poly2_test_score))

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值