The SVC class handles classification tasks, while the SVR class handles numerical regression tasks.
The kernel function an SVM uses is selected by the kernel parameter:
Linear kernel: specify the parameter C, the penalty strength for samples that violate the maximum-margin rule.
Polynomial kernel: specify the parameters C and degree (the order of the polynomial).
Gaussian (RBF) kernel: specify the parameters C and gamma.
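As a minimal sketch of the shared interface, here is how SVC and SVR are constructed and fitted on made-up toy data (the values and parameter settings below are purely illustrative, not tuned):
import numpy as np
from sklearn import svm

X_toy = np.array([[0.0], [1.0], [2.0], [3.0]])   # toy data, one feature per sample
y_cls = np.array([0, 0, 1, 1])                   # class labels for SVC
y_reg = np.array([0.1, 0.9, 2.1, 2.9])           # continuous targets for SVR

clf_toy = svm.SVC(C=1.0, kernel='rbf', gamma=0.5).fit(X_toy, y_cls)
reg_toy = svm.SVR(C=1.0, kernel='rbf', gamma=0.5).fit(X_toy, y_reg)
print(clf_toy.predict([[1.5]]), reg_toy.predict([[1.5]]))  # a class label vs. a real value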
# Plot the separating hyperplane
import numpy as np
import matplotlib.pyplot as plt

def plot_hyperplane(clf, X, y, h=0.02, draw_sv=True, title='hyperplane'):
    # create a mesh to plot in
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # meshgrid() turns the x and y coordinates into grid data, because a
    # contour plot is drawn by adding a height value on top of a grid.
    # np.arange() takes one, two, or three arguments:
    # 1) one argument: it is the stop value; start defaults to 0, step to 1
    # 2) two arguments: start and stop; step defaults to 1
    # 3) three arguments: start, stop, and step; the step may be fractional
    plt.title(title)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())  # hide the x-axis ticks
    plt.yticks(())  # hide the y-axis ticks
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])  # predict a class for every mesh point
    # ravel() flattens a multi-dimensional array into one dimension.
    # np.r_ stacks arrays vertically (top to bottom) and requires equal column counts;
    # np.c_ stacks arrays horizontally (side by side) and requires equal row counts.
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap='hot', alpha=0.5)  # fill the decision regions
    # contour() and contourf() both draw contour plots; contour() draws only
    # the contour lines, while contourf() fills the regions between them.
    markers = ['o', 's', '^']
    colors = ['b', 'r', 'c']
    labels = np.unique(y)
    for label in labels:
        plt.scatter(X[y == label][:, 0], X[y == label][:, 1],
                    c=colors[label], marker=markers[label])
    # plot the support vectors
    if draw_sv:
        sv = clf.support_vectors_  # the support vectors found by the classifier
        plt.scatter(sv[:, 0], sv[:, 1], c='y', marker='x')
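To make the comments above concrete, here is a tiny standalone demo of what meshgrid, ravel, and np.c_ produce (the ranges are arbitrary):
import numpy as np

gx, gy = np.meshgrid(np.arange(0, 3, 1), np.arange(0, 2, 1))
print(gx.shape)      # (2, 3): one row of x coordinates for every y value
print(gx.ravel())    # [0 1 2 0 1 2]: flattened to one dimension
print(np.c_[gx.ravel(), gy.ravel()].shape)  # (6, 2): one (x, y) pair per mesh point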
First, try the linear kernel:
from sklearn import svm
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X, y = make_blobs(n_samples=100, centers=2, random_state=0, cluster_std=0.3)
# generate the dataset:
# n_samples: number of samples
# n_features: number of features per sample
# centers: number of cluster centers, i.e. the number of label classes
# random_state: random seed, fixing the generated data
# cluster_std: standard deviation of each cluster
clf = svm.SVC(C=1.0, kernel='linear')
clf.fit(X, y)
plt.figure(figsize=(12, 4), dpi=144)
plot_hyperplane(clf, X, y, h=0.1, title='Separating hyperplane classifier')
Compare the linear kernel, the polynomial kernel, and the Gaussian kernel:
from sklearn import svm
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
X,y=make_blobs(n_samples=100,centers=3,random_state=0,cluster_std=0.8)
clf_linear=svm.SVC(C=1.0,kernel='linear')
clf_poly=svm.SVC(C=1.0,kernel='poly',degree=3)
clf_rbf=svm.SVC(C=1.0,kernel='rbf',gamma=0.5)
clf_rbf2=svm.SVC(C=1.0,kernel='rbf',gamma=0.1)
plt.figure(figsize=(10,10),dpi=144)
clfs=[clf_linear,clf_poly,clf_rbf,clf_rbf2]
titles=['Linear Kernel','Polynomial Kernel with Degree=3','Gaussian Kernel with gamma=0.5','Gaussian Kernel with gamma=0.1']
for i, clf in enumerate(clfs):
    clf.fit(X, y)
    plt.subplot(2, 2, i + 1)
    plot_hyperplane(clf, X, y, h=0.1, title=titles[i])
The points marked with × are the support vectors. Why are points far from the separating hyperplane support vectors too, and not only the points closest to it?
Because the Gaussian kernel implicitly maps the input feature vectors into an infinite-dimensional space; it is in that high-dimensional space that these points lie close to the separating hyperplane and act as support vectors.
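The fitted classifier records exactly which training samples became support vectors; a quick sketch for inspecting them (reusing the clf_rbf model fitted in the loop above):
print(clf_rbf.n_support_)               # number of support vectors per class
print(clf_rbf.support_)                 # indices of the support vectors within X
print(clf_rbf.support_vectors_.shape)   # the support vector coordinates themselves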
Now for the main task: predicting breast cancer.
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
cancer=load_breast_cancer()
X=cancer.data
y=cancer.target
print("data shape:{0},no. positive:{1},no. negative:{2}".format(X.shape,y[y==1].shape,y[y==0].shape))
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)
Hypothesis: the dataset is small, so the Gaussian kernel probably won't perform very well. Verify:
from sklearn.svm import SVC
clf_rbf_cancer = SVC(C=1.0, kernel='rbf', gamma=0.1)
clf_rbf_cancer.fit(X_train, y_train)
rbf_cancer_train_score = clf_rbf_cancer.score(X_train, y_train)
rbf_cancer_test_score = clf_rbf_cancer.score(X_test, y_test)
print('rbf : train_score:{0},test_score:{1}'.format(rbf_cancer_train_score, rbf_cancer_test_score))
Grid search for the best parameter:
from sklearn.model_selection import GridSearchCV
import numpy as np
gammas = np.linspace(0.00001, 0.0003, 30)  # gamma must be strictly positive
param_grid = {'gamma': gammas}
grid_clf = GridSearchCV(SVC(), param_grid=param_grid, cv=5, return_train_score=True)
grid_clf.fit(X, y)
print("best param:{0} best score:{1}".format(grid_clf.best_params_, grid_clf.best_score_))
Plot the score for each parameter value:
def plot_curve(train_sizes, cv_results, xlabel):
    train_scores_mean = cv_results['mean_train_score']
    train_scores_std = cv_results['std_train_score']
    test_scores_mean = cv_results['mean_test_score']
    test_scores_std = cv_results['std_test_score']
    plt.figure(figsize=(10, 6), dpi=144)
    plt.title('parameter tuning')
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel('score')
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, '.--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, '.-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")

plot_curve(gammas, grid_clf.cv_results_, xlabel='rbf gammas')
# plt.fill_between(x, 0, y, facecolor='green', alpha=0.3)
# x: the x-coordinates over which the region is filled
# 0: the lower boundary of the filled region
# y: the upper boundary, here the curve y
# facecolor: the fill color
# alpha: opacity in [0, 1]; the larger the value, the more opaque the fill
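As a self-contained sketch of fill_between with the arguments described above (the curve itself is arbitrary):
import numpy as np
import matplotlib.pyplot as plt

xs = np.linspace(0, 2 * np.pi, 100)
ys = np.sin(xs)
plt.plot(xs, ys, color='green')
plt.fill_between(xs, 0, ys, facecolor='green', alpha=0.3)  # fill between the x-axis and the curve
plt.show()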
Plot the learning curve of the Gaussian kernel with gamma=0.01:
# plot the learning curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()  # draw a background grid
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")  # add a legend
    return plt
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)  # 10 random 80/20 splits
plot_learning_curve(SVC(C=1.0, kernel='rbf', gamma=0.01), 'RBF learning curve', X, y, cv=cv)
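Note that ShuffleSplit draws n_splits independent random splits rather than 10 disjoint folds; a quick way to see what it generates:
for train_idx, test_idx in cv.split(X):
    print(len(train_idx), len(test_idx))  # each split holds out 20% of the samples at random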
The Gaussian kernel does not perform well here, so try the polynomial kernel instead:
# second-degree polynomial kernel
from sklearn.svm import SVC
clf_poly2 = SVC(C=1.0, kernel='poly', degree=2)
clf_poly2.fit(X_train, y_train)
poly2_train_score = clf_poly2.score(X_train, y_train)
poly2_test_score = clf_poly2.score(X_test, y_test)
print('poly : train_score:{0},test_score:{1}'.format(poly2_train_score, poly2_test_score))