1.导入相应包
from sklearn.datasets import load_breast_cancer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from time import time
import datetime
import pandas as pd
2.准备数据集并可视化
# 2. Load the breast-cancer dataset: 569 samples, 30 numeric features,
#    binary target (0 = malignant, 1 = benign).
data = load_breast_cancer()
X = data.data
y = data.target
# The original bare expression `X.shape` only displays inside a notebook;
# print it explicitly so the script also reports the shape: (569, 30).
print(X.shape)
可视化散点图(仅取前两个特征):
# Scatter plot of the FIRST TWO features only, colored by class label.
# The data has 30 features; this is a 2-D peek, not the full feature space.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.xlabel(data.feature_names[0])  # label axes so the peek is interpretable
plt.ylabel(data.feature_names[1])
plt.show()
注意:这里并不是说特征本身只有两维,而是只选取了前两个特征在平面上进行可视化。
3.选择不同模型进行训练
# 3. Train an SVC with each of the four standard kernels on the raw
#    (unscaled) data and compare test accuracy and training time.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)
Kernel = ["linear", "poly", "rbf", "sigmoid"]
for kernel in Kernel:
    time0 = time()
    clf = SVC(kernel=kernel,
              gamma="auto",
              degree=1,          # degree only affects the "poly" kernel
              cache_size=5000    # kernel cache in MB; speeds up fitting
              ).fit(Xtrain, Ytrain)
    print("The accuracy under kernel %s is %f" % (kernel, clf.score(Xtest, Ytest)))
    # Report elapsed time as MM:SS:microseconds. The original used
    # datetime.fromtimestamp(elapsed), which wrongly interprets a duration
    # as an epoch timestamp and is timezone-dependent; format the duration
    # directly instead.
    elapsed = time() - time0
    print("%02d:%02d:%06d" % (elapsed // 60, elapsed % 60, round((elapsed % 1) * 1e6)))
最终输出如下:
The accuracy under kernel linear is 0.929825
00:00:600904
The accuracy under kernel poly is 0.923977
00:00:110731
The accuracy under kernel rbf is 0.596491
00:00:058461
The accuracy under kernel sigmoid is 0.596491
00:00:009973
由结果可以看出,线性核和多项式核的精度较好,但时间消耗也较高。
4.进一步观察数据集
癌症数据集是典型的线性数据集,且是未去量纲的数据,这里我们利用标准差去量纲,然后分别进行训练。
# 4. Standardize features (zero mean, unit variance) and retrain:
#    SVM kernels are sensitive to feature scale.
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)
data = pd.DataFrame(X)  # wrap for easy inspection (e.g. data.describe())
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.3, random_state=420)
Kernel = ["linear", "poly", "rbf", "sigmoid"]
for kernel in Kernel:
    time0 = time()
    clf = SVC(kernel=kernel,
              gamma="auto",
              degree=1,          # degree only affects the "poly" kernel
              cache_size=5000    # kernel cache in MB
              ).fit(Xtrain, Ytrain)
    print("The accuracy under kernel %s is %f" % (kernel, clf.score(Xtest, Ytest)))
    # MM:SS:microseconds of elapsed time, formatted as a duration
    # (not via datetime.fromtimestamp, which is timezone-dependent).
    elapsed = time() - time0
    print("%02d:%02d:%06d" % (elapsed // 60, elapsed % 60, round((elapsed % 1) * 1e6)))
输出结果如下:
The accuracy under kernel linear is 0.976608
00:00:018951
The accuracy under kernel poly is 0.964912
00:00:005982
The accuracy under kernel rbf is 0.970760
00:00:014959
The accuracy under kernel sigmoid is 0.953216
00:00:007980
这里我们的时间和精确度都有很大的提升,说明两点:
1. 线性核与多项式核(尤其是多项式核在高次项时)计算非常缓慢
2.rbf和多项式核函数都不擅长处理量纲不统一的数据集
5.关于gamma的选取
# 5. Sweep gamma for the RBF kernel over a log-spaced grid and plot
#    test accuracy as a function of gamma.
score = []
gamma_range = np.logspace(-10, 1, 50)  # 50 values from 1e-10 to 10
for g in gamma_range:
    clf = SVC(kernel="rbf", gamma=g, cache_size=5000).fit(Xtrain, Ytrain)
    score.append(clf.score(Xtest, Ytest))
# np.argmax replaces the original max()/list.index() pair: one pass, same result.
best = int(np.argmax(score))
print(score[best], gamma_range[best])
plt.plot(gamma_range, score)
plt.show()
绘制图像如下:
6.通过网格搜索搜索多项式核最佳参数
# 6. Grid-search the best (gamma, coef0) for the degree-1 polynomial
#    kernel using stratified shuffle-split cross-validation.
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

time0 = time()
gamma_range = np.logspace(-10, 1, 20)
coef0_range = np.linspace(0, 5, 10)
param_grid = dict(gamma=gamma_range, coef0=coef0_range)
# Stratified splitter keeps class proportions in every train/test fold.
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=420)
grid = GridSearchCV(SVC(kernel='poly', degree=1, cache_size=5000),
                    param_grid=param_grid, cv=cv)
grid.fit(X, y)
print("The best parameters are %s with a score of %0.5f"
      % (grid.best_params_, grid.best_score_))
# Format the elapsed time as a duration (MM:SS:microseconds); the original
# datetime.fromtimestamp call treats a duration as an epoch timestamp.
elapsed = time() - time0
print("%02d:%02d:%06d" % (elapsed // 60, elapsed % 60, round((elapsed % 1) * 1e6)))
输出为:
The best parameters are {'coef0': 0.0, 'gamma': 0.18329807108324375} with a score of 0.96959
00:10:113562
7.重要参数C
C越大越容易过拟合,C越小越容易欠拟合,如何选取可以通过验证集。
#线性核
# 7. Sweep the regularization parameter C for the linear kernel.
#    Larger C -> harder margin (overfitting risk); smaller C -> softer
#    margin (underfitting risk). Pick via validation accuracy.
score = []
C_range = np.linspace(0.01, 30, 50)
for c in C_range:
    clf = SVC(kernel="linear", C=c, cache_size=500).fit(Xtrain, Ytrain)
    score.append(clf.score(Xtest, Ytest))
best = int(np.argmax(score))
print(score[best], C_range[best])
plt.plot(C_range, score)
# Save next to the script; the original hard-coded an absolute Windows
# desktop path (C:\Users\86377\Desktop\3.png), which is non-portable.
plt.savefig("3.png")
plt.show()
绘制图像如下: