支持向量机实践
支持向量机调参示例
1.导入支持向量机
# Import the support-vector classifier
from sklearn.svm import SVC
# Default parameters of SVC, shown for reference:
SVC(
C = 1.0,                    # inverse regularization strength (larger C = harder margin)
kernel = 'rbf',             # radial basis function kernel
degree = 3,                 # polynomial degree (used by the 'poly' kernel only)
gamma = 'auto_deprecated',  # NOTE(review): internal default of old sklearn; use 'scale' or 'auto' in modern versions
coef0 = 0.0,                # independent term ('poly' and 'sigmoid' kernels)
shrinking = True,
probability = False,        # BUG FIX: was misspelled 'proobability', which raises TypeError; enables per-class probability estimates
tol = 0.001,                # stopping tolerance
cache_size = 200,           # kernel cache size in MB
class_weight = None,
verbose = False,
max_iter = -1,              # -1 means no iteration limit
decision_function_shape = 'ovr',
random_state = None
)
2.生成训练测试所用训练集
2.1生成数据集
#导入numpy库与绘图库
# Import numpy and the plotting library
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Generate 100 points with two features, split into two classes
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=100,
n_features = 2,
centers = 2,
random_state = 6
)
# Scatter plot of the data, colored by class label
plt.scatter(X[:, 0], X[:, 1], s=120, c=y, cmap = plt.cm.spring, edgecolors = 'k')
运行结果如下:
2.2拆分训练集与测试集
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 6)
3模型导入与训练
from sklearn.svm import SVC
# Instantiate a linear-kernel SVM; large C allows very little slack (hard margin)
clf = SVC(kernel = 'linear', C = 1000)
# Train the model
clf.fit(X_train, y_train)
# Accuracy on the held-out test set
clf.score(X_test, y_test)
# Number of support vectors per class
clf.n_support_
# Coordinates of the support vectors
clf.support_vectors_
4结果可视化
# Plot the training points, colored by class label
plt.scatter(X_train[:, 0], X_train[:, 1], s=120, c=y_train,
            cmap=plt.cm.spring, edgecolors='k')
axes = plt.gca()
x_lo, x_hi = axes.get_xlim()
y_lo, y_hi = axes.get_ylim()
# Build a 30 x 30 evaluation grid covering the current axes
grid_x = np.linspace(x_lo, x_hi, 30)
grid_y = np.linspace(y_lo, y_hi, 30)
YY, XX = np.meshgrid(grid_y, grid_x)
grid_points = np.c_[XX.ravel(), YY.ravel()]
# Signed distance of every grid point to the separating hyperplane
Z = clf.decision_function(grid_points).reshape(XX.shape)
# Draw the decision boundary (solid) and the two margins (dashed)
axes.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
             linestyles=['--', '-', '--'])
# Circle the support vectors
axes.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
             linewidth=1, facecolors='none')
之后陆续调整参数观察图中所显示的情况:
C = 0.1:
# A smaller C relaxes the margin constraint (soft margin)
clf = SVC(kernel = 'linear', C = 0.1)
# Train the model
clf.fit(X_train, y_train)
# Visualize: training points colored by class label
plt.scatter(X_train[:, 0], X_train[:, 1], s=120, c=y_train,
            cmap=plt.cm.spring, edgecolors='k')
axes = plt.gca()
x_lo, x_hi = axes.get_xlim()
y_lo, y_hi = axes.get_ylim()
# 30 x 30 evaluation grid over the current axes
grid_x = np.linspace(x_lo, x_hi, 30)
grid_y = np.linspace(y_lo, y_hi, 30)
YY, XX = np.meshgrid(grid_y, grid_x)
grid_points = np.c_[XX.ravel(), YY.ravel()]
# Decision-function value at every grid point
Z = clf.decision_function(grid_points).reshape(XX.shape)
# Decision boundary (solid) and margins (dashed)
axes.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
             linestyles=['--', '-', '--'])
# Circle the support vectors
axes.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=100,
             linewidth=1, facecolors='none')
# Accuracy on the test set
clf.score(X_test, y_test)
# Number of support vectors per class (the points that define the margin)
clf.n_support_
# Coordinates of the support vectors
clf.support_vectors_
kernel = 'rbf':
#创建一个RBF内核的支持向量机模型
# Create an SVM with an RBF kernel, C = 1000
clf_rbf = SVC(kernel = 'rbf', C = 1000)
clf_rbf.fit(X_train, y_train)
# Visualize: training points colored by class label
plt.scatter(X_train[:, 0], X_train[:, 1], s = 120,
c = y_train, # color by label
cmap = plt.cm.spring,
edgecolors = 'k'
)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30) # 30 evenly spaced points across the x-range
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy,xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T # grid coordinates, one point per row
# BUG FIX: evaluate the newly trained clf_rbf — the original used the old
# linear `clf`, so the plotted boundary did not belong to this model
Z = clf_rbf.decision_function(xy).reshape(XX.shape)
# Draw the decision boundary and margins
ax.contour(XX, YY, Z, colors = 'k', levels = [-1, 0, 1], alpha = 0.5,
linestyles=['--', '-', '--']) # dashed, solid, dashed
ax.scatter(clf_rbf.support_vectors_[:, 0], clf_rbf.support_vectors_[:, 1], s = 100,
linewidth = 1, facecolors = 'none')
C = 10:
#创建一个RBF内核的支持向量机模型 C = 10
# Create an SVM with an RBF kernel, C = 10
clf_rbf = SVC(kernel = 'rbf', C = 10)
clf_rbf.fit(X_train, y_train)
# Visualize: training points colored by class label
plt.scatter(X_train[:, 0], X_train[:, 1], s = 120,
c = y_train, # color by label
cmap = plt.cm.spring,
edgecolors = 'k'
)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30) # 30 evenly spaced points across the x-range
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy,xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T # grid coordinates, one point per row
# BUG FIX: evaluate the newly trained clf_rbf — the original used the old
# linear `clf`, so the plotted boundary did not belong to this model
Z = clf_rbf.decision_function(xy).reshape(XX.shape)
# Draw the decision boundary and margins
ax.contour(XX, YY, Z, colors = 'k', levels = [-1, 0, 1], alpha = 0.5,
linestyles=['--', '-', '--']) # dashed, solid, dashed
ax.scatter(clf_rbf.support_vectors_[:, 0], clf_rbf.support_vectors_[:, 1], s = 100,
linewidth = 1, facecolors = 'none')
C = 1:
#创建一个RBF内核的支持向量机模型 C = 1
# Create an SVM with an RBF kernel, C = 1
clf_rbf = SVC(kernel = 'rbf', C = 1)
clf_rbf.fit(X_train, y_train)
# Visualize: training points colored by class label
plt.scatter(X_train[:, 0], X_train[:, 1], s = 120,
c = y_train, # color by label
cmap = plt.cm.spring,
edgecolors = 'k'
)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30) # 30 evenly spaced points across the x-range
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy,xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T # grid coordinates, one point per row
# BUG FIX: evaluate the newly trained clf_rbf — the original used the old
# linear `clf`, so the plotted boundary did not belong to this model
Z = clf_rbf.decision_function(xy).reshape(XX.shape)
# Draw the decision boundary and margins
ax.contour(XX, YY, Z, colors = 'k', levels = [-1, 0, 1], alpha = 0.5,
linestyles=['--', '-', '--']) # dashed, solid, dashed
ax.scatter(clf_rbf.support_vectors_[:, 0], clf_rbf.support_vectors_[:, 1], s = 100,
linewidth = 1, facecolors = 'none')
C = 0.1:
#创建一个RBF内核的支持向量机模型 C = 0.1
# Create an SVM with an RBF kernel, C = 0.1
clf_rbf = SVC(kernel = 'rbf', C = 0.1)
clf_rbf.fit(X_train, y_train)
# Visualize: training points colored by class label
plt.scatter(X_train[:, 0], X_train[:, 1], s = 120,
c = y_train, # color by label
cmap = plt.cm.spring,
edgecolors = 'k'
)
ax = plt.gca()
xlim = ax.get_xlim()
ylim = ax.get_ylim()
xx = np.linspace(xlim[0], xlim[1], 30) # 30 evenly spaced points across the x-range
yy = np.linspace(ylim[0], ylim[1], 30)
YY, XX = np.meshgrid(yy,xx)
xy = np.vstack([XX.ravel(), YY.ravel()]).T # grid coordinates, one point per row
# BUG FIX: evaluate the newly trained clf_rbf — the original used the old
# linear `clf`, so the plotted boundary did not belong to this model
Z = clf_rbf.decision_function(xy).reshape(XX.shape)
# Draw the decision boundary and margins
ax.contour(XX, YY, Z, colors = 'k', levels = [-1, 0, 1], alpha = 0.5,
linestyles=['--', '-', '--']) # dashed, solid, dashed
ax.scatter(clf_rbf.support_vectors_[:, 0], clf_rbf.support_vectors_[:, 1], s = 100,
linewidth = 1, facecolors = 'none')
实践:SVM人脸识别
1.载入数据
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.decomposition import PCA # principal component analysis
# Load the LFW faces dataset (only people with >= 70 images, downscaled to 40%)
# NOTE(review): data_home is a machine-specific absolute path and
# download_if_missing=False assumes a pre-downloaded local copy — adjust for your environment
lfw_people = fetch_lfw_people(data_home = 'C:\\Users\\chengyijun\\scikit_learn_data\\', min_faces_per_person = 70, resize = 0.4, download_if_missing = False)
如果过程出现了含有forbidden
的提示语,请参考:调用 from sklearn.datasets import fetch_lfw_people 出现HTTPError 403错误
2.查看数据信息
# Print the dataset description
print(lfw_people.DESCR)
# BUG FIX: use 'gray' — the 'grey' spelling is only a valid colormap
# alias in matplotlib >= 3.9 and raises ValueError on older versions
plt.imshow(lfw_people.images[0], cmap = 'gray')
plt.show()
# Shape of the image array: (n_samples, height, width)
n_samples, h, w = lfw_people.images.shape
print(n_samples)
print(h)
print(w)
# Flattened pixel matrix: (n_samples, h * w)
lfw_people.data.shape
# Class labels are people's names
target_names = lfw_people.target_names
target_names
# Number of distinct people (classes)
n_classes = lfw_people.target_names.shape[0]
n_classes
3.模型导入与训练
#拆分训练集与测试集
# Split into training and test sets
# NOTE(review): no random_state is given, so the split changes between runs
x_train, x_test, y_train, y_test = train_test_split(lfw_people.data, lfw_people.target)
# Build the SVM classification model
## Instantiate ('balanced' reweights classes by inverse frequency)
model = SVC(kernel = 'rbf', class_weight= 'balanced')
## Train on raw pixel features
model.fit(x_train, y_train)
## Predict on the test set
predictions = model.predict(x_test)
## Per-class precision / recall / F1 report
print(classification_report(y_test, predictions, target_names=lfw_people.target_names))
PCA降维
#100个维度
n_components = 100
#whiten:否数据做标准化
pca = PCA(n_components = n_components, whiten = True).fit(lfw_people.data)
x_train_pca = pca.transform(x_train)
x_test_pca = pca.transform(x_test)
x_train_pca.shape
model = SVC(kernel = 'rbf', class_weight = 'balanced')
model.fit(x_train_pca, y_train)
predictions = model.predict(x_test_pca)
print(classification_report(y_test, predictions, target_names=target_names))
网格搜索调参
# Candidate hyperparameter values for the grid search
param_grid = {'C':[0.1, 1, 5, 10, 100],
'gamma':[0.0005, 0.001, 0.005, 0.01]}
# Exhaustive cross-validated search over the parameter grid
model = GridSearchCV(SVC(kernel = 'rbf', class_weight = 'balanced'), param_grid)
model.fit(x_train_pca, y_train)
# Best estimator found by the search
print(model.best_estimator_)
predictions = model.predict(x_test_pca)
print(classification_report(y_test, predictions, target_names = target_names))
结果可视化
定义函数:
#画图, 3行5列
def plot_gallery(images, titles, h, w, n_row=3, n_col=5):
    """Display the first n_row * n_col images in a grid with per-image titles."""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, hspace=.35)
    cells = n_row * n_col
    for idx in range(cells):
        plt.subplot(n_row, n_col, idx + 1)
        # Reshape the flat pixel vector back into an h x w grayscale image
        plt.imshow(images[idx].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[idx], size=12)
        # Hide the axis tick marks
        plt.xticks(())
        plt.yticks(())
#获取一张图片title
def title(predictions, y_test, target_names, i):
pred_name = target_names[predictions[i]].split(' ')[-1]
true_name = target_names[y_test[i]].split(' ')[-1]
#获取所有图片title
# Build the title for every test image
prediction_titles = [title(predictions, y_test, target_names, i) \
for i in range(len(predictions))]
# Draw the gallery of predictions
plot_gallery(x_test, prediction_titles, h, w)
plt.show()