起步
示例一
from sklearn import svm

# Three 2-D training points and their binary class labels.
X = [[2, 0], [1, 1], [2, 3]]
Y = [0, 0, 1]

# Fit a support vector classifier with a linear kernel.
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# Inspect the fitted model.
print(clf)  # output: SVC(C=1.0, class_weight=None, ...)

# The support vectors themselves.
print(clf.support_vectors_)  # [[ 1. 1.], [ 2. 3.]]

# Indices of the support vectors within the training set.
print(clf.support_)  # output: [1, 2]

# Number of support vectors found for each class.
print(clf.n_support_)  # output: [1, 1]

# Classify a previously unseen point.
prediction = clf.predict([[2, 2]])
print(prediction)  # output: [1]
示例二
# coding: utf-8
import numpy as np
from sklearn import svm

# Fixing the seed makes every run draw the same "random" points.
np.random.seed(0)

# 40 points in two linearly separable clouds, 20 per class,
# shifted to opposite corners of the plane.
X = np.r_[np.random.randn(20, 2) - [2, 2], np.random.randn(20, 2) + [2, 2]]
Y = [0] * 20 + [1] * 20

# Train a linear-kernel SVM on the synthetic data.
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)

# The separating hyperplane w[0]*x + w[1]*y + intercept = 0,
# rewritten in slope/intercept form y = slope*x + c for plotting.
w = clf.coef_[0]
slope = -w[0] / w[1]
line_x = np.linspace(-5, 5)  # x values over [-5, 5] used to draw the lines
line_y = slope * line_x - clf.intercept_[0] / w[1]

# The margin boundaries are parallel lines through a support vector
# of each class (first and last entries of support_vectors_).
sv = clf.support_vectors_[0]
lower_margin = slope * line_x + (sv[1] - slope * sv[0])
sv = clf.support_vectors_[-1]
upper_margin = slope * line_x + (sv[1] - slope * sv[0])

import pylab as pl
pl.plot(line_x, line_y, 'k-')
pl.plot(line_x, lower_margin, 'k--')
pl.plot(line_x, upper_margin, 'k--')
# Circle the support vectors, then scatter all points colored by class.
pl.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
           s=80, facecolors='none')
pl.scatter(X[:, 0], X[:, 1], c=Y, cmap=pl.cm.Paired)
pl.axis('tight')
pl.show()
示例三:人脸识别
# Download (and cache) the Labeled Faces in the Wild dataset, keeping only
# people with at least 70 images; each image is resized to 40% of its size.
from sklearn.datasets import fetch_lfw_people
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
人脸数据集基本信息
from sklearn.datasets import fetch_lfw_people
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
n_samples, h, w = lfw_people.images.shape # image count plus per-image height/width (needed later for plotting)
# Feature matrix (flattened pixel values) and integer target labels.
X = lfw_people.data
Y = lfw_people.target
n_features = X.shape[1] # number of features per sample, i.e. pixels per image (h * w)
target_names = lfw_people.target_names # array of the distinct people's names in the dataset
n_classes = target_names.shape[0] # how many distinct people the classifier must tell apart
print("===== 数据集中信息 =====")
print("数据个数(n_samples):", n_samples) # output: 1288
print("特征个数,维度(n_features):", n_features) # output: 1850 (50 * 37 pixels at resize=0.4)
print("结果集类别个数(n_classes):", n_classes) # output: 7
拆分训练集和测试集
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for evaluation; the rest is used for training.
splits = train_test_split(X, Y, test_size=0.25)
X_train, X_test, Y_train, Y_test = splits
特征降维处理
import time  # required by the timing calls below; missing in the original (NameError)

from sklearn.decomposition import PCA

# Reduce the high-dimensional pixel vectors to 150 principal components
# ("eigenfaces"); whiten=True rescales each component to unit variance,
# which usually helps the downstream SVM.
n_components = 150

t0 = time.time()
# Fit the PCA basis on the training split only, so no test information leaks in.
pca = PCA(n_components=n_components, whiten=True).fit(X_train)
print("pca done %0.3fs" % (time.time() - t0))

t0 = time.time()
# Project both splits onto the learned low-dimensional basis.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
print("data set to pca done %0.3fs" % (time.time() - t0))
提取特征点
# Reshape the flat PCA components back into image form so each component
# can be displayed as an h x w "eigenface".
eigenfaces = pca.components_.reshape((n_components, h, w))
构造 SVM 分类器
import time  # required by the timing calls below; missing in the original (NameError)

from sklearn import svm
from sklearn.model_selection import GridSearchCV

# Grid-search the RBF-kernel SVM hyper-parameters:
#   C     - penalty on misclassified samples (larger = harder margin)
#   gamma - RBF kernel width (larger = each sample's influence is more local)
t0 = time.time()
param_grid = {
    "C": [1e3, 5e3, 1e4, 1e5],
    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
}
# class_weight='balanced' compensates for the unequal number of images per person.
clf = GridSearchCV(svm.SVC(kernel='rbf', class_weight='balanced'), param_grid=param_grid)
clf.fit(X_train_pca, Y_train)
print("fit done %0.3fs" % (time.time() - t0))
print(clf.best_estimator_)  # the estimator refit with the best-scoring parameter combination
预测
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Evaluate on the held-out test set: per-class precision/recall/F1, then a
# confusion matrix whose rows are true labels and columns predicted labels.
# (The original snippet set t0 = time.time() here without importing `time`
# and never used it — the dead line raised NameError and has been removed.)
y_pred = clf.predict(X_test_pca)
print(classification_report(Y_test, y_pred, target_names=target_names))
print(confusion_matrix(Y_test, y_pred, labels=range(n_classes)))
precision recall f1-score support
Ariel Sharon 0.78 0.70 0.74 20
Colin Powell 0.79 0.84 0.82 76
Donald Rumsfeld 0.81 0.71 0.76 31
George W Bush 0.85 0.91 0.88 125
Gerhard Schroeder 0.78 0.75 0.77 24
Hugo Chavez 0.94 0.83 0.88 18
Tony Blair 0.91 0.75 0.82 28
avg / total 0.83 0.83 0.83 322
[[ 14 3 0 2 1 0 0]
[ 2 64 1 9 0 0 0]
[ 2 2 22 5 0 0 0]
[ 0 8 3 114 0 0 0]
[ 0 1 1 2 18 1 1]
[ 0 1 0 0 1 15 1]
[ 0 2 0 2 3 0 21]]
测试结果可视化
import matplotlib.pyplot as plt

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Draw the first n_row * n_col images in a grid, one title per cell."""
    plt.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for idx in range(n_row * n_col):
        plt.subplot(n_row, n_col, idx + 1)
        # Each row of `images` is a flat pixel vector; restore its 2-D shape.
        plt.imshow(images[idx].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(titles[idx], size=12)
        plt.xticks(())
        plt.yticks(())

def title(y_pred, y_test, target_names, i):
    """Build a 'predicted vs true' caption from each person's last name."""
    predicted = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    actual = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue: %s' % (predicted, actual)

# Caption every test image with its predicted and true identity, then plot.
prediction_titles = [title(y_pred, Y_test, target_names, i)
                     for i in range(y_pred.shape[0])]
plot_gallery(X_test, prediction_titles, h, w)

# plot the gallery of the most significative eigenfaces
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)

plt.show()
总结