1.概念
2.代码
鸢尾花
SVM.Intro.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
import matplotlib as mpl
import matplotlib.pyplot as plt
def iris_type(s):
    """Map an iris class-name string to an integer label (0/1/2).

    Used as a ``numpy.loadtxt`` column converter. Under Python 3 numpy
    passes the raw field as ``bytes``, so decode before the lookup.

    Raises:
        KeyError: if ``s`` is not one of the three known class names.
    """
    if isinstance(s, bytes):  # py3 numpy converters receive bytes
        s = s.decode('ascii')
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]
# 'sepal length', 'sepal width', 'petal length', 'petal width'
# Chinese display names for the four iris features; indexed below for plot axis labels.
iris_feature = u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度'
def show_accuracy(a, b, tip):
    """Print the element-wise match rate between predictions and labels.

    Args:
        a: predicted labels (any array shape; flattened before comparing).
        b: true labels, same number of elements as ``a``.
        tip: prefix string identifying the data set being scored.
    """
    acc = a.ravel() == b.ravel()
    print(tip + '正确率:', np.mean(acc))
if __name__ == "__main__":
    # NOTE(review): Windows-style path separators; adjust on other OSes.
    path = '..\\8.Regression\\8.iris.data'  # data file path
    data = np.loadtxt(path, dtype=float, delimiter=',', converters={4: iris_type})
    x, y = np.split(data, (4,), axis=1)  # first 4 columns = features, last = label
    x = x[:, :2]                         # keep only the first two features for 2-D plotting
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, train_size=0.6)

    # Classifier
    # clf = svm.SVC(C=0.1, kernel='linear', decision_function_shape='ovr')
    clf = svm.SVC(C=0.8, kernel='rbf', gamma=20, decision_function_shape='ovr')
    clf.fit(x_train, y_train.ravel())  # train

    # Accuracy on train and test sets
    print(clf.score(x_train, y_train))
    y_hat = clf.predict(x_train)
    show_accuracy(y_hat, y_train, '训练集')
    print(clf.score(x_test, y_test))
    y_hat = clf.predict(x_test)
    show_accuracy(y_hat, y_test, '测试集')

    # Plot the decision regions over a dense grid
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # 500x500 sampling grid
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    Z = clf.decision_function(grid_test)  # distance of each grid point to the decision surfaces
    print(Z)
    grid_hat = clf.predict(grid_test)     # predicted class for each grid point
    print(grid_hat)
    grid_hat = grid_hat.reshape(x1.shape)

    # Chinese font support for matplotlib
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    # Colors: light for the filled regions, dark for the sample points
    cm_light = mpl.colors.ListedColormap(['#A0FFA0', '#FFA0A0', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    # (redundant recomputation of the identical grid removed here)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light)
    plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=50, cmap=cm_dark)      # samples
    plt.scatter(x_test[:, 0], x_test[:, 1], s=120, facecolors='none', zorder=10)  # ring test samples
    plt.xlabel(iris_feature[0], fontsize=13)
    plt.ylabel(iris_feature[1], fontsize=13)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'鸢尾花SVM二特征分类', fontsize=15)
    plt.grid()
    plt.show()
SVM_draw.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import svm
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
def show_accuracy(a, b):
    """Print classification accuracy of predictions ``a`` vs labels ``b`` as a percentage."""
    acc = a.ravel() == b.ravel()
    print('正确率:%.2f%%' % (100 * float(acc.sum()) / a.size))
if __name__ == "__main__":
    # np.float was removed from numpy (1.24+); the builtin float is equivalent here.
    data = np.loadtxt('14.bipartition.txt', dtype=float, delimiter='\t')
    x, y = np.split(data, (2, ), axis=1)  # first two columns = features, third = label
    y[y == 0] = -1                        # relabel class 0 as -1 for the ±1 margin plot
    y = y.ravel()

    # Four classifiers: same data, different kernels/parameters
    clfs = [svm.SVC(C=0.3, kernel='linear'),
            svm.SVC(C=10, kernel='linear'),
            svm.SVC(C=5, kernel='rbf', gamma=1),
            svm.SVC(C=5, kernel='rbf', gamma=4)]
    titles = 'Linear,C=0.3', 'Linear, C=10', 'RBF, gamma=1', 'RBF, gamma=4'

    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # 500x500 sampling grid
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    cm_light = matplotlib.colors.ListedColormap(['#77E0A0', '#FF8080'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r'])
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']  # Chinese font support
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 8), facecolor='w')

    for i, clf in enumerate(clfs):
        clf.fit(x, y)
        y_hat = clf.predict(x)
        show_accuracy(y_hat, y)
        print('支撑向量的数目:', clf.n_support_)
        print('支撑向量的系数:', clf.dual_coef_)
        print('支撑向量:', clf.support_)
        print()
        plt.subplot(2, 2, i + 1)
        grid_hat = clf.predict(grid_test).reshape(x1.shape)
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=40, cmap=cm_dark)  # samples
        plt.scatter(x[clf.support_, 0], x[clf.support_, 1], edgecolors='k',
                    facecolors='none', s=100, marker='o')  # ring the support vectors
        z = clf.decision_function(grid_test).reshape(x1.shape)
        # decision boundary (z = 0) and the two margins (z = ±1)
        plt.contour(x1, x2, z, colors=list('krk'), linestyles=['--', '-', '--'],
                    linewidths=[1, 2, 1], levels=[-1, 0, 1])
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.title(titles[i])
        plt.grid()
    plt.suptitle(u'SVM不同参数的分类', fontsize=18)
    plt.tight_layout(pad=2)  # positional pad argument is deprecated in matplotlib
    plt.subplots_adjust(top=0.92)
    plt.show()
ClassifierIndex.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.metrics import accuracy_score #度量标准
from sklearn.metrics import precision_score, recall_score, f1_score, fbeta_score
from sklearn.metrics import precision_recall_fscore_support
if __name__ == "__main__":
    y_true = np.array([1, 1, 1, 1, 0, 0])  # ground-truth labels
    y_hat = np.array([1, 0, 1, 1, 1, 1])   # predicted labels
    print('Accuracy:\t', accuracy_score(y_true, y_hat))

    # Precision = tp / (tp + fp): the ability of the classifier not to label
    # a negative sample as positive. Best value 1, worst 0.
    precision = precision_score(y_true, y_hat)
    print('Precision:\t', precision)

    # Recall = tp / (tp + fn): the ability of the classifier to find all the
    # positive samples. Best value 1, worst 0.
    recall = recall_score(y_true, y_hat)
    print('Recall: \t', recall)

    # F1 = 2 * precision * recall / (precision + recall): harmonic mean of
    # precision and recall with equal weight. Best value 1, worst 0.
    print('f1 score: \t', f1_score(y_true, y_hat))
    # print(2 * (precision * recall) / (precision + recall))

    # F-beta: weighted harmonic mean of precision and recall. beta < 1 weights
    # precision more heavily, beta > 1 favors recall (beta -> 0: precision
    # only; beta -> inf: recall only).
    print('F-beta:')
    for beta in np.logspace(-3, 3, num=7, base=10):  # beta = 0.001 ... 1000
        fbeta = fbeta_score(y_true, y_hat, beta=beta)
        print('\tbeta=%9.3f\tF-beta=%.5f' % (beta, fbeta))
        # print((1 + beta**2) * precision * recall / (beta**2 * precision + recall))
    # one call that returns precision, recall, f-score and support together
    print(precision_recall_fscore_support(y_true, y_hat, beta=1))
当不同类别之间的样本数量相差十分巨大(类别不平衡)时,需要通过设置类别权重等方式对分类器做特殊处理。
unBalance.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
import warnings
def show_accuracy(a, b):
    """Print classification accuracy of predictions ``a`` vs labels ``b`` as a percentage."""
    acc = a.ravel() == b.ravel()
    print('正确率:%.2f%%' % (100 * float(acc.sum()) / a.size))
def show_recall(y, y_hat):
    """Print recall for the positive class (label 1) as a percentage.

    Args:
        y: true labels (+1 for the positive class).
        y_hat: predicted labels.
    """
    # print(y_hat[y == 1])
    # hits among true positives / number of true positives
    positives = np.count_nonzero(y == 1)
    print('召回率:%.2f%%' % (100 * float(np.sum(y_hat[y == 1] == 1)) / positives))
if __name__ == "__main__":
    warnings.filterwarnings("ignore")  # silence UndefinedMetricWarning
    np.random.seed(0)                  # reproducible synthetic data
    c1 = 990                           # majority-class sample count
    c2 = 10                            # minority-class sample count
    N = c1 + c2
    x_c1 = 3 * np.random.randn(c1, 2)             # majority: wide Gaussian at the origin
    x_c2 = 0.5 * np.random.randn(c2, 2) + (4, 4)  # minority: tight Gaussian around (4, 4)
    x = np.vstack((x_c1, x_c2))
    y = np.ones(N)
    y[:c1] = -1                        # first c1 samples belong to the majority class (-1)

    # marker sizes: draw the rare minority class larger so it stays visible
    s = np.ones(N) * 30
    s[:c1] = 10

    # Classifiers: linear / RBF kernels, with and without class weighting
    clfs = [svm.SVC(C=1, kernel='linear'),
            svm.SVC(C=1, kernel='linear', class_weight={-1: 1, 1: 50}),
            svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1: 1, 1: 2}),
            svm.SVC(C=0.8, kernel='rbf', gamma=0.5, class_weight={-1: 1, 1: 10})]
    titles = 'Linear', 'Linear, Weight=50', 'RBF, Weight=2', 'RBF, Weight=10'

    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()  # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()  # range of column 1
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]  # 500x500 sampling grid
    grid_test = np.stack((x1.flat, x2.flat), axis=1)
    cm_light = matplotlib.colors.ListedColormap(['#77E0A0', '#FF8080'])
    cm_dark = matplotlib.colors.ListedColormap(['g', 'r'])
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']  # Chinese font support
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 8), facecolor='w')

    for i, clf in enumerate(clfs):
        clf.fit(x, y)
        y_hat = clf.predict(x)
        # show_accuracy(y_hat, y)
        # show_recall(y, y_hat)
        print(i + 1, '次:')
        print('正确率:\t', accuracy_score(y, y_hat))
        print(' 精度 :\t', precision_score(y, y_hat, pos_label=1))
        print('召回率:\t', recall_score(y, y_hat, pos_label=1))
        print('F1Score:\t', f1_score(y, y_hat, pos_label=1))
        print()
        plt.subplot(2, 2, i + 1)
        grid_hat = clf.predict(grid_test).reshape(x1.shape)
        plt.pcolormesh(x1, x2, grid_hat, cmap=cm_light, alpha=0.8)
        plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', s=s, cmap=cm_dark)  # samples
        plt.xlim(x1_min, x1_max)
        plt.ylim(x2_min, x2_max)
        plt.title(titles[i])
        plt.grid()
    plt.suptitle(u'不平衡数据的处理', fontsize=18)
    plt.tight_layout(pad=1.5)  # positional pad argument is deprecated in matplotlib
    plt.subplots_adjust(top=0.92)
    plt.show()
0-9数字图像分类
数据中每行前8*8代表图像,最后一位代表数值
HandWrittenDigits.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
def show_accuracy(a, b, tip):
    """Print the match rate between predictions and labels as a percentage.

    Args:
        a: predicted labels (flattened before comparing).
        b: true labels, same number of elements as ``a``.
        tip: prefix string identifying the data set being scored.
    """
    acc = a.ravel() == b.ravel()
    print(tip + '正确率:%.2f%%' % (100 * np.mean(acc)))
def save_image(im, i):
    """Scale an 8x8 digit image to 0..255, invert it, and save as a PNG.

    Args:
        im: float array with grey levels in 0..16; MUTATED in place by the
            scaling step, so callers should pass a copy.
        i: index used to build the output file name.
    """
    im *= 15.9375      # 255/16: map the 0..16 grey levels onto 0..255
    im = 255 - im      # invert: dark strokes on a light background
    a = im.astype(np.uint8)
    # NOTE(review): Windows-only path separators; consider os.path.join for portability.
    output_path = '.\\HandWritten'
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    Image.fromarray(a).save(output_path + ('\\%d.png' % i))
if __name__ == "__main__":
    print('Load Training File Start...')
    # np.float/np.int were removed from numpy (1.24+); the builtins are equivalent.
    data = np.loadtxt('14.optdigits.tra', dtype=float, delimiter=',')
    x, y = np.split(data, (-1, ), axis=1)  # all columns but last = pixels, last = digit
    images = x.reshape(-1, 8, 8)           # each row becomes an 8x8 image
    y = y.ravel().astype(int)

    print('Load Test Data Start...')
    data = np.loadtxt('14.optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data, (-1, ), axis=1)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)
    print('Load Data OK...')

    # x, x_test, y, y_test = train_test_split(x, y, random_state=1)
    # images = x.reshape(-1, 8, 8)
    # images_test = x_test.reshape(-1, 8, 8)

    # Show a few samples to verify the data was read correctly
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']  # Chinese font support
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(15, 9), facecolor='w')
    for index, image in enumerate(images[:16]):        # first 16 training images
        plt.subplot(4, 8, index + 1)                   # 4x8 grid, top half
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title(u'训练图片: %i' % y[index])
    for index, image in enumerate(images_test[:16]):   # first 16 test images
        plt.subplot(4, 8, index + 17)                  # bottom half of the grid
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)  # copy: save_image scales its input in place
        plt.title(u'测试图片: %i' % y_test[index])
    plt.tight_layout()
    plt.show()

    # Small gamma gives a very smooth boundary, behaving similarly to kNN
    clf = svm.SVC(C=1, kernel='rbf', gamma=0.001)
    print('Start Learning...')
    clf.fit(x, y)
    print('Learning is OK...')
    y_hat = clf.predict(x)
    show_accuracy(y, y_hat, '训练集')
    y_hat = clf.predict(x_test)
    print(y_hat)
    print(y_test)
    show_accuracy(y_test, y_hat, '测试集')

    # Collect and display the misclassified test images
    err_images = images_test[y_test != y_hat]
    err_y_hat = y_hat[y_test != y_hat]   # wrong predictions
    err_y = y_test[y_test != y_hat]      # their true labels
    print(err_y_hat)
    print(err_y)
    plt.figure(figsize=(10, 8), facecolor='w')
    for index, image in enumerate(err_images):
        if index >= 12:  # show at most 12 (3x4 grid)
            break
        plt.subplot(3, 4, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title(u'错分为:%i,真实值:%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout()
    plt.show()
用支持向量的方式做回归
SVR.py
#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import svm
import matplotlib.pyplot as plt
if __name__ == "__main__":
    N = 50
    np.random.seed(0)  # reproducible data
    x = np.sort(np.random.uniform(0, 6, N), axis=0)  # N points uniform on [0, 6], sorted
    y = 2 * np.sin(x) + 0.1 * np.random.randn(N)     # 2*sin(x) plus small Gaussian noise
    x = x.reshape(-1, 1)                             # column vector, as sklearn expects
    print('x =\n', x)
    print('y =\n', y)

    print('SVR - RBF')
    svr_rbf = svm.SVR(kernel='rbf', gamma=0.2, C=100)
    svr_rbf.fit(x, y)
    print('SVR - Linear')
    svr_linear = svm.SVR(kernel='linear', C=100)
    svr_linear.fit(x, y)
    print('SVR - Polynomial')
    svr_poly = svm.SVR(kernel='poly', degree=3, C=100)
    svr_poly.fit(x, y)
    print('Fit OK.')

    # Extrapolate slightly past the data; try 1.5 instead of 1.1 to see divergence
    x_test = np.linspace(x.min(), 1.1 * x.max(), 100).reshape(-1, 1)
    y_rbf = svr_rbf.predict(x_test)
    y_linear = svr_linear.predict(x_test)
    y_poly = svr_poly.predict(x_test)

    plt.figure(figsize=(9, 8), facecolor='w')
    plt.plot(x_test, y_rbf, 'r-', linewidth=2, label='RBF Kernel')
    plt.plot(x_test, y_linear, 'g-', linewidth=2, label='Linear Kernel')
    plt.plot(x_test, y_poly, 'b-', linewidth=2, label='Polynomial Kernel')
    plt.plot(x, y, 'mo', markersize=6)
    plt.scatter(x[svr_rbf.support_], y[svr_rbf.support_], s=130, c='r', marker='*',
                label='RBF Support Vectors')  # highlight the RBF support vectors
    plt.legend(loc='lower left')
    plt.title('SVR', fontsize=16)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.grid(True)
    plt.show()
CV.py
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV # 0.17 grid_search网格搜索
import matplotlib.pyplot as plt
if __name__ == "__main__":
    N = 50
    np.random.seed(0)  # reproducible data
    x = np.sort(np.random.uniform(0, 6, N), axis=0)  # N points uniform on [0, 6], sorted
    y = 2 * np.sin(x) + 0.1 * np.random.randn(N)     # 2*sin(x) plus small Gaussian noise
    x = x.reshape(-1, 1)
    print('x =\n', x)
    print('y =\n', y)

    model = svm.SVR(kernel='rbf')
    c_can = np.logspace(-2, 2, 10)      # candidate C values
    gamma_can = np.logspace(-2, 2, 10)  # candidate gamma values
    # Grid search over every (C, gamma) pair with 5-fold cross-validation
    svr = GridSearchCV(model, param_grid={'C': c_can, 'gamma': gamma_can}, cv=5)
    svr.fit(x, y)
    print('验证参数:\n', svr.best_params_)  # best parameter combination found

    x_test = np.linspace(x.min(), x.max(), 100).reshape(-1, 1)
    y_hat = svr.predict(x_test)

    sp = svr.best_estimator_.support_  # support-vector indices of the best model
    plt.figure(facecolor='w')
    plt.scatter(x[sp], y[sp], s=120, c='r', marker='*', label='Support Vectors', zorder=3)
    plt.plot(x_test, y_hat, 'r-', linewidth=2, label='RBF Kernel')
    plt.plot(x, y, 'go', markersize=5)
    plt.legend(loc='upper right')
    plt.title('SVR', fontsize=16)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.grid(True)
    plt.show()