#!/usr/bin/env python
# encoding: utf-8
import matplotlib.pyplot as plt
import numpy as np
import time
def loadDataSet(fileName):
    """Load a tab-separated dataset file.

    Each line is expected to hold two feature values followed by a class
    label, separated by tabs.

    :param fileName: path to the data file
    :return: (dataMat, labelMat) — list of [x1, x2] feature pairs and the
             matching list of float labels
    """
    dataMat = []
    labelMat = []
    # Context manager guarantees the file handle is closed
    # (the original opened the file and never closed it).
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split('\t')
            dataMat.append([float(lineArr[0]), float(lineArr[1])])  # feature columns
            labelMat.append(float(lineArr[-1]))                     # label is the last column
    return dataMat, labelMat
"""
函数说明:只要函数值不等于输入值i,就会随机选择alpha
"""
def selectJrand(i,m):
j = i
while(j==i):
j = int(np.random.uniform(0,m))
return j
"""
函数说明:用于调整大于L或者小于H的alpha的值
"""
def clipAlpha(aj,H,L):
if aj > H:
aj = H
if aj < L:
aj = L
return aj
def smoSimple(dataMatIn, classLabels, C, toler, maxIter):
    """Simplified SMO (Sequential Minimal Optimization) for a linear SVM.

    :param dataMatIn: input data set (m samples x n features)
    :param classLabels: class labels (+1 / -1) for each sample
    :param C: penalty constant (upper bound on each alpha)
    :param toler: tolerance (slack) for the KKT violation check
    :param maxIter: number of consecutive passes with no alpha change
                    required before termination
    :return: (b, alphas) — the bias term and the (m, 1) alpha matrix
    """
    dataMatrix = np.mat(dataMatIn)
    labelMat = np.mat(classLabels).transpose()
    b = 0
    m, n = np.shape(dataMatrix)
    alphas = np.mat(np.zeros((m, 1)))
    iter_num = 0
    while iter_num < maxIter:
        alpha_pairs_changed = 0  # records whether any alpha pair was optimized this pass
        for i in range(m):
            # Prediction f(x_i) with the current alphas, and its error Ei.
            fXi = float(np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[i, :].T)) + b
            Ei = fXi - float(labelMat[i])
            # If alpha_i violates the KKT conditions beyond the tolerance
            # (both positive and negative margins are tested), optimize it.
            if ((labelMat[i] * Ei < -toler) and (alphas[i] < C)) or \
                    ((labelMat[i] * Ei > toler) and (alphas[i] > 0)):
                # Randomly pick the second alpha to update.
                j = selectJrand(i, m)
                # Step 1: compute the error Ej for the second sample.
                fXj = float(np.multiply(alphas, labelMat).T * (dataMatrix * dataMatrix[j, :].T)) + b
                Ej = fXj - float(labelMat[j])
                alpha_i_old = alphas[i].copy()
                alpha_j_old = alphas[j].copy()
                # Step 2: compute the clipping bounds L and H for alpha_j.
                if labelMat[i] != labelMat[j]:
                    L = max(0, alphas[j] - alphas[i])
                    H = min(C, C + alphas[j] - alphas[i])
                else:
                    L = max(0, alphas[j] + alphas[i] - C)
                    H = min(C, alphas[j] + alphas[i])
                if L == H:
                    print('L = H')  # no room to move alpha_j; skip this pair
                    continue
                # Step 3: eta is the optimal step size for alpha_j
                # (negative of the second derivative along the constraint).
                eta = 2.0 * dataMatrix[i, :] * dataMatrix[j, :].T - dataMatrix[i, :] * dataMatrix[i, :].T \
                    - dataMatrix[j, :] * dataMatrix[j, :].T
                # Step 4: update alpha_j (only when eta < 0 is the update valid).
                if eta >= 0:
                    print("eta >= 0")
                    continue
                alphas[j] = alphas[j] - labelMat[j] * (Ei - Ej) / eta
                # Step 5: clip alpha_j into [L, H].
                alphas[j] = clipAlpha(alphas[j], H, L)
                if abs(alphas[j] - alpha_j_old) < 0.00001:
                    print('aj太小了')  # change too small to count as progress
                    continue
                # Step 6: update alpha_i by the same amount in the opposite
                # direction (one goes up, the other comes down).
                alphas[i] += labelMat[i] * labelMat[j] * (alpha_j_old - alphas[j])
                # Step 7: recompute the bias candidates b1 and b2.
                b1 = b - Ei - labelMat[i] * (alphas[i] - alpha_i_old) * dataMatrix[i, :] * dataMatrix[i, :].T - \
                    labelMat[j] * (alphas[j] - alpha_j_old) * dataMatrix[i, :] * dataMatrix[j, :].T
                b2 = b - Ej - labelMat[i] * (alphas[i] - alpha_i_old) * dataMatrix[i, :] * dataMatrix[j, :].T - \
                    labelMat[j] * (alphas[j] - alpha_j_old) * dataMatrix[j, :] * dataMatrix[j, :].T
                # Step 8: choose b from b1/b2 depending on which alpha is
                # strictly inside (0, C); otherwise average them.
                if (0 < alphas[i]) and (alphas[i] < C):
                    b = b1
                elif (0 < alphas[j]) and (alphas[j] < C):
                    b = b2
                else:
                    b = (b1 + b2) / 2.0
                alpha_pairs_changed += 1
                print('第%d次迭代样本:%d,alpha的优化次数:%d' % (iter_num, i, alpha_pairs_changed))
        # Only a full pass with no changes counts toward convergence;
        # any change resets the counter.
        if alpha_pairs_changed == 0:
            iter_num += 1
        else:
            iter_num = 0
        print('迭代次数:%d' % iter_num)
    return b, alphas
def get_w(dataMat, labelMat, alphas):
    """Compute the hyperplane normal vector w = sum_i(alpha_i * y_i * x_i).

    Generalized from the original (which hard-coded 2 feature columns via
    np.tile(..., (1, 2))) to work for any number of features.

    :param dataMat: (m, n) sample matrix
    :param labelMat: m class labels
    :param alphas: (m, 1) Lagrange multipliers from SMO
    :return: the normal vector w as a nested list (n rows of 1 element)
    """
    alphas, dataMat, labelMat = np.array(alphas), np.array(dataMat), np.array(labelMat)
    # Elementwise alpha_i * y_i gives an (m, 1) column; one matrix product
    # with X^T then sums alpha_i * y_i * x_i over all samples.
    # reshape(-1, 1) forces labels into a column regardless of input shape.
    w = np.dot(dataMat.T, alphas.reshape(-1, 1) * labelMat.reshape(-1, 1))
    return w.tolist()
def showClassifer(dataMat, w, b):
    """Plot the data, the separating line, and the support vectors.

    :param dataMat: list of [x1, x2] samples
    :param w: hyperplane normal vector as returned by get_w
    :param b: bias term as returned by smoSimple

    NOTE(review): this function reads the module-level globals `labelMat`
    and `alphas` set in the __main__ block rather than taking them as
    parameters — confirm before reusing it outside this script.
    """
    # Split samples into the two classes for separate scatter colors.
    data_plus = []
    data_minus = []
    for i in range(len(dataMat)):
        if labelMat[i] > 0:
            data_plus.append(dataMat[i])
        else:
            data_minus.append(dataMat[i])
    # Convert to numpy arrays so columns can be transposed for plotting.
    new_data_plus = np.array(data_plus)
    new_data_minus = np.array(data_minus)
    plt.scatter(np.transpose(new_data_plus)[0], np.transpose(new_data_plus)[1])
    plt.scatter(np.transpose(new_data_minus)[0], np.transpose(new_data_minus)[1])
    # Draw the decision line between the extreme x1 values:
    # w1*x + w2*y + b = 0  =>  y = (-b - w1*x) / w2.
    x1 = max(dataMat)[0]
    x2 = min(dataMat)[0]
    a1, a2 = w
    b = float(b)
    a1 = float(a1[0])
    a2 = float(a2[0])
    y1, y2 = (-b - a1 * x1) / a2, (-b - a1 * x2) / a2
    plt.plot([x1, x2], [y1, y2])
    # Circle the support vectors (samples with non-zero alpha).
    for i, alpha in enumerate(alphas):
        if abs(alpha) > 0:
            x, y = dataMat[i]
            plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidths=2, edgecolors='green')
    plt.show()
if __name__ == '__main__':
    # Train the simplified-SMO SVM on the sample file, derive w, and plot.
    start = time.time()
    dataMat, labelMat = loadDataSet('testSet.txt')
    b, alphas = smoSimple(dataMat, labelMat, 0.6, 0.001, 40)
    w = get_w(dataMat, labelMat, alphas)
    showClassifer(dataMat, w, b)
    end = time.time()
    print('Time used: %f' % (end - start))
# Source note: "Machine Learning in Action" — simplified SMO algorithm.
# (Blog-scrape residue converted to a comment so the file parses;
# original trailer: 最新推荐文章于 2022-07-22 15:16:09 发布)