# -*- coding:UTF-8 -*-
import matplotlib.pyplot as plt
import numpy as np
import random
'''
数据读取
'''
def loadDataSet(fileName):
dataMat = []; labelMat = []
fr = open(fileName,'rb')
for line in fr.readlines():
lineArr = line.decode('utf-8-sig').strip().split('\t')
dataMat.append([float(lineArr[0]),float(lineArr[1])])
labelMat.append(float(lineArr[2]))
# print(dataMat)
# print(labelMat)
return dataMat,labelMat
'''
随机选择alpha
i:alpha下标
m:alpha参数全部个数
'''
def selectJrand(i,m):
j = i
while(j==i):
j = int(random.uniform(0,m)) #选择一个不等于i的j
return j
'''
根据取值范围修剪aj
H:取值上限
L:取值下限
'''
def clipAlaph(aj,H,L):
if aj > H:
aj = H
if L > aj:
aj = L
return aj
'''
简化版SMO
C:松弛变量 toler:容错率 maxIter:最大迭代次数
1、计算误差
2.上下限计算
3、计算学习速率
4、更新alpha_j
5、修剪alpha_j
6、更新alpha_i
7、更新b1、b2
'''
def smoSimple(dataMatIn,classLabels,C,toler,maxIter):
dataMat = np.mat(dataMatIn);labelMat = np.mat(classLabels).transpose() #转换为numpy的mat存储
b=0 #初始化b
m,n = np.shape(dataMat) #统计dataMat的维度
alphas = np.mat(np.zeros((m,1))) #初始化alpha参数为0
iter_num = 0 #初始化迭代次数
while(iter_num < maxIter):
alphaPairsChanged = 0 #用于记录alpha是否已经进行优化
for i in range(m):
#multiply对应元素相乘
fxi = float(np.multiply(alphas,labelMat).T*(dataMat*dataMat[i,:].T)) + b
Ei = fxi - float(labelMat[i]) #计算误差
#优化alpha
if ((labelMat[i]*Ei < -toler) and (alphas[i] < C)) or ((labelMat[i]*Ei > toler) and (alphas[i] > C)):
#随机选择一个alpha_j
j = selectJrand(i,m)
#1、计算Ej误差
fxj = float(np.multiply(alphas, labelMat).T * (dataMat * dataMat[j, :].T)) + b
Ej = fxj - float(labelMat[j])
#保存更新前的alpha,使用深拷贝
alphaIold = alphas[i].copy()
alphaJold = alphas[j].copy()
#2、计算上下界L和H
if(labelMat[i] != labelMat[j]):
L = max(0,alphas[j]-alphas[i])
H = min(C,C+alphas[j]-alphas[i])
else:
L = max(0, alphas[j] + alphas[i] - C)
H = min(C, C + alphas[j] + alphas[i])
if L==H:print('L==H');continue
#3、计算eta
eta = 2.0 * dataMat[i,:]*dataMat[j,:].T - dataMat[i,:]*dataMat[i,:].T - dataMat[j,:]*dataMat[j,:].T
if eta >= 0:
print('eta>=0');continue
#4、更新alphas[j]
alphas[j] -= labelMat[j]*(Ei - Ej)/eta
#修剪alpha[j]
alphas[j] = clipAlaph(alphas[j],H,L)
if (abs(alphas[j]-alphaJold) < 0.00001):
print('alpha[j]变化太小')
continue
#5、更新alpha[i]
alphas[i] += labelMat[j]*labelMat[i]*(alphaJold-alphas[j])
#6、更新b1,b2
b1 = b - Ei - labelMat[i]*(alphas[i]-alphaIold)*dataMat[i,:]*dataMat[i,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMat[i,:]*dataMat[j,:].T
b2 = b - Ej - labelMat[i]*(alphas[i]-alphaIold)*dataMat[i,:]*dataMat[j,:].T - labelMat[j]*(alphas[j]-alphaJold)*dataMat[j,:]*dataMat[j,:].T
#7、更新b
if (0<alphas[i]) and (C >alphas[i]):
b=b1
elif(0<alphas[j]) and (C >alphas[j]):
b=b2
else:
b=(b1+b2)/2
#统计优化次数
alphaPairsChanged += 1
print('第%d次迭代样本:%d,alpha优化次数:%d' %(iter_num,i,alphaPairsChanged))
#更新迭代次数
if(alphaPairsChanged == 0):iter_num+=1
else:iter_num = 0
print('迭代次数:%d' %iter_num)
return b,alphas
'''
函数说明:计算w
'''
def get_w(dataMat,labelMat,alphas):
alphas,dataMat,labelMat = np.array(alphas),np.array(dataMat),np.array(labelMat)
w = np.dot((np.tile(labelMat.reshape(1,-1).T,(1,2))*dataMat).T,alphas)
return w.tolist()
"""
函数说明:数据可视化
Parameters:
dataMat - 数据矩阵
labelMat - 数据标签
"""
def showDataSet(dataMat, labelMat,alphas,w,b):
data_plus = [] #正样本
data_minus = [] #负样本
for i in range(len(dataMat)):
if labelMat[i] > 0:
data_plus.append(dataMat[i])
else:
data_minus.append(dataMat[i])
data_plus_np = np.array(data_plus) #转换为numpy队列
data_minus_np = np.array(data_minus) #转换为numpy队列
plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) #正样本散点图
plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) #负样本散点图
x1 = max(dataMat)[0]
x2 = min(dataMat)[0]
a1,a2 = w
b = float(b)
a1 = float(a1[0])
a2 = float(a2[0])
y1, y2 = (b + a1 * x1), (b + a1 * x2)
plt.plot([x1,x2],[y1,y2])
#找到支持向量
for i,alpha in enumerate(alphas):
if alpha > 0:
x,y = dataMat[i]
plt.scatter([x], [y], s=150, c='none', alpha=0.7, linewidth=1.5, edgecolor='red')
plt.show()
if __name__ == '__main__':
dataMat,labelMat = loadDataSet('G:\dataSet.txt')
b,alphas = smoSimple(dataMat,labelMat,0.6,0.001,80)
w = get_w(dataMat,labelMat,alphas)
showDataSet(dataMat,labelMat,alphas,w,b)
运行结果: