SVDD试图学习一个包含单类样本点的超球边界,该边界既要包含足够多的样本点,又要使得球体的半径尽可能小,而这两点往往相互矛盾,因此,SVDD算法在这两个要求之间寻找一个人为设定的均衡。
网络上有不少关于SVDD算法的解读,包括论文和概念介绍,但SVDD方法的代码在matlab库中的案例相对较多,而在python库中相对较少,更多的是关于OneClassSVM的代码,可参照sklearn的svm算法的案例。
在这里记录一组python实现的svdd算法,并结合鸢尾花数据进行单类分类,仅供参考,完整代码见文末说明。
一、数据预处理
1. 首先加载所有的训练集和测试集数据:
import numpy as np
def load_train_data(path, minIndex, maxIndex, dim=4):
    """Load training rows [minIndex, maxIndex) from a CSV data file.

    Each line has the form "f1,f2,...,fk,label"; the trailing label
    field is discarded and only the first `dim` feature columns are kept.

    :param path: path to the CSV data file
    :param minIndex: first (inclusive) row index to keep
    :param maxIndex: last (exclusive) row index to keep
    :param dim: number of leading feature columns to return
    :return: ndarray of shape (kept_rows, dim)
    """
    rows = []
    with open(path) as handle:
        for row_index, line in enumerate(handle):
            if line == "\n":  # a blank line marks the end of the data
                break
            if minIndex <= row_index < maxIndex:
                fields = line.split(',')
                fields.pop()  # discard the trailing label field
                rows.append([float(value) for value in fields])
    return np.array(rows)[:, 0:dim]
def load_test_data(path, type_num=0, minIndex=0, maxIndex=150, dim=4):
    """Load test rows [minIndex, maxIndex) together with one-class labels.

    A row is labelled 1 when its global index falls inside the block of
    50 consecutive samples belonging to class `type_num`, otherwise 0.

    :param path: path to the CSV data file
    :param type_num: index of the target class, in [0, 2]
    :param minIndex: first (inclusive) row index to keep
    :param maxIndex: last (exclusive) row index to keep
    :param dim: number of leading feature columns to return
    :return: (ndarray of shape (kept_rows, dim), list of 0/1 labels)
    """
    rows = []
    labels = []
    with open(path) as handle:
        for row_index, line in enumerate(handle):
            if line == "\n":  # a blank line marks the end of the data
                break
            if minIndex <= row_index < maxIndex:
                fields = line.split(',')
                fields.pop()  # discard the trailing label field
                rows.append([float(value) for value in fields])
                in_target = type_num * 50 <= row_index < (type_num + 1) * 50
                labels.append(1 if in_target else 0)
    return np.array(rows)[:, 0:dim], labels
2. 设置模型的参数
# --- hyper-parameters and bookkeeping for the SVDD model ---
type_num = 0  # which iris class (0-2) is treated as the target class
dim = 4  # number of feature dimensions used
C = 0.1  # penalty factor bounding each multiplier (0.6 was also tried)
toler = 0.0001  # tolerance used in the KKT condition check
maxIter = 40  # consecutive no-change sweeps required before SMO stops
best_acc = 0  # best accuracy observed over repeated runs
best_a = 0  # hypersphere centre of the best run
best_r = 0  # hypersphere radius of the best run
best_label = []  # predicted labels of the best run
3. 加载模型的所需数据
# Train on the first 30 samples (class 0); test on the remaining 120 rows.
train_data = load_train_data('iris.data',
                             0, 30, dim=dim)
test_data, correct_label = load_test_data(
    'iris.data', type_num, 30, 150, dim=dim)
# Notebook-style inspection expression: shows shapes and a label preview.
train_data.shape, test_data.shape, len(correct_label), correct_label[:35],
# correct_label[-5:]
二、 SVDD算法
a为超球中心,R为半径。
import numpy as np
import random
def meet_limit_condition(alpha_i, data_i, a, R, C, toler):
    """Check whether alphas[i] violates the KKT conditions (is optimisable).

    :param alpha_i: current multiplier alphas[i]
    :param data_i: sample data_array[i]
    :param a: centre of the hypersphere
    :param R: radius of the hypersphere
    :param C: penalty factor
    :param toler: tolerance
    :return: True when the sample is eligible for optimisation
    """
    # Signed error: positive when the sample lies strictly inside the sphere.
    offset = data_i - a
    error = R ** 2 - np.dot(offset, offset)
    outside_but_clampable = error < -toler and alpha_i < C
    inside_but_active = error > toler and alpha_i > 0
    return outside_but_clampable or inside_but_active
def selectJrand(i, m):
    """Pick a random index in [0, m) different from i.

    Args:
        i: index of the first alpha
        m: total number of alphas
    Returns:
        j: a random integer in [0, m) with j != i
    """
    candidate = i
    # Resample until the candidate differs from i.
    while candidate == i:
        candidate = int(random.uniform(0, m))
    return candidate
def calculate_alpha_j(data_array, alphas, i, j, a):
    """Solve the one-variable sub-problem for alphas[j].

    data_array: training samples
    alphas: current multiplier values
    i, j: indices of the pair being optimised
    a: current hypersphere centre
    Returns (new_alpha_j, valid); valid is False when the denominator
    vanishes (the two samples coincide).
    """
    centre = np.array(a)
    xi = np.array(data_array[i])
    xj = np.array(data_array[j])
    kij = np.dot(xi, xj)
    kii = np.dot(xi, xi)
    kjj = np.dot(xj, xj)
    numerator = (np.dot(centre, xj) - kjj - np.dot(centre, xi) + kii
                 + alphas[i] * (kij + kii)
                 + alphas[j] * (kii - kjj + 3 * kij))
    denominator = 2 * (kii + kjj - 2 * kij)
    if denominator == 0:  # degenerate pair: no unique optimum
        return 0, False
    return -numerator / denominator, True
def calculate_alpha_i(alphas, i):
    """Recompute alphas[i] so the multipliers satisfy sum(alphas) == 1.

    alphas: multiplier array (alphas[i] holds a stale value)
    i: index of the multiplier to recompute
    Returns the new alphas[i].
    """
    sum_of_others = alphas.sum() - alphas[i]
    return 1 - sum_of_others
def smo(train_data, C=0.6, toler=0.001, maxIter=40):
    """Simplified SMO solver for the SVDD dual problem.

    Repeatedly sweeps the samples; whenever one violates the KKT
    conditions, a random partner is picked and the pair of multipliers
    is optimised jointly (their sum is kept constant so that
    sum(alphas) == 1 is preserved).

    :param train_data: (m, n) array of target-class samples
    :param C: penalty factor bounding each multiplier
    :param toler: KKT tolerance
    :param maxIter: consecutive full sweeps without any update required
                    before the solver stops
    :return: (a, R) — hypersphere centre and radius
    """
    data_array = np.array(train_data)
    m, n = np.shape(data_array)
    # Start from the feasible uniform point: all alphas equal, sum == 1.
    alphas = np.array([1 / m] * m)
    R = 0
    a = np.array([0.0] * n)
    for i in range(m):
        a += alphas[i] * data_array[i]
    # Renamed from `iter`, which shadowed the builtin of the same name.
    iteration = 0
    while iteration < maxIter:
        changed_flag = 0
        for i in range(m):
            if meet_limit_condition(alphas[i], data_array[i], a, R, C, toler):
                j = selectJrand(i, m)
                # Clip bounds keep alphas[i] + alphas[j] constant with
                # both multipliers inside [0, C].
                L = max(0, alphas[i] + alphas[j] - C)
                H = min(C, alphas[i] + alphas[j])
                if L == H:
                    continue
                new_alpha_j, valid = calculate_alpha_j(
                    data_array, alphas, i, j, a)
                if not valid:  # degenerate pair (identical samples)
                    continue
                if new_alpha_j < L:
                    new_alpha_j = L
                elif new_alpha_j > H:
                    new_alpha_j = H
                if abs(new_alpha_j - alphas[j]) < 0.001:
                    continue  # negligible progress; skip the update
                alphas[j] = new_alpha_j
                alphas[i] = calculate_alpha_i(alphas, i)
                changed_flag += 1
                a, R = calculate_a_and_R(data_array, alphas, i, j, C)
        # Count consecutive sweeps with no change; any change resets it.
        if changed_flag == 0:
            iteration += 1
        else:
            iteration = 0
    return a, R
def check_alphas(alphas, C):
    """Sanity-check the multipliers (debug helper).

    Prints a warning for any multiplier outside [0, C] and for a sum
    that deviates from 1.

    :param alphas: multiplier array
    :param C: penalty factor
    :return: True when the multipliers sum to 1 within 1e-4, else False
    """
    total = 0
    for idx in range(alphas.shape[0]):
        if alphas[idx] < -0.0001:
            print(f"alphas{idx}:{alphas[idx]} < 0")
        if alphas[idx] > C + 0.0001:
            print(f"alphas{idx}:{alphas[idx]} > C")
        total += alphas[idx]
    if abs(total - 1) > 0.0001:
        print("alphas sum != 1")
        return False
    return True
def calculate_a_and_R(data_array, alphas, i, j, C):
    """Recompute the hypersphere centre and radius.

    The centre is the alpha-weighted sum of the samples.  The radius is
    the distance from the centre to a free support vector — a sample
    whose multiplier lies strictly inside (0, C).  When neither of the
    two candidate samples is free, the two distances are averaged.

    :param data_array: (m, n) training samples
    :param alphas: multiplier array
    :param i: index of the first updated multiplier
    :param j: index of the second updated multiplier
    :param C: penalty factor
    :return: (a, R) — centre vector and radius
    """
    # Vectorized centre: a = sum_l alphas[l] * x_l.  The original built
    # this with a Python loop starting from a plain list, relying on
    # ndarray.__radd__ to coerce it — fragile and O(m) in Python.
    a = np.dot(np.asarray(alphas), np.asarray(data_array))
    R1 = np.sqrt(np.dot(data_array[i] - a, data_array[i] - a))
    R2 = np.sqrt(np.dot(data_array[j] - a, data_array[j] - a))
    if 0 < alphas[i] < C:
        R = R1
    elif 0 < alphas[j] < C:
        R = R2
    else:
        R = (R1 + R2) / 2.0
    return a, R
三、 测试与结果
代码如下:
# import load_data
# import one_class_svm
import numpy as np
from matplotlib import pyplot as plt
def judge(test_data, a, R):
    """Label each sample: 1 if inside or on the hypersphere, else 0.

    :param test_data: (m, n) array of samples
    :param a: hypersphere centre
    :param R: hypersphere radius
    :return: list of 0/1 predictions
    """
    squared_radius = R ** 2
    predictions = []
    for sample in test_data:
        offset = sample - a
        predictions.append(1 if np.dot(offset, offset) <= squared_radius else 0)
    return predictions
def calculate_acc(result_label, correct_label):
    """Return the fraction of positions where the two label lists agree.

    :return: accuracy in [0, 1], or -1 when the lengths differ
    """
    if len(result_label) != len(correct_label):
        return -1
    matches = sum(1 for predicted, actual in zip(result_label, correct_label)
                  if predicted == actual)
    return matches / len(result_label)
def draw_picture(train_data, test_data, correct_label, a, R, C, toler, acc):
    """Scatter the data (first two dims) and overlay the learned circle."""
    plt.figure()
    # Test points coloured by their true label; training points in red.
    plt.scatter(test_data[:, 0], test_data[:, 1], c=correct_label)
    plt.scatter(train_data[:, 0], train_data[:, 1], c='r')
    title = ("C = " + str(C) + " toler = " + str(toler) +
             " R = " + str(R)[0:4] + " acc = " + str(acc)[:4])
    plt.title(title)
    # Circle of radius R centred at (a[0], a[1]).
    angles = np.arange(0, 2 * np.pi, 0.01)
    plt.plot(a[0] + R * np.cos(angles), a[1] + R * np.sin(angles))
    plt.show()
以鸢尾花数据为例进行举例:
def adjust_param(data_path='data', type_num=0, runs=50):
    """Run the randomised SMO solver several times and report the best model.

    :param data_path: path to the iris-format CSV file
    :param type_num: which class (0-2) is treated as the target class
    :param runs: number of independent SMO runs to average over
    :raises ValueError: if type_num is not 0, 1 or 2
    """
    dim = 4
    C = 0.04  # penalty factor (0.6 was also tried)
    toler = 0.0001
    maxIter = 40
    best_acc = 0
    best_a = 0
    best_r = 0
    best_label = []
    # --- data preparation: train on 30 in-class rows, test on the rest ---
    if type_num == 0:
        train_data = load_train_data(data_path, 0, 30, dim=dim)
        test_data, correct_label = load_test_data(
            data_path, type_num, 30, 150, dim=dim)
    elif type_num == 1:
        train_data = load_train_data(data_path, 50, 80)
        test_data1, correct_label1 = load_test_data(
            data_path, type_num, 80, 150, dim=dim)
        test_data2, correct_label2 = load_test_data(
            data_path, type_num, 0, 50, dim=dim)
        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))
    elif type_num == 2:
        train_data = load_train_data(data_path, 100, 130, dim=dim)
        test_data1, correct_label1 = load_test_data(
            data_path, type_num, 130, 150, dim=dim)
        test_data2, correct_label2 = load_test_data(
            data_path, type_num, 0, 100, dim=dim)
        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))
    else:
        # The original silently fell through with train_data undefined.
        raise ValueError("type_num must be 0, 1 or 2")
    min_acc = 2
    avrg_acc = 0
    max_acc = -1
    for _ in range(runs):
        a, R = smo(train_data, C, toler, maxIter)
        result_label = judge(test_data, a, R)
        acc = calculate_acc(result_label, correct_label)
        if acc > best_acc:
            best_acc = acc
            best_a = a
            best_r = R
            best_label = result_label
        avrg_acc += acc
        min_acc = min(min_acc, acc)
        max_acc = max(max_acc, acc)
    # BUG FIX: the original divided by 100 although the loop ran 50 times,
    # halving the reported average; divide by the actual run count.
    avrg_acc /= runs
    print("train type:" + str(type_num) + ", dim=" +
          str(dim) + " => best acc = " + str(max_acc))
    print("avg acc = " + str(avrg_acc) + ", min acc = " + str(min_acc))
    print("model: a="+str(best_a)+", R="+str(best_r) + ",C="+str(C))
    print("label(0-20:positive sample):")
    print(best_label)
    draw_picture(train_data, test_data, correct_label,
                 best_a, best_r, C, toler, best_acc)
if __name__ == "__main__":
    adjust_param()
输出结果表示如下:
对其中两个维度分类结果如下:
分类准确度还是较高的,若要进一步提升该结果,可在核函数和优化方法上进行改进,这里不再继续展开。
上述代码基本可实现svdd基础功能,若需要整体源码,可以考虑上传。