单类分类器 - SVDD算法(Python)

SVDD试图学习一个包含单类样本点的超球边界,该边界既要满足包含足够多的的点,又要使得球体的半径要足够多的的少,而这两点往往又比较矛盾,因此,SVDD算法在这两个要求之间寻找一个人为的均衡。

网络上相关SVDD算法的解读,包括论文和概念,但是SVDD的方法代码在matlab库中的案例相对较多,而在python库中相对较少,更多的相关于OneClassSVM的代码,可参照sklearn的svm算法的案例。

在这里记录一组python代码的svdd算法,并结合鸢尾花数据进行单类分类,仅供参考,完整代码可参考:。

一、数据预处理

1. 首先加载所有的训练集和测试集数据:

import numpy as np


def load_train_data(path, minIndex, maxIndex, dim=4):
    """

    :param path:
    :param minIndex:
    :param maxIndex:
    :return:
    """
    data = []

    i = -1
    with open(path) as f:
        for line in f.readlines():  # get data by lines
            if line == "\n":  # empty line
                break
            i += 1
            if minIndex <= i < maxIndex:
                feature = line.split(',')
                feature.pop()  # abandon last element, because it is label
                data.append(list(map(float, feature)))  # append to data

    return np.array(data)[:, 0:dim]


def load_test_data(path, type_num=0, minIndex=0, maxIndex=150, dim=4):
    """
    :param path:数据集路径
    :param type_num: 训练集的类型,范围[0, 2]
    :param minIndex: 测试集的范围[minIndex, maxIndex)
    :param maxIndex: 测试集的范围[minIndex, maxIndex)
    :return: 测试集,标签
    """
    data = []
    label = []

    i = -1
    with open(path) as f:
        for line in f.readlines():  # get data by lines
            if line == "\n":  # empty line
                break
            i += 1
            if minIndex <= i < maxIndex:
                feature = line.split(',')
                feature.pop()  # abandon last element, because it is label
                data.append(list(map(float, feature)))  # append to data

                if type_num * 50 <= i < (type_num + 1) * 50:
                    label.append(1)
                else:
                    label.append(0)

    return np.array(data)[:, 0:dim], label

2. 设置模型的参数

    type_num = 0
    dim = 4
    C = 0.1 #0.6
    toler = 0.0001
    maxIter = 40

    best_acc = 0
    best_a = 0
    best_r = 0
    best_label = []

3. 加载模型的所需数据

train_data = load_train_data('iris.data',
             0, 30, dim=dim)
test_data, correct_label = load_test_data(
            'iris.data', type_num, 30, 150, dim=dim)
train_data.shape, test_data.shape, len(correct_label), correct_label[:35], 
# correct_label[-5:]

二、 SVDD算法

a为超球中心,R为半径。

import numpy as np
import random


def meet_limit_condition(alpha_i, data_i, a, R, C, toler):
    """
    测试alphas[i]是否满足优化条件
    :param alpha_i:alphas[i]
    :param data_i:data_array[i]
    :param a:中心点
    :param R:半径
    :param C:惩罚因子
    :param toler:容忍度
    :return:满足优化条件则返回True,否则返回False
    """
    # if abs(R ** 2 - np.dot((data_i - a), (data_i - a))) > toler and 0 < alpha_i < C:
    Ei = R ** 2 - np.dot((data_i - a), (data_i - a))
    if (Ei < -toler and alpha_i < C) or (Ei > toler and alpha_i > 0):
        return True
    else:
        return False


def selectJrand(i, m):
    """
    随机选择一个整数
    Args:
        i  第一个alpha的下标
        m  所有alpha的数目
    Returns:
        j  返回一个不为i的随机数,在0~m之间的整数值
    """
    j = i
    while j == i:
        j = int(random.uniform(0, m))
    return j


def calculate_alpha_j(data_array, alphas, i, j, a):
    """
    data_array: 测试集
    alphas: 旧的alphas值
    i, j: 当前选出的将要进行优化的alpha的下标
    返回值: 新的alphas[j]值
    """

    a1 = np.array(a)
    x1 = np.array(data_array[i])
    x2 = np.array(data_array[j])

    x12 = np.dot(x1, x2)
    x1_2 = np.dot(x1, x1)
    x2_2 = np.dot(x2, x2)

    nu = np.dot(a1, x2) - x2_2 - np.dot(a1, x1) + x1_2 + \
        alphas[i] * (x12 + x1_2) + alphas[j] * (x1_2 - x2_2 + 3 * x12)
    de = 2 * (x1_2 + x2_2 - 2 * x12)

    if de == 0:
        return 0, False

    return -nu / de, True


def calculate_alpha_i(alphas, i):
    """
    alphas: 新的alpha数组
    i: 要更新的alpha值的下标
    返回值: 新的alphas[i]
    """
    alpha_sum = alphas.sum() - alphas[i]
    return 1 - alpha_sum


def smo(train_data, C=0.6, toler=0.001, maxIter=40):
    data_array = np.array(train_data)
    m, n = np.shape(data_array)

    alphas = np.array([1 / m] * m)
    R = 0

    a = np.array([0.0] * n)
    for i in range(m):
        a += alphas[i] * data_array[i]

    iter = 0
    while iter < maxIter:
        changed_flag = 0
        for i in range(m):
            if meet_limit_condition(alphas[i], data_array[i], a, R, C, toler):
                j = selectJrand(i, m)

                L = max(0, alphas[i] + alphas[j] - C)
                H = min(C, alphas[i] + alphas[j])
                if L == H:
                    continue

                new_alpha_j, valid = calculate_alpha_j(
                    data_array, alphas, i, j, a)
                if not valid:
                    continue

                if new_alpha_j < L:
                    new_alpha_j = L
                elif new_alpha_j > H:
                    new_alpha_j = H

                if abs(new_alpha_j - alphas[j]) < 0.001:
                    continue
                else:
                    alphas[j] = new_alpha_j
                    alphas[i] = calculate_alpha_i(alphas, i)
                    changed_flag += 1

                # check_alphas(alphas, C)

                a, R = calculate_a_and_R(data_array, alphas, i, j, C)

        if changed_flag == 0:
            iter += 1
        else:
            iter = 0

    return a, R


def check_alphas(alphas, C):
    """
    检测alphas是否符合要求
    :param alphas:alphas
    :param C:惩罚因子
    :return:符合返回True,否则返回False
    """
    a_sum = 0
    for i in range(alphas.shape[0]):
        if alphas[i] < -0.0001:
            print("alphas" + str(i) + ":" + str(alphas[i]) + " < 0")
        if alphas[i] > C + 0.0001:
            print("alphas" + str(i) + ":" + str(alphas[i]) + " > C")
        a_sum += alphas[i]

    if abs(a_sum - 1) > 0.0001:
        print("alphas sum != 1")
        return False
    else:
        return True


def calculate_a_and_R(data_array, alphas, i, j, C):
    """
    计算a, R
    :param data_array:
    :param alphas:
    :param i:
    :param j:
    :param C:
    :return:
    """
    m, n = np.shape(data_array)
    a = [0] * n
    for l in range(m):
        a += data_array[l] * alphas[l]

    R1 = np.sqrt(np.dot(data_array[i] - a, data_array[i] - a))
    R2 = np.sqrt(np.dot(data_array[j] - a, data_array[j] - a))
    if 0 < alphas[i] < C:
        R = R1
    elif 0 < alphas[j] < C:
        R = R2
    else:
        R = (R1 + R2) / 2.0

    return a, R

三、 测试与结果

代码如下:

# import load_data
# import one_class_svm
import numpy as np
from matplotlib import pyplot as plt


def judge(test_data, a, R):
    m, n = np.shape(test_data)
    label = []
    for i in range(m):
        if np.dot(test_data[i] - a, test_data[i] - a) <= R ** 2:
            label.append(1)
        else:
            label.append(0)

    return label


def calculate_acc(result_label, correct_label):
    """
    返回result_label与correct_label相同的比例
    :return: acc = (true positive + false positive)/all
    """
    if len(result_label) != len(correct_label):
        return -1

    acc = 0
    for i in range(len(result_label)):
        if result_label[i] == correct_label[i]:
            acc += 1

    return acc / len(result_label)


def draw_picture(train_data, test_data, correct_label, a, R, C, toler, acc):
    plt.figure()
    plt.scatter(test_data[:, 0], test_data[:, 1], c=correct_label)
    plt.scatter(train_data[:, 0], train_data[:, 1], c='r')
    plt.title("C = " + str(C) + " toler = " + str(toler) +
              " R = " + str(R)[0:4] + " acc = " + str(acc)[:4])
    theta = np.arange(0, 2 * np.pi, 0.01)
    x = a[0] + R * np.cos(theta)
    y = a[1] + R * np.sin(theta)

    plt.plot(x, y)
    plt.show()

以鸢尾花数据为例进行举例:

def adjust_param():
    # 以第几类数据作为训练集
    type_num = 0
    dim = 4
    C = 0.04 #0.6
    toler = 0.0001
    maxIter = 40

    best_acc = 0
    best_a = 0
    best_r = 0
    best_label = []

    # 数据预处理
    if type_num == 0:
        train_data = load_train_data('data', 0, 30, dim=dim)
        test_data, correct_label = load_test_data(
            'data', type_num, 30, 150, dim=dim)
    elif type_num == 1:
        train_data = load_train_data('data', 50, 80)

        test_data1, correct_label1 = load_test_data(
            'data', type_num, 80, 150, dim=dim)
        test_data2, correct_label2 = load_test_data(
            'data', type_num, 0, 50, dim=dim)

        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))
    elif type_num == 2:
        train_data = load_train_data('data', 100, 130, dim=dim)

        test_data1, correct_label1 = load_test_data(
            'data', type_num, 130, 150, dim=dim)
        test_data2, correct_label2 = load_test_data(
            'data', type_num, 0, 100, dim=dim)

        test_data = np.vstack((test_data1, test_data2))
        correct_label = np.hstack((correct_label1, correct_label2))

    min_acc = 2
    avrg_acc = 0
    max_acc = -1
    for i in range(50):
        a, R = smo(train_data, C, toler, maxIter)
        result_label = judge(test_data, a, R)
        acc = calculate_acc(result_label, correct_label)
        if acc > best_acc:
            best_acc = acc
            best_a = a
            best_r = R
            best_label = result_label

        avrg_acc += acc

        if acc < min_acc:
            min_acc = acc
        if acc > max_acc:
            max_acc = acc

        #print("accuracy: " + str(acc))

    avrg_acc /= 100
    print("train type:" + str(type_num) + ", dim=" +
          str(dim) + " => best acc = " + str(max_acc))
    print("model: a="+str(best_a)+", R="+str(best_r) + ",C="+str(C))
    print("label(0-20:positive sample):")
    print(best_label)
    draw_picture(train_data, test_data, correct_label,
                 best_a, best_r, C, toler, best_acc)


if __name__ == "__main__":
    adjust_param()

输出结果表示如下:

对其中两个维度分类结果如下:

分类准确度还是较高的,该结果若提升,可在核函和优化方法上进行改进,这里不继续写了。

上述代码基本可实现svdd基础功能,若需要整体源码,可以考虑上传。

 

  • 9
    点赞
  • 93
    收藏
    觉得还不错? 一键收藏
  • 11
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 11
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值