ML-kNN for Multi-label Learning


ML-kNN adapts the k-nearest-neighbor algorithm to multi-label learning by using the maximum a posteriori (MAP) principle to predict the label sets of unseen instances. It is a lazy learning algorithm and belongs to the algorithm-adaptation family of multi-label methods. The algorithm works as follows.
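
For each label $\ell$, ML-kNN combines a prior, estimated from the label's frequency in the training set, with the likelihood of observing a given number of positive neighbors. Write $H_1^\ell$ for the event that an instance carries label $\ell$ (and $H_0^\ell$ for its complement), and $E_k^\ell$ for the event that exactly $k$ of the instance's $K$ nearest neighbors carry label $\ell$. The MAP prediction and the real-valued confidence output are

$$\hat{y}_\ell = \arg\max_{b \in \{0,1\}} P(H_b^\ell)\,P(E_k^\ell \mid H_b^\ell), \qquad f_\ell(x) = \frac{P(H_1^\ell)\,P(E_k^\ell \mid H_1^\ell)}{P(H_1^\ell)\,P(E_k^\ell \mid H_1^\ell) + P(H_0^\ell)\,P(E_k^\ell \mid H_0^\ell)},$$

with Laplace-smoothed estimates over the $m$ training instances (smoothing parameter $s$):

$$P(H_1^\ell) = \frac{s + \sum_{i=1}^{m} [\![\, y_{i\ell} = 1 \,]\!]}{2s + m}, \qquad P(E_k^\ell \mid H_1^\ell) = \frac{s + c_\ell[k]}{s(K+1) + \sum_{p=0}^{K} c_\ell[p]},$$

where $c_\ell[k]$ counts the training instances carrying label $\ell$ whose $K$ nearest neighbors contain exactly $k$ positive instances; $P(E_k^\ell \mid H_0^\ell)$ is estimated the same way from the instances not carrying $\ell$. In the code below, `train_knn` estimates these quantities and `test_knn` applies the posterior formula; thresholding the posterior at 0.5 in `evaluation` is equivalent to the MAP decision.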


# -*- coding: utf-8 -*-
"""
Created on 2017/4/4 8:50

@author: Randolph.Lee
"""
from Evaluation_metrics import (compute_hamming_loss, compute_ranking_loss,
                                compute_one_error, compute_coverage,
                                compute_average_precision)
import scipy.io as scio
import numpy as np
import copy


class MLKNN:

    def __init__(self, num, smooth):
        """
        initialize some parameters of KNN
        :param num: Number of neighbors used in the k-nearest neighbor algorithm
        :param smooth: Smoothing parameter
        """
        self.num = num
        self.smooth = smooth
        self.output = None
        self.hamming_loss = 0.0
        self.ranking_loss = 0.0
        self.one_error = 0.0
        self.coverage = 0.0
        self.average_precision = 0.0

    @staticmethod
    def compute_distance(matrix_a, matrix_b):
        """
        compute the Euclidean distance between the rows of two matrices
        :param matrix_a: the first matrix
        :param matrix_b: the second matrix
        :return: the distance matrix between matrix_a and matrix_b
        """
        num_instance0 = len(matrix_a)
        num_instance1 = len(matrix_b)
        distance_matrix = np.zeros((num_instance0, num_instance1))
        # when a matrix is compared with itself, mask the diagonal so that
        # an instance is never selected as its own nearest neighbor
        same_matrix = matrix_a is matrix_b
        for i in range(num_instance0):
            for j in range(num_instance1):
                distance_matrix[i, j] = np.sqrt(np.power(matrix_a[i, :] - matrix_b[j, :], 2).sum())
            if same_matrix:
                distance_matrix[i, i] = np.inf
        return distance_matrix

    def train_knn(self, training_target, dis_mat):
        """
        compute the terms related to training instances for the test step
        :param training_target: the label set of training instances
        :param dis_mat: distance matrix between training instances
        :return: prior_pos, prior_neg, condition_prob, condition_probN, num_class
        """
        num_instance, num_class = training_target.shape
        '''compute the prior positive probability and negative probability of training instances'''
        prior_pos = np.zeros((1, num_class))
        for i in range(num_class):
            temp = sum(training_target[:, i] == 1)
            prior_pos[0, i] = (self.smooth + temp) / (self.smooth * 2 + num_instance)
        prior_neg = 1 - prior_pos
        
        '''find the k neighbors of each training instance'''
        neighbors = [np.argsort(dis_mat[i, :])[:self.num] for i in range(num_instance)]
        
        '''num_Ci[i, k]: the number of training instances carrying label i whose Num nearest
        neighbors include exactly k instances that also carry label i'''
        '''num_NCi[i, k]: the same count, taken over training instances NOT carrying label i'''
        num_Ci = np.zeros((num_class, self.num + 1))
        num_NCi = np.zeros((num_class, self.num + 1))
        for i in range(num_instance):
            '''temp[j]: the number of the Num nearest neighbors of the ith instance that carry the jth label'''
            neighbor_labels = np.array([training_target[neighbors[i][j], :] for j in range(self.num)])
            temp = [sum(neighbor_labels[:, j] == 1) for j in range(num_class)]
            for j in range(num_class):
                if training_target[i, j] == 1:
                    num_Ci[j, temp[j]] += 1
                else:
                    num_NCi[j, temp[j]] += 1
        
        '''condition_prob: a Qx(Num+1) array; condition_prob[i, k] stores P(E_k|H_1) for the ith label
        (0 <= k <= Num), i.e. the probability that exactly k of the Num nearest neighbors of an
        instance carrying label i also carry label i.
        condition_probN: a Qx(Num+1) array; condition_probN[i, k] stores the corresponding
        probability P(E_k|H_0) for an instance NOT carrying label i.'''
        condition_prob = np.zeros((num_class, self.num + 1))
        condition_probN = np.zeros((num_class, self.num + 1))
        for i in range(num_class):
            for j in range(self.num + 1):
                condition_prob[i, j] = (self.smooth + num_Ci[i, j]) / (self.smooth * (self.num + 1) + num_Ci[i, :].sum())
                condition_probN[i, j] = (self.smooth + num_NCi[i, j]) / (self.smooth * (self.num + 1) + num_NCi[i, :].sum())
        return prior_pos, prior_neg, condition_prob, condition_probN, num_class

    def test_knn(self, train_data, train_target, test_data):
        """
        compute the output
        :param train_data: training instances
        :param train_target: label sets of training instances
        :param test_data: test instances
        :return: None
        """
        dis_train = self.compute_distance(train_data, train_data)
        prior_pos, prior_neg, cond_prob, cond_probN, num_class = self.train_knn(train_target, dis_train)
        dis_train_test = self.compute_distance(test_data, train_data)
        '''find the k neighbors of each testing instance'''
        num_testing = len(test_data)
        neighbors = [np.argsort(dis_train_test[i, :])[:self.num] for i in range(num_testing)]
        '''compute the outputs'''
        outputs = np.zeros((num_testing, num_class))
        for i in range(num_testing):
            neighbor_labels = np.array([train_target[neighbors[i][j], :] for j in range(self.num)])
            temp = [sum(neighbor_labels[:, j] == 1) for j in range(num_class)]
            for j in range(num_class):
                # Bayes' rule: posterior probability that label j is relevant,
                # given that temp[j] of the Num nearest neighbors carry label j
                prob_in = prior_pos[0, j] * cond_prob[j, temp[j]]
                prob_out = prior_neg[0, j] * cond_probN[j, temp[j]]
                if prob_in + prob_out == 0:
                    outputs[i, j] = prior_pos[0, j]
                else:
                    outputs[i, j] = prob_in / (prob_in + prob_out)
        self.output = outputs

    def evaluation(self, test_target):
        """
        compute the different evaluation metrics
        :param test_target: the label set of testing instances
        :return: None
        """
        pre_labels = copy.deepcopy(self.output)
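        # thresholding the posterior at 0.5 gives the MAP label decision
        # (the argmax over the two hypotheses described above)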
        pre_labels[pre_labels > 0.5] = 1
        pre_labels[pre_labels <= 0.5] = 0
        self.hamming_loss = compute_hamming_loss(pre_labels, test_target)
        self.ranking_loss = compute_ranking_loss(self.output, test_target)
        self.one_error = compute_one_error(self.output, test_target)
        self.coverage = compute_coverage(self.output, test_target)
        self.average_precision = compute_average_precision(self.output, test_target)


if __name__ == "__main__":
    # load the sample data from a .mat file
    path = r"D:\Randolph\Learning Programming\myPython\Thesis_reading\datasets\sample_data.mat"
    data = scio.loadmat(path)
    # set the basic parameters
    num = 10
    smooth = 1
    ml_knn = MLKNN(num, smooth)
    ml_knn.test_knn(train_data=data["train_data"], train_target=data["train_target"].transpose(), test_data=data["test_data"])
    ml_knn.evaluation(test_target=data["test_target"].transpose())
    # print the results of ML-kNN
    print("average_precision: ", ml_knn.average_precision)
    print("ranking_loss: ", ml_knn.ranking_loss)
    print("one_error: ", ml_knn.one_error)
    print("hamming_loss: ", ml_knn.hamming_loss)
    print("coverage: ", ml_knn.coverage)
    output = ml_knn.output
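
The metric functions imported from `Evaluation_metrics` belong to the author's own module and are not shown in the post. As an illustration only, here is a minimal sketch of what `compute_hamming_loss` could look like, assuming it implements the standard definition (the fraction of instance-label pairs predicted incorrectly) on binary 0/1 matrices; the actual module's signatures and label encoding may differ.

import numpy as np


def compute_hamming_loss(pre_labels, test_target):
    """
    hamming loss: the fraction of instance-label pairs that are misclassified
    (a minimal sketch; assumes both arguments are num_instances x num_labels
    arrays with entries in {0, 1} -- datasets that encode negatives as -1
    would need to be mapped to 0/1 first)
    """
    pre_labels = np.asarray(pre_labels)
    test_target = np.asarray(test_target)
    return float(np.mean(pre_labels != test_target))

The remaining four metrics (ranking loss, one-error, coverage, average precision) are ranking-based and operate on the real-valued scores in `self.output` rather than on the thresholded labels, which is why `evaluation` above passes `pre_labels` to the hamming loss only.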
