Shark源码分析（十）：KNN算法_shark加密算法源码-CSDN博客

本文链接：https://blog.csdn.net/tuqinag/article/details/54743707

Shark源码分析（十）：KNN算法

关于这个算法，我之前已经有博客详细介绍过。虽然这个算法看上去非常简单，但是在搜索k个最近邻居数据点时，还是非常具有技巧性的。这里还是有必要再次强调一下。如果输入数据的维度不高，可以使用树形结构（kd树）来加快查找的速度。如果输入数据的维度较高，则利用树形结构查找的速度与直接计算两两数据间距离的速度并不会有太大的差别。之后我们要介绍的代码也是利用kd树来组织的。

在计算距离时，不仅可以选择欧几里得距离，同样可以选择基于核函数的距离。同样地，也有基于核函数距离的kd树。

BinaryTree类

这个类不是我们通常所认为的二叉树的结点类，而是表示binary space-partitioning tree 的结点。在每一个父结点处，表示将当前的空间分为两个子空间。这个分隔，不仅允许线性地分隔，同样也可以使用基于核函数的分隔。该类定义在<include/shark/Models/Trees/BinaryTree.h>。

/// \brief Node of a binary space-partitioning tree over points of type InputT.
///
/// This is not a generic binary-tree node: every inner node stands for a
/// region of space that is divided into two sub-regions. The division is
/// made by comparing a scalar function of the point (see funct()) with a
/// per-node threshold, so subclasses can implement linear as well as
/// kernel-based separations.
///
/// Ownership: a node deletes both of its children. The index array is deleted
/// only by a node without a parent (the root); nodes created through the
/// protected constructor merely borrow the pointer they are given.
///
/// NOTE(review): the class holds owning raw pointers but relies on the
/// compiler-generated copy operations; copying a node would lead to a double
/// delete. Callers are presumably expected never to copy trees -- confirm.
template <class InputT>
class BinaryTree
{
public:
    typedef InputT value_type;

    /// \brief Creates a root node responsible for \a size points.
    ///
    /// Allocates the index array and fills it with 0, 1, ..., size-1.
    /// \param size number of points managed by the tree; must be > 0.
    BinaryTree(std::size_t size)
    : mep_parent(NULL)
    , mp_left(NULL)
    , mp_right(NULL)
    , mp_indexList(NULL)
    , m_size(size)
    , m_nodes(0)
    , m_threshold(0.0)
    {
        SHARK_ASSERT(m_size > 0);

        mp_indexList = new std::size_t[m_size];
        boost::iota(boost::make_iterator_range(mp_indexList,mp_indexList+m_size),0);
    }

    /// \brief Destroys the sub-tree below this node; the root also frees the index array.
    virtual ~BinaryTree()
    {
        // Deleting a null pointer is a no-op, so no explicit checks are needed.
        delete mp_left;
        delete mp_right;
        // Only the root (the node without a parent) owns the index array.
        if (mep_parent == NULL) delete [] mp_indexList;
    }

    /// \brief Parent node, or NULL for the root.
    BinaryTree* parent()
    { return mep_parent; }

    /// \brief Parent node, or NULL for the root (const access).
    const BinaryTree* parent() const
    { return mep_parent; }

    /// \brief True if the node has a left child.
    bool hasChildren() const
    { return (mp_left != NULL); }

    /// \brief True if the node has no left child.
    bool isLeaf() const
    { return (mp_left == NULL); }

    /// \brief Left child node (NULL if absent).
    BinaryTree* left()
    { return mp_left; }

    /// \brief Left child node (NULL if absent), const access.
    const BinaryTree* left() const
    { return mp_left; }

    /// \brief Right child node (NULL if absent).
    BinaryTree* right()
    { return mp_right; }

    /// \brief Right child node (NULL if absent), const access.
    const BinaryTree* right() const
    { return mp_right; }

    /// \brief Number of points stored in this node.
    std::size_t size() const
    { return m_size; }

    /// \brief Value of the node counter of this sub-tree.
    std::size_t nodes() const
    { return m_nodes; }

    /// \brief Entry number \a point of this node's index list.
    ///
    /// No bounds checking is performed.
    std::size_t index(std::size_t point)const{
        return mp_indexList[point];
    }

    /// \brief Signed offset of \a point from the separating threshold: funct(point) - threshold.
    double distanceFromPlane(value_type const& point) const{
        return funct(point) - m_threshold;
    }

    /// \brief Threshold used to separate the two sub-regions.
    double threshold() const{
        return m_threshold;
    }

    /// \brief True if \a point falls into the left sub-region.
    ///
    /// Unlike left(), which returns the left child node, this is a query about
    /// the point: it is "left" when funct(point) is strictly below the threshold.
    bool isLeft(value_type const& point) const
    { return (funct(point) < m_threshold); }

    /// \brief True if \a point falls into the right sub-region (funct(point) >= threshold).
    bool isRight(value_type const& point) const
    { return (funct(point) >= m_threshold); }

    /// \brief Kernel function used as the distance measure, if any.
    ///
    /// The default implementation returns NULL, meaning that no kernel metric
    /// is used; kernel-based subclasses are expected to override it.
    virtual AbstractKernelFunction<value_type> const* kernel()const{
        //default is no kernel metric
        return NULL;
    }

    /// \brief Lower bound on the squared distance between \a point and the region of this node.
    ///
    /// A tighter bound (e.g. obtained through the triangle inequality) lets a
    /// nearest-neighbour search prune more of the tree and therefore run faster.
    virtual double squaredDistanceLowerBound(value_type const& point) const = 0;

protected:
    /// \brief Creates a non-root node that borrows an existing index list.
    ///
    /// \param parent parent node of the new node
    /// \param list   index list of the new node; it is not owned by the node
    ///               unless \a parent is NULL (see the destructor)
    /// \param size   number of entries of \a list that belong to the node
    BinaryTree(BinaryTree* parent, std::size_t* list, std::size_t size)
    : mep_parent(parent)
    , mp_left(NULL)
    , mp_right(NULL)
    , mp_indexList(list)
    , m_size(size)
    , m_nodes(0)
    // Initialised so that threshold()/isLeft()/isRight() never read an
    // indeterminate value on nodes for which splitList() was not called.
    , m_threshold(0.0)
    {}

    /// \brief Scalar function of \a point that is compared with the threshold
    /// to decide on which side of the separation the point lies.
    virtual double funct(value_type const& point) const = 0;

    /// \brief Splits the data of this node into two parts and sets the threshold.
    ///
    /// \param values scalar values the split is based on (Range1)
    /// \param points the data points that belong to \a values (Range2)
    /// \return iterator into \a points that marks the split position
    ///
    /// If the partitioning fails (the split position is the end of \a values),
    /// the threshold is set to the first value and the end-side iterator of
    /// \a points is returned.
    template<class Range1, class Range2>
    typename boost::range_iterator<Range2>::type splitList (Range1& values, Range2& points){
        typedef typename boost::range_iterator<Range1>::type iterator1;
        typedef typename boost::range_iterator<Range2>::type iterator2;

        iterator1 valuesBegin = boost::begin(values);
        iterator1 valuesEnd = boost::end(values);

        // partitionEqually divides the whole range into two parts whose sizes
        // are as equal as possible.
        std::pair<iterator1, iterator2> splitpoint = partitionEqually(zipKeyValuePairs(values,points)).iterators();
        iterator1 valuesSplitpoint = splitpoint.first;
        iterator2 pointsSplitpoint = splitpoint.second;
        if (valuesSplitpoint == valuesEnd) {
            // partitioning failed, all values are equal :(
            m_threshold = *valuesBegin;
            return splitpoint.second;
        }

        // We don't want the threshold to be the value of an element but always in between two of them.
        // This ensures that no point of the training set lies on the boundary. This leads to more stable
        // results. So we use the mean of the found splitpoint and the nearest point on the other side
        // of the boundary.
        double maximum = *std::max_element(valuesBegin, valuesSplitpoint);
        m_threshold = 0.5*(maximum + *valuesSplitpoint);

        return pointsSplitpoint;
    }

    /// Parent node (not owned); NULL for the root.
    BinaryTree* mep_parent;

    /// Left child node (owned).
    BinaryTree* mp_left;

    /// Right child node (owned).
    BinaryTree* mp_right;

    /// Indices of the data points that belong to this node.
    /// The root allocates the array; other nodes receive a pointer to a list
    /// they do not own.
    std::size_t* mp_indexList;

    /// Number of data points in this node.
    std::size_t m_size;

    /// Number of nodes of the sub-tree rooted at this node. It is initialised
    /// to 0 here and is presumably maintained by subclasses while the tree is
    /// built -- TODO confirm.
    std::size_t m_nodes;

    /// Threshold that separates the two sub-regions.
    double m_threshold;

};

TreeConstruction类

这个类表示的是树构造的停止条件，停止条件可以是树的最大深度，或是叶子结点中所包含数据的最大个数。该类的定义位置与BinaryTree是一样的。

/// \brief Stopping criteria for the construction of a tree.
///
/// Holds two limits: a maximum tree depth and a maximum bucket size
/// (going by the name, presumably the largest number of points a leaf may
/// hold). A value of 0 passed to the two-argument constructor selects the
/// respective default: 0xffffffff for the depth and 1 for the bucket size.
class TreeConstruction
{
public:
    /// \brief Default limits: depth 0xffffffff, bucket size 1.
    TreeConstruction()
    {
        m_maxDepth = 0xffffffff;
        m_maxBucketSize = 1;
    }

    /// \brief Copies both limits from \a other.
    TreeConstruction(TreeConstruction const& other)
    {
        m_maxDepth = other.m_maxDepth;
        m_maxBucketSize = other.m_maxBucketSize;
    }

    /// \brief Sets both limits explicitly.
    ///
    /// \param maxDepth      maximum depth; 0 is replaced by 0xffffffff
    /// \param maxBucketSize maximum bucket size; 0 is replaced by 1
    TreeConstruction(unsigned int maxDepth, unsigned int maxBucketSize)
    : m_maxDepth(maxDepth)
    , m_maxBucketSize(maxBucketSize)
    {
        // A zero argument means "use the default".
        if (m_maxDepth == 0) {
            m_maxDepth = 0xffffffff;
        }
        if (m_maxBucketSize == 0) {
            m_maxBucketSize = 1;
        }
    }

    /// \brief Criteria for the next level: depth limit reduced by one, same bucket size.
    ///
    /// The reduced depth goes through the two-argument constructor, so a
    /// result of 0 is mapped to 0xffffffff again.
    TreeConstruction nextDepthLevel() const
    {
        unsigned int const reducedDepth = m_maxDepth - 1;
        return TreeConstruction(reducedDepth, m_maxBucketSize);
    }

    /// \brief Maximum depth of the tree.
    unsigned int maxDepth() const
    {
        return m_maxDepth;
    }

    /// \brief Maximum bucket size.
    unsigned int maxBucketSize() const
    {
        return m_maxBucketSize;
    }

protected:
    /// Maximum depth of the tree.
    unsigned int m_maxDepth;

    /// Maximum bucket size (presumably the number of points in a leaf node).
    unsigned int m_maxBucketSize;
};

KDTree类

该类定义在<include/shark/Models/Trees/KDTree.h>中。

template <class InputT>
class KDTree : public BinaryTree<I