Faiss(6)：IndexIVFPQ类定义

最新推荐文章于 2024-01-05 13:36:34 发布

翔底

最新推荐文章于 2024-01-05 13:36:34 发布

阅读量1.8k

点赞数

分类专栏： Faiss 文章标签： c++

本文链接：https://blog.csdn.net/rangfei/article/details/108416731

版权

Faiss 专栏收录该内容

17 篇文章 42 订阅

订阅专栏

1. 说明

Faiss中有大量不同的Index适用于各种不同的场景，但是使用步骤无非都是三步：训练、构造索引和查询。所以无论研究哪种Index类型，都可以围绕上述三个步骤做具体研究。

由于我实际项目中使用的是IndexIVFPQ进行图搜索，所以这里针对这一类型进行分析。

1.1 IndexIVFPQ关系图

在这里插入图片描述

2. 类

以下代码基于Faiss-1.6.1版本。

2.1 基类 Index

这是所有索引的抽象基类，它定义了索引的基本接口，其中大多是虚函数，会在具体的索引子类中重新定义。

// index.h
/// Abstract base struct for all indexes. It declares the common interface
/// (train / add / search / reset, ...); add(), search() and reset() are pure
/// virtual and must be provided by concrete index subclasses.
struct Index {
    using idx_t = int64_t;  // integer type used for vector ids and counts
    using component_t = float;
    using distance_t = float;

    int d;                 ///< vector dimension
    idx_t ntotal;          ///< number of vectors in the index, i.e. vectors present after add
    bool verbose;          ///< when true, verbose log output is printed; initialised to false

    bool is_trained;      /// whether the index has been trained; initialised to true

    MetricType metric_type;    /// metric used for searching (L1, L2, ...); defaults to METRIC_L2
    float metric_arg;     ///< argument of the metric type

    // Constructor: stores the dimension and metric, starts with an empty
    // index (ntotal = 0), verbose off, is_trained = true, metric_arg = 0.
    explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
                    d(d),
                    ntotal(0),
                    verbose(false),
                    is_trained(true),
                    metric_type (metric),
                    metric_arg(0) {}

    virtual ~Index ();    // virtual destructor
    
    // Train the index on a set of vectors. n is the number of training
    // vectors; x points to the training set, n * d floats
    // (n * d * sizeof(float) bytes).
    virtual void train(idx_t n, const float* x);

    // Add vectors to the index. n is the number of vectors; x points to the
    // vectors, n * d floats (n * d * sizeof(float) bytes).
    virtual void add (idx_t n, const float *x) = 0;

    // Same as add, but stores the vectors under the ids given in xids instead
    // of the default sequential ids. Not supported by every index type.
    virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);

    /** Query the k nearest neighbours of n vectors of dimension d
     * @param x           input vectors to search, size n * d
     * @param labels      output labels of the NNs, size n*k
     * @param distances   output pairwise distances, size n*k
     */
    virtual void search (idx_t n, const float *x, idx_t k,
                         float *distances, idx_t *labels) const = 0;

    /** Query all vectors whose distance to the query vector is below radius
     * @param x           input vectors to search, size n * d
     * @param radius      search radius
     * @param result      result table
     */
    virtual void range_search (idx_t n, const float *x, float radius,
                               RangeSearchResult *result) const;

    /** Same as search, but only returns the labels of the k nearest neighbours
     * @param x           input vectors to search, size n * d
     * @param labels      output labels of the NNs, size n*k
     */
    void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);

    // Remove all vectors from the index
    virtual void reset() = 0;

    // Remove the entries selected by sel from the index. Per the Faiss
    // documentation the return value is the number of elements removed
    // (a count, not an id).
    virtual size_t remove_ids (const IDSelector & sel);

    /** Reconstruct a stored vector
     * @param key         id of the vector to reconstruct
     * @param recons      reconstructed vector (size d)
     */
    virtual void reconstruct (idx_t key, float * recons) const;

    /** Reconstruct vectors i0 to i0 + ni - 1
     * @param recons      reconstructed vector (size ni * d)
     */
    virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;

    /** Similar to search, but also reconstructs the result vectors
     * @param recons      reconstructed vectors size (n, k, d)
     **/
    virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
                                         float *distances, idx_t *labels,
                                         float *recons) const;

    /** Compute a residual vector
     * @param x           input vector, size d
     * @param residual    output residual vector, size d
     * @param key         encoded index, as returned by search and assign
     */
    virtual void compute_residual (const float * x,
                                   float * residual, idx_t key) const;

    /** Compute residual vectors; equivalent to calling compute_residual()
     *  once per vector
     * @param n           number of vectors
     * @param xs          input vectors, size (n x d)
     * @param residuals   output residual vectors, size (n x d)
     * @param keys        encoded index, as returned by search and assign
     */
    virtual void compute_residual_n (idx_t n, const float* xs,
                                     float* residuals,
                                     const idx_t* keys) const;

    /** Get a DistanceComputer object for this kind of index **/
    virtual DistanceComputer * get_distance_computer() const;

    /* The standalone codec interface */

    /** Size of the produced codes, in bytes */
    virtual size_t sa_code_size () const;

    /** Encode a set of vectors
     * @param n       number of vectors
     * @param x       input vectors, size n * d
     * @param bytes   output encoded vectors, size n * sa_code_size()
     */
    virtual void sa_encode (idx_t n, const float *x,
                                  uint8_t *bytes) const;

    /** Decode a set of vectors
     * @param n       number of vectors
     * @param bytes   input encoded vectors, size n * sa_code_size()
     * @param x       output vectors, size n * d
     */
    virtual void sa_decode (idx_t n, const uint8_t *bytes,
                                    float *x) const;
};

2.2 Level1Quantizer 类

/// Holds the coarse (level-1) quantizer used by IndexIVF together with the
/// fields needed to train it, kept separate from the inverted-list storage.
struct Level1Quantizer {
    Index * quantizer;        ///< quantizer that maps vectors to inverted lists; typed as Index* but points to a concrete index subclass at run time
    size_t nlist;             ///< number of clusters

    /**
     * = 0: use the quantizer as index in a kmeans training
     * = 1: just pass on the training set to the train() of the quantizer
     * = 2: kmeans training on a flat index + add the centroids to the quantizer
     */
    char quantizer_trains_alone;
    bool own_fields;          ///< whether object owns the quantizer

    ClusteringParameters cp; ///< to override default clustering params
    Index *clustering_index; ///< to override index used during clustering

    /// Trains the quantizer and calls train_residual to train sub-quantizers
    /// @param n            number of training vectors
    /// @param x            training vectors
    /// @param verbose      verbosity flag
    /// @param metric_type  metric to train with
    void train_q1 (size_t n, const float *x, bool verbose,
                   MetricType metric_type);

    /// compute the number of bytes required to store list ids
    size_t coarse_code_size () const;
    /// write list_no into code (presumably coarse_code_size() bytes — confirm)
    void encode_listno (Index::idx_t list_no, uint8_t *code) const;
    /// read a list number back from code (counterpart of encode_listno)
    Index::idx_t decode_listno (const uint8_t *code) const;

    Level1Quantizer (Index * quantizer, size_t nlist);

    Level1Quantizer ();

    ~Level1Quantizer ();

};

这个类封装了IndexIVF所需的量化器对象，并把与倒排列表存储无关的字段（尤其是训练相关的字段）独立出来。

Faiss建立了3个level的量化编码，level1和level2是必须的，level3是可选的。

Level1Quantizer 是其中的level1量化编码。

2.3 IndexIVF 类

基于倒排文件（Inverted File）的索引。

/// Index based on an inverted file: combines the generic Index interface with
/// the coarse quantizer of Level1Quantizer and stores codes in inverted lists.
/// Only part of the member list is shown in this excerpt.
struct IndexIVF: Index, Level1Quantizer {
    InvertedLists *invlists;       ///< inverted-list storage
    bool own_invlists;             ///< presumably: whether this object owns invlists — confirm

    size_t code_size;              ///< code size per vector in bytes

    size_t nprobe;            ///< number of probes at query time
    size_t max_codes;         ///< max nb of codes to visit to do a query

    int parallel_mode;

    /// map for direct access to the elements. Enables reconstruct().
    bool maintain_direct_map;
    std::vector <idx_t> direct_map;

    /// @param quantizer  coarse quantizer, forwarded to Level1Quantizer
    /// @param d          vector dimension
    /// @param nlist      number of inverted lists
    /// @param code_size  code size per vector in bytes
    /// @param metric     metric type, METRIC_L2 by default
    IndexIVF (Index * quantizer, size_t d,
              size_t nlist, size_t code_size,
              MetricType metric = METRIC_L2);

    /// training hook for the residual encoder; overridden by subclasses
    virtual void train_residual (idx_t n, const float *x);

    /// search variant that receives the coarse assignment (assign) and the
    /// corresponding centroid distances (centroid_dis) from the caller
    virtual void search_preassigned (idx_t n, const float *x, idx_t k,
                                     const idx_t *assign,
                                     const float *centroid_dis,
                                     float *distances, idx_t *labels,
                                     bool store_pairs,
                                     const IVFSearchParameters *params=nullptr
                                     ) const;

    /// range-search counterpart of search_preassigned
    void range_search_preassigned(idx_t nx, const float *x, float radius,
                                  const idx_t *keys, const float *coarse_dis,
                                  RangeSearchResult *result) const;

    /// get a scanner for this index (store_pairs means ignore labels)
    virtual InvertedListScanner *get_InvertedListScanner (
        bool store_pairs=false) const;

    /// reconstruct the vector stored at position offset of list list_no
    virtual void reconstruct_from_offset (int64_t list_no, int64_t offset,
                                          float* recons) const;


    void check_compatible_for_merge (const IndexIVF &other) const;

    virtual void merge_from (IndexIVF &other, idx_t add_id);

    /** copy a subset of the entries index to the other index   */
    virtual void copy_subset_to (IndexIVF & other, int subset_type,
                                 idx_t a1, idx_t a2) const;

    ~IndexIVF() override;

    /// number of entries in inverted list list_no (delegates to invlists)
    size_t get_list_size (size_t list_no) const
    { return invlists->list_size(list_no); }

    /** initialize a direct map   */
    void make_direct_map (bool new_maintain_direct_map=true);
    // ... (remaining members omitted in this excerpt)

    IndexIVF ();
};

在倒排文件中，量化器（Index实例）为要添加的每个向量提供量化索引。量化索引映射到一个列表（又名倒排列表，posting list），在该列表中存储向量的ID。

倒排列表仅在训练后才会使用到，如果没有在外部设置，则将自动使用ArrayInvertedLists。

在搜索时，查询向量也会被量化，并且只从nlist个聚类中选择nprobe个聚类进行搜索，以缩短搜索时间。

2.4 IndexIVFPQ 类

IndexIVFPQ会应用K-means聚类、PCA降维和PQ（乘积量化）等算法对向量集进行压缩、编码。

采用Product Quantizer编码的倒排文件索引。每个残差向量被编码为乘积量化器的编码（code）。

struct IndexIVFPQ: IndexIVF {
    bool by_residual;              ///< Encode residual or plain vector?

    ProductQuantizer pq;           ///< produces the codes

    bool do_polysemous_training;   ///< reorder PQ centroids after training?
    PolysemousTraining *polysemous_training; ///< if NULL, use default

    // search-time parameters
    size_t scan_table_threshold;   ///< use table computation or on-the-fly?
    int polysemous_ht;             ///< Hamming thresh for polysemous filtering
    precomputed_table;     ///< if by_residual, build precompute tables
    static size_t precomputed_table_max_bytes;
    std::vector <float> precomputed_table;

    IndexIVFPQ (
            Index * quantizer, size_t d, size_t nlist,
            size_t M, size_t nbits_per_idx);

    ...
    /// trains the product quantizer
    void train_residual(idx_t n, const float* x) override;

    /// same as train_residual, also output 2nd level residuals
    void train_residual_o (idx_t n, const float *x, float *residuals_2);

    void reconstruct_from_offset (int64_t list_no, int64_t offset,
                                  float* recons) const override;

    size_t find_duplicates (idx_t *ids, size_t *lims) const;

    // map a vector to a binary code knowning the index
    void encode (idx_t key, const float * x, uint8_t * code) const;

    void encode_multiple (size_t n, idx_t *keys,
                          const float * x, uint8_t * codes,
                          bool compute_keys = false) const;

    /// inverse of encode_multiple
    void decode_multiple (size_t n, const idx_t *keys,
                          const uint8_t * xcodes, float * x) const;

    InvertedListScanner *get_InvertedListScanner (bool store_pairs)
        const override;

    /// build precomputed table
    void precompute_table ();

    IndexIVFPQ ();

}

3. 其他相关类

在构造IndexIVFPQ对象时，我传递给 IndexIVF -> Level1Quantizer -> quantizer 的具体类型是IndexFlatL2，所以这里列一下此类的定义。

IndexFlatL2关系图如下图：
在这里插入图片描述

3.1 IndexFlat 类

Flat索引只是将向量简单地编码为固定大小的编码，并将它们存储在 ntotal * code_size 的数组中，不对向量数据进行压缩或折叠等操作。这类索引继承自基类Index，存储完整向量并执行穷举搜索。

// IndexFlat.h

struct IndexFlat: Index {
    std::vector<float> xb;    /// database vectors, size ntotal * d

    explicit IndexFlat (idx_t d, MetricType metric = METRIC_L2);
    ...

    /** 根据向量子集查询距离
     * @param x       query vectors, size n * d
     * @param labels  indices of the vectors that should be compared
     *                for each query vector, size n * k
     * @param distances
     *                corresponding output distances, size n * k
     */
    void compute_distance_subset (
            idx_t n,
            const float *x,
            idx_t k,
            float *distances,
            const idx_t *labels) const;

    IndexFlat () {}
    ...
};

3.2 IndexFlatL2 类

/// Flat index fixed to the L2 metric. It adds nothing to IndexFlat except
/// its own constructors.
struct IndexFlatL2 : IndexFlat {
    /// Default constructor; relies on the default construction of IndexFlat.
    IndexFlatL2 () {}

    /// Builds a flat index of dimension d, forwarding METRIC_L2 to IndexFlat.
    explicit IndexFlatL2 (idx_t d)
        : IndexFlat (d, METRIC_L2)
    {}
};

这个类继承自IndexFlat，基本上是完全使用了IndexFlat的函数和方法，只是声明了一个新的IndexFlatL2类的构造函数。这个类表示精确计算L2距离。

翔底

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
打赏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录