1. 说明
Faiss中有大量不同的Index适用于各种不同的场景,但是步骤无非都是三步:训练、构造索引和查询。所以无论研究哪种Index类型,都可以遵循上述三个步骤做具体研究。
由于我实际项目中使用的是IndexIVFPQ进行图搜索,所以这里针对这一类型进行分析。
1.1 IndexIVFPQ关系图
2. 类
以下代码基于Faiss-1.6.1版本。
2.1 基类 Index
这是所有索引的抽象基类,它定义了索引的基本接口,其中大多是虚函数,会在具体的索引子类中重新实现。
// Index.h
/** Abstract base class of all indexes.
 *
 * Declares the common state (dimension, vector count, metric, flags) and the
 * interface that concrete index types override. add(), search() and reset()
 * are pure virtual; the other virtual functions are only declared here.
 */
struct Index {
using idx_t = int64_t; // integer type commonly used inside faiss
using component_t = float;
using distance_t = float;
int d; ///< vector dimension
idx_t ntotal; ///< number of vectors in the index, i.e. those present after add
bool verbose; ///< when true, a lot of log output is printed; initialized to false by the constructor
bool is_trained; ///< flag telling whether the index has been trained; initialized to true by the constructor
MetricType metric_type; ///< metric used for search (L1, L2, ...); the constructor defaults to METRIC_L2
float metric_arg; ///< argument of the metric type
// Constructor: stores d and metric, and sets ntotal and metric_arg to 0
explicit Index (idx_t d = 0, MetricType metric = METRIC_L2):
d(d),
ntotal(0),
verbose(false),
is_trained(true),
metric_type (metric),
metric_arg(0) {}
virtual ~Index (); // destructor
// Train on a set of vectors. n is the number of training vectors; x points to the training set, n * d floats
virtual void train(idx_t n, const float* x);
// Add a set of vectors to the index. n is the number of vectors; x points to the vectors, n * d floats
virtual void add (idx_t n, const float *x) = 0;
// Same as add, but the vectors are stored under the ids given in xids instead of the default sequential ids; not supported by every index
virtual void add_with_ids (idx_t n, const float * x, const idx_t *xids);
/** Query the k nearest neighbors of n vectors of dimension d
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
* @param distances output pairwise distances, size n*k
*/
virtual void search (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels) const = 0;
/** Query all vectors whose distance to the query vectors is below radius
* @param x input vectors to search, size n * d
* @param radius search radius
* @param result result table
*/
virtual void range_search (idx_t n, const float *x, float radius,
RangeSearchResult *result) const;
/** Same purpose as search, but only returns the labels of the k nearest neighbors
* @param x input vectors to search, size n * d
* @param labels output labels of the NNs, size n*k
*/
void assign (idx_t n, const float * x, idx_t * labels, idx_t k = 1);
// Remove all vectors from the index
virtual void reset() = 0;
// Remove the entries selected by sel from the index. Returns a size_t; per the
// Faiss documentation this is the number of elements removed (not an id)
virtual size_t remove_ids (const IDSelector & sel);
/** Reconstruct a stored vector
* @param key id of the vector to reconstruct
* @param recons reconstructed vector (size d)
*/
virtual void reconstruct (idx_t key, float * recons) const;
/** Reconstruct the vectors i0 to i0 + ni - 1
* @param recons reconstructed vector (size ni * d)
*/
virtual void reconstruct_n (idx_t i0, idx_t ni, float *recons) const;
/** Similar to search, but also reconstructs the vectors of the search results
* @param recons reconstructed vectors size (n, k, d)
**/
virtual void search_and_reconstruct (idx_t n, const float *x, idx_t k,
float *distances, idx_t *labels,
float *recons) const;
/** Compute a residual vector
* @param x input vector, size d
* @param residual output residual vector, size d
* @param key encoded index, as returned by search and assign
*/
virtual void compute_residual (const float * x,
float * residual, idx_t key) const;
/** Compute residual vectors; equivalent to calling compute_residual() n times
* @param n number of vectors
* @param xs input vectors, size (n x d)
* @param residuals output residual vectors, size (n x d)
* @param keys encoded index, as returned by search and assign
*/
virtual void compute_residual_n (idx_t n, const float* xs,
float* residuals,
const idx_t* keys) const;
/** Get a DistanceComputer object for this kind of index **/
virtual DistanceComputer * get_distance_computer() const;
/* The standalone codec interface */
/** Size of the produced codes, in bytes */
virtual size_t sa_code_size () const;
/** Encode a set of vectors
* @param n number of vectors
* @param x input vectors, size n * d
* @param bytes output encoded vectors, size n * sa_code_size()
*/
virtual void sa_encode (idx_t n, const float *x,
uint8_t *bytes) const;
/** Decode a set of vectors
* @param n number of vectors
* @param bytes input encoded vectors, size n * sa_code_size()
* @param x output vectors, size n * d
*/
virtual void sa_decode (idx_t n, const uint8_t *bytes,
float *x) const;
};
2.2 Level1Quantizer 类
/** First-level (coarse) quantizer state used by IndexIVF.
 *
 * Groups the fields that are independent of how the lists are stored,
 * in particular everything related to training.
 */
struct Level1Quantizer {
    /// quantizer that maps vectors to inverted lists; declared as Index*,
    /// but in practice it points to some concrete index class
    Index * quantizer;
    size_t nlist;  ///< number of clusters

    /**
     * = 0: use the quantizer as index in a kmeans training
     * = 1: just pass on the training set to the train() of the quantizer
     * = 2: kmeans training on a flat index + add the centroids to the quantizer
     */
    char quantizer_trains_alone;
    bool own_fields;  ///< whether object owns the quantizer

    ClusteringParameters cp;  ///< to override default clustering params
    Index *clustering_index;  ///< to override index used during clustering

    /// Trains the quantizer and calls train_residual to train sub-quantizers
    void train_q1 (size_t n, const float *x, bool verbose,
                   MetricType metric_type);

    /// compute the number of bytes required to store list ids
    size_t coarse_code_size () const;

    /// write list_no into code (presumably using coarse_code_size() bytes -- confirm)
    void encode_listno (Index::idx_t list_no, uint8_t *code) const;

    /// read a list number back from code; counterpart of encode_listno
    Index::idx_t decode_listno (const uint8_t *code) const;

    Level1Quantizer (Index * quantizer, size_t nlist);

    Level1Quantizer ();

    ~Level1Quantizer ();
};  // the terminating ';' is required after a struct definition
这个类封装IndexIVF所需的量化器类。这个类隔离与列表存储无关的字段,尤其是训练。
Faiss建立了3个level的量化编码,level1和level2是必须的,level3是可选的。
Level1Quantizer 是其中的level1量化编码。
2.3 IndexIVF 类
基于倒排文件(inverted file)的索引。
/** Index based on an inverted file (IVF).
 *
 * Combines the generic Index interface with the Level1Quantizer state and
 * keeps the encoded vectors in inverted lists. This is an excerpt: members
 * not discussed in the surrounding text are omitted (see the marker below).
 */
struct IndexIVF: Index, Level1Quantizer {
    InvertedLists *invlists;  ///< the inverted lists holding the stored entries
    bool own_invlists;        ///< NOTE(review): presumably whether this object owns invlists -- confirm

    size_t code_size;  ///< code size per vector in bytes

    size_t nprobe;     ///< number of probes at query time
    size_t max_codes;  ///< max nb of codes to visit to do a query

    int parallel_mode;  ///< parallelization mode; values are documented in Faiss' IndexIVF.h

    /// map for direct access to the elements. Enables reconstruct().
    bool maintain_direct_map;
    std::vector <idx_t> direct_map;

    IndexIVF (Index * quantizer, size_t d,
              size_t nlist, size_t code_size,
              MetricType metric = METRIC_L2);

    /// training hook for the residual (sub-)quantizers; Level1Quantizer::train_q1
    /// is documented as calling it
    virtual void train_residual (idx_t n, const float *x);

    /// search variant that receives the coarse assignment (assign) and the
    /// corresponding centroid distances (centroid_dis) from the caller
    virtual void search_preassigned (idx_t n, const float *x, idx_t k,
                                     const idx_t *assign,
                                     const float *centroid_dis,
                                     float *distances, idx_t *labels,
                                     bool store_pairs,
                                     const IVFSearchParameters *params=nullptr
                                     ) const;

    /// range-search variant that receives the coarse keys and distances
    /// from the caller
    void range_search_preassigned(idx_t nx, const float *x, float radius,
                                  const idx_t *keys, const float *coarse_dis,
                                  RangeSearchResult *result) const;

    /// get a scanner for this index (store_pairs means ignore labels)
    virtual InvertedListScanner *get_InvertedListScanner (
        bool store_pairs=false) const;

    /// reconstruct the vector stored at position offset of list list_no
    virtual void reconstruct_from_offset (int64_t list_no, int64_t offset,
                                          float* recons) const;

    void check_compatible_for_merge (const IndexIVF &other) const;

    virtual void merge_from (IndexIVF &other, idx_t add_id);

    /** copy a subset of the entries index to the other index */
    virtual void copy_subset_to (IndexIVF & other, int subset_type,
                                 idx_t a1, idx_t a2) const;

    ~IndexIVF() override;

    /// size of inverted list list_no, forwarded to invlists
    size_t get_list_size (size_t list_no) const
    { return invlists->list_size(list_no); }

    /** initialize a direct map */
    void make_direct_map (bool new_maintain_direct_map=true);

    // ... (further members omitted in this excerpt)

    IndexIVF ();
};  // the terminating ';' is required after a struct definition
在倒排文件中,量化器(Index实例)为要添加的每个向量提供量化索引。量化索引映射到一个列表(又名倒排列表或posting list),在该列表中存储向量的ID。
倒排列表仅在训练后才会使用到,如果没有在外部设置,则将自动使用ArrayInvertedLists。
在搜索时,查询向量也会被量化,并且只从nlist个聚类中选择nprobe个聚类进行搜索,以缩短搜索时间。
2.4 IndexIVFPQ 类
IndexIVFPQ会应用到K-means聚类算法、PCA降维和PQ乘积量化等算法对向量集进行压缩、编码。
它是采用乘积量化器(Product Quantizer)编码的倒排文件索引:每个残差向量被编码为一个乘积量化码。
struct IndexIVFPQ: IndexIVF {
bool by_residual; ///< Encode residual or plain vector?
ProductQuantizer pq; ///< produces the codes
bool do_polysemous_training; ///< reorder PQ centroids after training?
PolysemousTraining *polysemous_training; ///< if NULL, use default
// search-time parameters
size_t scan_table_threshold; ///< use table computation or on-the-fly?
int polysemous_ht; ///< Hamming thresh for polysemous filtering
precomputed_table; ///< if by_residual, build precompute tables
static size_t precomputed_table_max_bytes;
std::vector <float> precomputed_table;
IndexIVFPQ (
Index * quantizer, size_t d, size_t nlist,
size_t M, size_t nbits_per_idx);
...
/// trains the product quantizer
void train_residual(idx_t n, const float* x) override;
/// same as train_residual, also output 2nd level residuals
void train_residual_o (idx_t n, const float *x, float *residuals_2);
void reconstruct_from_offset (int64_t list_no, int64_t offset,
float* recons) const override;
size_t find_duplicates (idx_t *ids, size_t *lims) const;
// map a vector to a binary code knowning the index
void encode (idx_t key, const float * x, uint8_t * code) const;
void encode_multiple (size_t n, idx_t *keys,
const float * x, uint8_t * codes,
bool compute_keys = false) const;
/// inverse of encode_multiple
void decode_multiple (size_t n, const idx_t *keys,
const uint8_t * xcodes, float * x) const;
InvertedListScanner *get_InvertedListScanner (bool store_pairs)
const override;
/// build precomputed table
void precompute_table ();
IndexIVFPQ ();
}
3. 其他相关类
在构造IndexIVFPQ对象时,我传递给IndexIVF -> Level1Quantizer -> quantizer的具体类型是IndexFlatL2,所以这里列一下此类的定义。
IndexFlatL2关系图如下图:
3.1 IndexFlat 类
Flat索引只是将向量简单地编码为固定大小的代码,并将它们存储在 ntotal * code_size 的数组中,不对向量数据进行压缩或折叠等操作。这类索引继承自基类Index,存储完整向量并执行穷举(暴力)搜索。
// IndexFlat.h
/** Flat index derived from Index that keeps the database vectors in xb.
 * Excerpt: the "..." lines mark members omitted from this listing.
 */
struct IndexFlat: Index {
std::vector<float> xb; /// database vectors, size ntotal * d
/// construct for dimension d; the metric defaults to METRIC_L2
explicit IndexFlat (idx_t d, MetricType metric = METRIC_L2);
...
/** Compute distances between the query vectors and a given subset of vectors
* @param x query vectors, size n * d
* @param labels indices of the vectors that should be compared
* for each query vector, size n * k
* @param distances
* corresponding output distances, size n * k
*/
void compute_distance_subset (
idx_t n,
const float *x,
idx_t k,
float *distances,
const idx_t *labels) const;
/// default constructor with an empty body
IndexFlat () {}
...
};
3.2 IndexFlatL2 类
/** IndexFlat specialization that fixes the metric to METRIC_L2.
 *
 * Adds no members of its own; it only supplies constructors.
 */
struct IndexFlatL2 : IndexFlat {
    /// Default constructor with an empty body.
    IndexFlatL2 () {}

    /// Build a flat index of dimension d, forwarding METRIC_L2 to IndexFlat.
    explicit IndexFlatL2 (idx_t d)
        : IndexFlat (d, METRIC_L2)
    {}
};
这个类继承自IndexFlat,基本上是完全使用了IndexFlat的函数和方法,只是声明了一个新的IndexFlatL2类的构造函数。这个类表示精确计算L2距离。