向量数据库:faiss的IndexPQ中PQ的图解+实现质心表的融合

IndexPQ

  • 一个indexPQ的简单示例:
import numpy as np
import faiss

# Minimal IndexPQ walkthrough: build random data, train a PQ index on it,
# add the data, and run a single k-nearest-neighbour query.

# Fixed seed so the generated data and query are reproducible.
np.random.seed(42)
data = np.random.random((10000, 64)).astype('float32')

# PQ parameters: m = number of sub-spaces, nbits = bits per sub-space code.
m = 8
nbits = 8

# Build the index for vectors of the data's dimensionality, fit it on the
# data, then add the same data to it.
dim = data.shape[1]
index = faiss.IndexPQ(dim, m, nbits)
index.train(data)
index.add(data)

# Number of nearest neighbours to retrieve, and one random query vector.
k = 5
query_vector = np.random.random((1, 64)).astype('float32')

# Run the query.
distances, indices = index.search(query_vector, k)

# Report the query and its results, one heading followed by one value.
for heading, value in (
    ("Query Vector:", query_vector),
    ("\nIndices of Nearest Neighbors:", indices),
    ("\nDistances to Nearest Neighbors:", distances),
):
    print(heading)
    print(value)

在这里插入图片描述

参数解释
M:number of subquantizers,输入向量被分为的片段的个数
dsub:dimensionality of each subvector,每个子聚类表的长度
ksub = $2^{n}$(n 即 nbits):number of centroids for each subquantizer,每个子聚类表的宽度

在这里插入图片描述

可通过faiss.vector_to_array(index.pq.centroids)查看质心表中的具体数值

  • index.train()之前:
    在这里插入图片描述
  • index.train()之后:
    在这里插入图片描述

实现“偷梁换柱”

import numpy as np
import faiss

# "Swap" demo: train two IndexPQ instances on different data, overwrite the
# second one's centroid table with the first one's, then show that both
# indexes answer the same query identically.

# Fixed seed so both datasets and the query are reproducible.
np.random.seed(42)
data = np.random.random((10000, 64)).astype('float32')
data2 = np.random.random((10000, 64)).astype('float32')

# PQ parameters: m = number of sub-spaces, nbits = bits per sub-space code.
m = 8
nbits = 8
dim = data.shape[1]

# Reference index, trained on `data`.
index = faiss.IndexPQ(dim, m, nbits)
index.train(data)

# Second index, trained on `data2`; its centroid table is then replaced by
# the reference index's table.
index2 = faiss.IndexPQ(dim, m, nbits)
index2.train(data2)
index2.pq.centroids = index.pq.centroids

# Both indexes store the same vectors.
index.add(data)
index2.add(data)

# Number of nearest neighbours to retrieve, and one random query vector.
k = 5
query_vector = np.random.random((1, 64)).astype('float32')

# Query each index in turn and print the same report for both.
for searcher in (index, index2):
    distances, indices = searcher.search(query_vector, k)
    for heading, value in (
        ("Query Vector:", query_vector),
        ("\nIndices of Nearest Neighbors:", indices),
        ("\nDistances to Nearest Neighbors:", distances),
    ):
        print(heading)
        print(value)
# Query Vector:
# [[0.18171448 0.34181556 0.6398858  0.292473   0.44219118 0.63791186
#   0.19401862 0.17734843 0.26126006 0.38929975 0.02442818 0.72467136
#   0.9121011  0.0601452  0.42044804 0.56506294 0.9892394  0.2520515
#   0.12554157 0.3569948  0.7176223  0.6282157  0.53028387 0.19011611
#   0.8374111  0.91366297 0.6300717  0.21906242 0.34832168 0.6042122
#   0.55216706 0.15355448 0.47739747 0.07588766 0.45951515 0.46728414
#   0.8784772  0.2502514  0.8283812  0.77515835 0.7159397  0.6975115
#   0.24739715 0.89320683 0.07678613 0.7589492  0.29475844 0.8860514
#   0.8515612  0.9372315  0.5690415  0.02019571 0.78275704 0.02964665
#   0.36082503 0.22074123 0.4638003  0.3445418  0.8347299  0.3678306
#   0.00145097 0.44658396 0.02120558 0.74333763]]

# Indices of Nearest Neighbors:
# [[1356 3975 2011 5711 4734]]

# Distances to Nearest Neighbors:
# [[5.3155017 5.561659  5.6874743 5.7380037 5.762418 ]]
# Query Vector:
# [[0.18171448 0.34181556 0.6398858  0.292473   0.44219118 0.63791186
#   0.19401862 0.17734843 0.26126006 0.38929975 0.02442818 0.72467136
#   0.9121011  0.0601452  0.42044804 0.56506294 0.9892394  0.2520515
#   0.12554157 0.3569948  0.7176223  0.6282157  0.53028387 0.19011611
#   0.8374111  0.91366297 0.6300717  0.21906242 0.34832168 0.6042122
#   0.55216706 0.15355448 0.47739747 0.07588766 0.45951515 0.46728414
#   0.8784772  0.2502514  0.8283812  0.77515835 0.7159397  0.6975115
#   0.24739715 0.89320683 0.07678613 0.7589492  0.29475844 0.8860514
#   0.8515612  0.9372315  0.5690415  0.02019571 0.78275704 0.02964665
#   0.36082503 0.22074123 0.4638003  0.3445418  0.8347299  0.3678306
#   0.00145097 0.44658396 0.02120558 0.74333763]]

# Indices of Nearest Neighbors:
# [[1356 3975 2011 5711 4734]]

# Distances to Nearest Neighbors:
# [[5.3155017 5.561659  5.6874743 5.7380037 5.762418 ]]

另一种融合

import numpy as np
import faiss

# Fusion demo: train two 8-bit IndexPQ instances on different data and merge
# their centroid tables into one (nbits + 1)-bit IndexPQ, so every
# sub-quantizer of the merged index can choose from both sets of centroids.

# PQ parameters: m = number of sub-spaces, nbits = bits per sub-space code.
m, nbits = 8, 8

# Fixed seed so both datasets and the query are reproducible.
np.random.seed(42)
data = np.random.random((10000, 64)).astype('float32')
data2 = np.random.random((10000, 64)).astype('float32')

dim = data.shape[1]

# Two independent indexes with the same PQ shape, trained on different data.
index = faiss.IndexPQ(dim, m, nbits)
index.train(data)

index2 = faiss.IndexPQ(dim, m, nbits)
index2.train(data2)

# index3 holds the fusion of index and index2. One extra bit per code doubles
# the number of centroids per sub-quantizer (2 ** (nbits + 1) == 2 * 2 ** nbits).
# The centroids learned by this train() call are overwritten just below.
index3 = faiss.IndexPQ(dim, m, nbits + 1)
index3.train(data2)

# faiss stores PQ centroids as one flat float32 array whose logical layout is
# (M, ksub, dsub). The merge therefore has to happen per sub-quantizer, along
# the ksub axis. Joining the two flat arrays end to end would instead hand
# sub-quantizer 0 of index3 the centroids of sub-quantizers 0 and 1 of
# `index`, and so on, pairing centroids with the wrong sub-spaces.
dsub = dim // m
ksub = 1 << nbits
codebook_a = faiss.vector_to_array(index.pq.centroids).reshape(m, ksub, dsub)
codebook_b = faiss.vector_to_array(index2.pq.centroids).reshape(m, ksub, dsub)
merged = np.concatenate((codebook_a, codebook_b), axis=1)  # (m, 2 * ksub, dsub)

# Guard against silently resizing index3's centroid table to a wrong size.
if merged.size != index3.pq.centroids.size():
    raise ValueError(
        f"merged codebook has {merged.size} values, "
        f"index3 expects {index3.pq.centroids.size()}"
    )
faiss.copy_array_to_vector(np.ascontiguousarray(merged).ravel(), index3.pq.centroids)
# NOTE(review): if the installed faiss version caches centroid-derived tables
# (e.g. transposed centroids or an SDC table), they may need re-syncing after
# this overwrite - confirm against the ProductQuantizer API in use.
# NOTE: any index3 results quoted from a run that joined the two flat tables
# with np.hstack will not match the output of this corrected merge.

# All three indexes store the same vectors.
index.add(data)
index2.add(data)
index3.add(data)

# Number of nearest neighbours to retrieve, and one random query vector.
query_vector = np.random.random((1, 64)).astype('float32')
k = 5

# Query index, index2 and index3 in that order; after the loop `distances`
# and `indices` hold the results from index3.
for searcher in (index, index2, index3):
    distances, indices = searcher.search(query_vector, k)
    print("\nIndices of Nearest Neighbors:")
    print(indices)
    print("\nDistances to Nearest Neighbors:")
    print(distances)


# Indices of Nearest Neighbors:
# [[1356 3975 2011 5711 4734]]

# Distances to Nearest Neighbors:
# [[5.3155017 5.561659  5.6874743 5.7380037 5.762418 ]]

# Indices of Nearest Neighbors:
# [[7929 4107  961 2473 4802]]

# Distances to Nearest Neighbors:
# [[5.2938    5.525796  5.57064   5.7225237 5.799486 ]]

# Indices of Nearest Neighbors:
# [[7929  961 8924 6034 7534]]

# Distances to Nearest Neighbors:
# [[5.272976 5.293335 5.605359 5.696639 5.707428]]
  • 我用以下代码输出了查询结果对应的向量。可以看到三个索引返回的结果只有部分相同,这是因为 index3 的质心表被修改(扩充)了,而向量的编码和距离计算都依赖于质心表。
# Print the stored vector behind each returned neighbour of the (single)
# query, together with its rank, index and reported distance.
print("\nNearest Neighbors:")
for rank, (neighbor_index, neighbor_distance) in enumerate(
    zip(indices[0], distances[0]), start=1
):
    # Look the original vector up in `data` by the id the index returned.
    neighbor_vector = data[neighbor_index]
    print(f"Neighbor {rank}: Index {neighbor_index}, Distance {neighbor_distance}, Vector {neighbor_vector}")
Nearest Neighbors:
Neighbor 1: Index 1356, Distance 5.315501689910889, Vector [0.01101539 0.6567009  0.7633245  0.11660998 0.33732712 0.8499721
 0.68720007 0.30464375 0.7422429  0.88726753 0.30932006 0.6842837
 0.09341944 0.0586829  0.58625734 0.49242404 0.8100883  0.7802833
 0.2866956  0.5122624  0.7557766  0.27095273 0.36196133 0.05986348
 0.13048859 0.6102204  0.49675122 0.16859066 0.0072812  0.16903314
 0.7496399  0.09368231 0.40244937 0.23878902 0.54939663 0.51155233
 0.98295355 0.7728801  0.9383296  0.5779583  0.73778135 0.8950766
 0.041071   0.91545016 0.21177031 0.7050161  0.7733409  0.109326
 0.9530999  0.92655915 0.65455276 0.15532914 0.5660506  0.34414485
 0.9307643  0.40665573 0.69374937 0.6370151  0.2710153  0.53549683
 0.40998015 0.37462777 0.86400545 0.13975835]
Neighbor 2: Index 3975, Distance 5.56165885925293, Vector [0.6852252  0.79311645 0.3148995  0.42644194 0.43068996 0.21183491
 0.05787511 0.9602238  0.29530123 0.68910587 0.15870273 0.708609
 0.86639625 0.4510904  0.95853996 0.23694353 0.9699781  0.77007866
 0.48550996 0.40872052 0.46613166 0.24974766 0.01244073 0.43974018
 0.6752544  0.85017306 0.81168395 0.89650345 0.00525839 0.26145405
 0.16250415 0.26849723 0.01632813 0.28710592 0.73261696 0.00488606
 0.64295805 0.55107576 0.56322    0.731344   0.98232174 0.511173
 0.18898515 0.914521   0.59773636 0.7063284  0.73153925 0.97906655
 0.7590872  0.4468203  0.8288643  0.39922148 0.6796608  0.2297831
 0.6257001  0.5006799  0.8744495  0.14236866 0.12442626 0.14521043
 0.08433475 0.96692973 0.13060258 0.35526052]

Nearest Neighbors:
Neighbor 1: Index 7929, Distance 5.293799877166748, Vector [0.96233946 0.5737502  0.59273595 0.23098944 0.5369705  0.63797593
 0.42823425 0.24575251 0.8893288  0.54502964 0.8060116  0.65886575
 0.78253627 0.36670887 0.02456753 0.9354817  0.50337505 0.10899781
 0.2375323  0.617193   0.43202353 0.2877622  0.23769969 0.46321324
 0.54506296 0.92509645 0.6306161  0.29780295 0.4218431  0.03696149
 0.3116852  0.390165   0.9549252  0.3775373  0.5620233  0.9112755
 0.1394593  0.2466888  0.9241558  0.86005247 0.7937772  0.9627047
 0.09679138 0.8644842  0.071664   0.19625679 0.01667842 0.68986166
 0.71011275 0.7705593  0.67370415 0.07858868 0.4308906  0.09075476
 0.03766147 0.18467574 0.2782387  0.37127924 0.98378307 0.48489136
 0.22696696 0.07038712 0.22267212 0.10312359]
Neighbor 2: Index 4107, Distance 5.525795936584473, Vector [0.04460111 0.5836406  0.27762762 0.75389206 0.52659243 0.88937527
 0.5552024  0.43461925 0.12575674 0.29606643 0.19991362 0.86584586
 0.53224045 0.20149525 0.34396216 0.05069733 0.5733588  0.06891397
 0.55476147 0.6457947  0.6288594  0.30873945 0.02107575 0.02294
 0.05592747 0.21791738 0.37937504 0.93809557 0.72561693 0.70872927
 0.89278466 0.8034361  0.78736126 0.15266728 0.6486509  0.34981716
 0.91982204 0.00775846 0.585377   0.775304   0.5465568  0.80789727
 0.9480229  0.705922   0.7635816  0.4436006  0.7039021  0.7166679
 0.6503457  0.8582911  0.3602512  0.37543017 0.9880262  0.28702474
 0.14523816 0.4190667  0.86942685 0.15948081 0.83756304 0.5973361
 0.0859841  0.40533915 0.47337615 0.48650718]

Nearest Neighbors:
Neighbor 1: Index 7929, Distance 5.272975921630859, Vector [0.96233946 0.5737502  0.59273595 0.23098944 0.5369705  0.63797593
 0.42823425 0.24575251 0.8893288  0.54502964 0.8060116  0.65886575
 0.78253627 0.36670887 0.02456753 0.9354817  0.50337505 0.10899781
 0.2375323  0.617193   0.43202353 0.2877622  0.23769969 0.46321324
 0.54506296 0.92509645 0.6306161  0.29780295 0.4218431  0.03696149
 0.3116852  0.390165   0.9549252  0.3775373  0.5620233  0.9112755
 0.1394593  0.2466888  0.9241558  0.86005247 0.7937772  0.9627047
 0.09679138 0.8644842  0.071664   0.19625679 0.01667842 0.68986166
 0.71011275 0.7705593  0.67370415 0.07858868 0.4308906  0.09075476
 0.03766147 0.18467574 0.2782387  0.37127924 0.98378307 0.48489136
 0.22696696 0.07038712 0.22267212 0.10312359]
Neighbor 2: Index 961, Distance 5.2933349609375, Vector [0.9621167  0.2617852  0.48362496 0.88779247 0.4134914  0.52861816
 0.16878773 0.2850794  0.5061142  0.36490148 0.0382557  0.40082905
 0.81510574 0.11605944 0.01873139 0.11870275 0.6868702  0.79464465
 0.04872655 0.8875509  0.62732536 0.5181314  0.2535919  0.37170032
 0.94697326 0.9115464  0.62546456 0.57891124 0.21054466 0.95327854
 0.7553917  0.3822597  0.81583154 0.21187466 0.21322866 0.7909612
 0.559308   0.5558353  0.5736708  0.12580682 0.34955907 0.57307965
 0.24758843 0.50400496 0.55703527 0.9428139  0.2457758  0.43935728
 0.98151124 0.18678987 0.78001946 0.17715496 0.8500466  0.48797393
 0.9721615  0.17007497 0.68792635 0.69527924 0.7188754  0.10096876
 0.288561   0.33801684 0.3242876  0.6750207 ]
  • 8
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 3
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值