安装
conda install faiss-gpu=1.5.3 ## python=3.6 可以正常使用k-means的版本
conda install -c pytorch faiss-gpu
#安装gpu版本
#确保已经安装了CUDA,否则会自动安装cpu版本。
conda install faiss-gpu -c pytorch # 默认 For CUDA8.0
conda install faiss-gpu cuda90 -c pytorch # For CUDA9.0
conda install faiss-gpu cuda91 -c pytorch # For CUDA9.1
使用
Kmeans
# Import faiss (path appended for a local source build).
import sys
sys.path.append('/home/maliqi/faiss/python/')
import faiss

import numpy as np

# Synthetic data: n_data vectors of dimension d drawn from N(mu, sigma).
d = 512       # dimension
n_data = 2000
np.random.seed(0)
mu = 3
sigma = 0.1
# faiss only accepts float32 input -- note the dtype!
data = np.random.normal(mu, sigma, (n_data, d)).astype('float32')

# Clustering parameters.
ncentroids = 1024
niter = 20
verbose = True
d = data.shape[1]
# Fix: niter and verbose were defined above but never passed to Kmeans,
# so faiss silently used its defaults. Pass them through explicitly.
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(data)

# Print the cluster centroids (array of shape ncentroids x d).
print(kmeans.centroids)
HNSW
-
检索 : Approximate Nearest Neighbor NSW + HNSW https://blog.csdn.net/ResumeProject/article/details/122182837
-
Faiss中HNSW代码讲解:https://www.bilibili.com/video/BV1q8411t7YU/?
-
https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
向量编码Index
- 可用的编码方式(从压缩最弱到压缩最强):
IndexFlat:完全不编码,向量不经压缩直接存储;
IndexScalarQuantizer(QT_fp16):16 位浮点编码,向量被压缩为 16 位浮点数,可能带来少量精度损失;
IndexScalarQuantizer(QT_8bit / QT_6bit / QT_4bit):8/6/4 位整数编码,向量被量化为 256/64/16 级;
IndexPQ:PQ 编码,向量被切分为子向量,每个子向量被量化为少量比特(通常为 8 位),参见下面的示例;
IndexResidual:残差编码,向量先量化,再通过残差逐步细化;每个量化阶段可以调整码本大小。
Search
# NOTE(review): fragment of a method -- the enclosing `def`, and the
# initialization of doc_fas / out_info / title / topk, are outside this
# snippet (indentation was also lost in the paste).
for sent_feat_info in doc_feat[1:]:# iterate over sentences (doc_feat[0] presumably holds the title -- verify)
sent_id, sent, sec, text_emb = sent_feat_info
#text_emb = text_emb.cpu().numpy()
# faiss feature recall
emb_text = np.array(text_emb).astype(np.float32)
D, I = self.index.search(emb_text, topk)# search: D = distances, I = neighbor indices
res_imgid = [self.imgidx[x] for x in I.tolist()[0]]
res_sim = D.tolist()[0]
res_emb = [self.data[x] for x in I.tolist()[0]]
print (sent, sec, res_imgid)
doc_fas.append([title, sent, sec, res_imgid, res_sim, res_emb])# convert and store the results
out_info.append(doc_fas)
return out_info
参考与更多
faiss documentation python
https://github.com/facebookresearch/faiss/wiki/
https://github.com/CCCBora/faiss_chat
- FAISS Chat: 和本地数据库聊天!
- https://www.bilibili.com/video/BV11k4y1W7ZE/?
# faiss implementation of k-means
clus = faiss.Clustering(d, nmb_clusters)
Faiss building blocks: clustering, PCA, quantization
Runs kmeans on 1 GPU.
deepcluster = Kmeans(args.n_clusters, knn=1)
clustering_loss, _ = deepcluster.cluster(features, verbose=True)
return deepcluster.centroids, clustering_loss
class Kmeans:
    """Thin wrapper around `run_kmeans` that retains clustering results."""

    def __init__(self, k, knn=1):
        self.k = k              # number of clusters
        self.knn = knn          # neighbours returned per point by the index search
        self.centroids = None
        self.labels = None
        self.images_lists = []  # per-cluster lists of sample indices
        self.dists = None

    def cluster(self, data, verbose=False):
        """Performs k-means clustering.

        Args:
            data (np.array N * dim): data to cluster
        """
        t0 = time.time()
        # Cluster the data on the GPU via faiss.
        labels, loss, self.centroids, self.dists = run_kmeans(
            data, self.k, verbose, self.knn)
        self.labels = labels
        # Group sample indices by their assigned cluster.
        self.images_lists = [[] for _ in range(self.k)]
        for idx in range(len(data)):
            self.images_lists[labels[idx]].append(idx)
        if verbose:
            print('k-means time: {0:.0f} s'.format(time.time() - t0))
        return loss, self.dists
def run_kmeans(x, nmb_clusters, verbose=False, knn=1):
    """Runs kmeans on 1 GPU.

    Args:
        x: data, array of shape (n_data, dim)
        nmb_clusters (int): number of clusters
        verbose (bool): print the loss evolution when True
        knn (int): number of nearest centroids returned per sample

    Returns:
        tuple: (list of cluster ids per sample, final loss,
                centroids of shape (nmb_clusters, dim),
                distances of shape (n_data, knn))
    """
    # faiss only accepts contiguous float32 input; cast defensively so
    # callers may pass float64 (e.g. the np.random.randn output used in
    # the __main__ smoke test below).
    x = np.ascontiguousarray(x, dtype='float32')
    n_data, d = x.shape  # e.g. 560900, 1152

    # faiss implementation of k-means
    nmb_clusters = int(nmb_clusters)
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 30
    clus.max_points_per_centroid = 10000000

    # Build a flat L2 GPU index on device 0 for the assignment step.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    dists, labels = index.search(x, knn)
    # NOTE(review): `clus.obj` was removed in faiss >= 1.7 in favour of
    # clus.iteration_stats -- confirm against the installed version.
    losses = faiss.vector_to_array(clus.obj)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(nmb_clusters, d)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))
    return [int(n[0]) for n in labels], losses[-1], centroids, dists
import time
import faiss
import torch
import numpy as np
def compute_features(dataloader, model, args, use_predict_fn=False, concat_vid=False, keep_dim=False):
    """Run the model over a dataloader and collect feature vectors.

    Args:
        dataloader: iterable of batches; item 0 is the pose tensor and
            item 1 (when concat_vid) holds precomputed video features.
        model: torch module; run in eval mode with gradients disabled.
        args: namespace providing .verbose and .device.
        use_predict_fn: call model.predict(...) instead of model(...).
        concat_vid: append video features to each pose embedding.
        keep_dim: reshape the output to (N, -1, v) using the joint
            dimension of the final batch.

    Returns:
        np.ndarray of all batch features concatenated along axis 0.
    """
    cargs = args
    if cargs.verbose:
        print('Compute features')
    start = time.time()
    model.eval()
    features = []
    # Any label information in the dataloader is discarded.
    for batch_idx, data_arr in enumerate(dataloader):
        pose_data = data_arr[0]
        with torch.no_grad():
            data = pose_data.to(args.device)
            if use_predict_fn:
                pose_features = model.predict(data).data.to('cpu', non_blocking=True).numpy()
                pose_features = pose_features.reshape(-1, 1)
            else:
                pose_features = model(data)
                # Some models return (features, ...) tuples; keep item 0.
                if isinstance(pose_features, (list, tuple)):
                    pose_features = pose_features[0]
                pose_features = pose_features.data.to('cpu', non_blocking=True).numpy()
        if concat_vid:
            # Concatenate each clip's video features to its pose embedding.
            vid_features = data_arr[1]
            batch_features = np.concatenate([pose_features, vid_features], axis=1)
        else:
            batch_features = pose_features
        features.append(batch_features)
        # Measure elapsed time per batch.
        batch_time = time.time() - start
        start = time.time()
        if cargs.verbose and (batch_idx % 200) == 0:
            print('{0} / {1}\t'
                  'Time: {batch_time:.3f})'
                  .format(batch_idx, len(dataloader), batch_time=batch_time))
    features = np.concatenate(features)
    if keep_dim:
        # NOTE(review): relies on `data` from the final loop iteration and
        # assumes 4-D pose batches (n, c, t, v) -- confirm with callers.
        n, c, t, v = data.size()
        features = features.reshape(features.shape[0], -1, v)
    return features
class Kmeans:
    """Holds the configuration and outcome of a faiss GPU k-means run."""

    def __init__(self, k, knn=1):
        # k: number of clusters; knn: neighbours fetched per sample.
        self.k = k
        self.knn = knn
        self.centroids = None
        self.labels = None
        self.images_lists = []
        self.dists = None

    def cluster(self, data, verbose=False):
        """Performs k-means clustering.

        Args:
            data (np.array N * dim): data to cluster
        """
        start = time.time()
        # cluster the data
        result = run_kmeans(data, self.k, verbose, self.knn)
        labels, loss, self.centroids, self.dists = result
        self.labels = labels
        # Invert the label assignment into per-cluster index lists.
        self.images_lists = [[] for _ in range(self.k)]
        for sample_idx in range(len(data)):
            self.images_lists[labels[sample_idx]].append(sample_idx)
        if verbose:
            print('k-means time: {0:.0f} s'.format(time.time() - start))
        return loss, self.dists
def run_kmeans(x, nmb_clusters, verbose=False, knn=1):
    """Runs kmeans on 1 GPU.

    Args:
        x: data, array of shape (n_data, dim)
        nmb_clusters (int): number of clusters
        verbose (bool): print the loss evolution when True
        knn (int): number of nearest centroids returned per sample

    Returns:
        tuple: (list of cluster ids per sample, final loss,
                centroids of shape (nmb_clusters, dim),
                distances of shape (n_data, knn))
    """
    # Fix: faiss requires contiguous float32 data; the __main__ driver
    # passes np.random.randn output (float64), so cast here.
    x = np.ascontiguousarray(x, dtype='float32')
    n_data, d = x.shape

    # faiss implementation of k-means
    nmb_clusters = int(nmb_clusters)
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 30
    clus.max_points_per_centroid = 10000000

    # Flat L2 GPU index on device 0 used for centroid assignment.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    dists, labels = index.search(x, knn)
    # NOTE(review): clus.obj exists only in faiss < 1.7; newer releases
    # expose the losses via clus.iteration_stats -- confirm version.
    losses = faiss.vector_to_array(clus.obj)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(nmb_clusters, d)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))
    return [int(n[0]) for n in labels], losses[-1], centroids, dists
if __name__ == "__main__":
    # Smoke test: cluster 560900 random 1152-d vectors into 10 clusters.
    # Fix: faiss only accepts float32; np.random.randn yields float64,
    # so cast before handing the data to run_kmeans.
    data = np.random.randn(560900, 1152).astype('float32')
    labels, loss, centroids, dists = run_kmeans(data, 10, verbose=True, knn=1)
    print(123)
https://pypi.tuna.tsinghua.edu.cn/simple/faiss-gpu/
geng
video:Milvus 问答 #12:新版本、Postgres向量检索插件、比Faiss好用?
-
Classification is a Strong Baseline for Deep Metric Learning (BMVC '19)
深度度量学习旨在学习一种将图像像素映射到嵌入特征向量的函数,该特征向量对图像之间的相似性进行建模。度量学习的两个主要应用是基于内容的图像检索和人脸验证。对于检索任务,大多数当前最先进的(SOTA)方法是基于三元组的非参数化训练。然而,对于人脸验证任务,最近的SOTA方法采用了基于分类的参数化训练。在本文中,我们研究了基于分类的方法在图像检索数据集上的有效性。我们评估了几个标准检索数据集,如CAR-196,CUB-200-2011,斯坦福在线产品和In-Shop数据集,用于图像检索和聚类,并确定我们基于分类的方法在不同的特征维度和基本特征网络中具有竞争力。我们进一步深入分析了子采样类的性能效应,以实现可扩展的基于分类的训练,以及二值化的影响,从而为实际应用提供高效的存储和计算。 -
当数据很多、聚类中心也很多时,kmeans_pytorch 无法正常运行;可改用 fast_pytorch_kmeans:
# Alternative GPU k-means via the fast-pytorch-kmeans package.
# pip install fast-pytorch-kmeans
from fast_pytorch_kmeans import KMeans
import torch
import os
# Pin CUDA device enumeration to PCI bus order and use GPU 0 only.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
kmeans = KMeans(n_clusters=8, mode='euclidean', verbose=1)
x = torch.randn(100000, 64, device='cuda').float()
labels = kmeans.fit_predict(x)
# NOTE(review): on this environment (python 3.6) fit_predict raised a
# dtype mismatch inside the library; original traceback kept for reference:
# Traceback (most recent call last):
# File "/---/ubuntu/---/---/gepc-master/fasttest.py", line 12, in <module>
# labels = kmeans.fit_predict(x)
# File "/home/ubuntu/anaconda3/envs/ngepc/lib/python3.6/site-packages/fast_pytorch_kmeans/kmeans.py", line 215, in fit_predict
# self.num_points_in_clusters[matched_clusters] += counts
# RuntimeError: expected device cuda:0 and dtype Float but got device cuda:0 and dtype Long