faiss k-means notes

Installation


conda install faiss-gpu=1.5.3  # a version whose k-means works under python=3.6

conda install -c pytorch faiss-gpu

# install the GPU version
# make sure CUDA is already installed, otherwise the CPU version is installed instead.
conda install faiss-gpu -c pytorch        # defaults to CUDA 8.0
conda install faiss-gpu cuda90 -c pytorch # for CUDA 9.0
conda install faiss-gpu cuda91 -c pytorch # for CUDA 9.1
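
A quick sanity check after installing; faiss.get_num_gpus is only present in GPU builds, so it is a convenient way to confirm which variant conda resolved:

import faiss
print(faiss.get_num_gpus())  # number of GPUs faiss can see; raises AttributeError on a CPU-only build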

Usage

Kmeans

# import faiss
import sys
sys.path.append('/home/maliqi/faiss/python/')
import faiss

# generate some toy data
import numpy as np
d = 512          # dimension
n_data = 2000
np.random.seed(0)
data = []
mu = 3
sigma = 0.1
for i in range(n_data):
    data.append(np.random.normal(mu, sigma, d))
data = np.array(data).astype('float32')  # faiss requires float32 !!!

# clustering
ncentroids = 1024
niter = 20
verbose = True
d = data.shape[1]
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(data)

# print the cluster centroids
print(kmeans.centroids)
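
After training, the kmeans object exposes an index over the centroids, which maps vectors to their nearest cluster; on a reasonably recent faiss-gpu build, a gpu=True flag runs the same clustering on GPU. A short sketch:

# assign each vector to its nearest centroid
D, I = kmeans.index.search(data, 1)  # D: squared L2 distances, I: centroid ids

# same clustering on GPU (assumes a faiss-gpu build that accepts gpu=True)
kmeans_gpu = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose, gpu=True)
kmeans_gpu.train(data)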

HNSW
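
A minimal sketch of the graph index this heading refers to, using IndexHNSWFlat (M=32 and the ef values below are typical settings, not tuned):

import numpy as np
import faiss

d = 64
xb = np.random.rand(10000, d).astype('float32')  # database vectors
xq = np.random.rand(5, d).astype('float32')      # query vectors

index = faiss.IndexHNSWFlat(d, 32)   # 32 links per node in the graph
index.hnsw.efConstruction = 40       # build-time search depth
index.add(xb)                        # HNSW requires no train() step
index.hnsw.efSearch = 16             # query-time search depth
D, I = index.search(xq, 4)           # 4 approximate nearest neighbors per query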

Vector encoding (Index types)

  • The available encodings are (from least to most compressed):
    IndexFlat: no encoding; vectors are stored without compression.
    IndexScalarQuantizer with QT_fp16: 16-bit float encoding; vectors are compressed to 16-bit floats, which may cause some loss of precision.
    IndexScalarQuantizer with QT_8bit / QT_6bit / QT_4bit: 8/6/4-bit integer encoding; vectors are quantized to 256/64/16 levels.
    IndexPQ: PQ encoding; vectors are split into sub-vectors, and each is quantized to a few bits (usually 8). See the example below.
    IndexResidual: residual encoding; vectors are quantized and progressively refined by residuals, and the codebook size can be adjusted at each quantization stage.
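
A sketch of constructing the encodings listed above through the faiss Python API (the residual index class is named differently across versions, so it is left out here):

import faiss

d = 512
index_flat = faiss.IndexFlat(d)                                            # no compression
index_fp16 = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_fp16)  # 16-bit floats
index_sq8  = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_8bit)  # 256 levels
index_pq   = faiss.IndexPQ(d, 64, 8)  # 64 sub-vectors, 8 bits each

# all quantizing indexes must be trained on representative vectors before add()
# index_pq.train(xb); index_pq.add(xb)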


Search

            for sent_feat_info in doc_feat[1:]:  # loop over the sentences of a document
                sent_id, sent, sec, text_emb = sent_feat_info

                # text_emb = text_emb.cpu().numpy()
                # faiss feature recall
                emb_text = np.array(text_emb).astype(np.float32).reshape(1, -1)  # search expects a (n, d) batch
                D, I = self.index.search(emb_text, topk)  # retrieve the top-k neighbors
                res_imgid = [self.imgidx[x] for x in I.tolist()[0]]
                res_sim = D.tolist()[0]
                res_emb = [self.data[x] for x in I.tolist()[0]]

                print(sent, sec, res_imgid)
                doc_fas.append([title, sent, sec, res_imgid, res_sim, res_emb])  # convert and store the result
            out_info.append(doc_fas)
        return out_info
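
The fragment above depends on class state (self.index, self.imgidx, self.data); stripped of that context, the core retrieval step reduces to this self-contained sketch:

import numpy as np
import faiss

d = 512
xb = np.random.rand(1000, d).astype('float32')  # e.g. image embeddings
index = faiss.IndexFlatL2(d)
index.add(xb)

query = np.random.rand(d).astype('float32')
D, I = index.search(query.reshape(1, -1), 5)    # search takes a (n, d) batch
print(I[0], D[0])                               # ids and distances of the top-5 hits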

References and more

https://faiss.ai/#

faiss documentation (Python)

https://github.com/facebookresearch/faiss/wiki/

faiss Chinese wiki (basic tutorials)

https://github.com/CCCBora/faiss_chat

  • FAISS Chat: chat with your local database!
  • https://www.bilibili.com/video/BV11k4y1W7ZE/?


https://github.com/liqima/faiss_note/blob/master/3.Basics%20%E5%9F%BA%E7%A1%80%E6%A8%A1%E5%9D%97.ipynb

# faiss implementation of k-means
clus = faiss.Clustering(d, nmb_clusters)

Faiss building blocks: clustering, PCA, quantization

Runs kmeans on 1 GPU.

Running-on-GPUs
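
From the Running-on-GPUs wiki page, moving an existing CPU index onto a single GPU is one call (sketch, assuming device 0):

import faiss

d = 512
res = faiss.StandardGpuResources()                     # allocate GPU resources
cpu_index = faiss.IndexFlatL2(d)
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)  # copy the index to GPU 0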
Similar code (a usage snippet, followed by the full script):

deepcluster = Kmeans(args.n_clusters, knn=1)
clustering_loss, _ = deepcluster.cluster(features, verbose=True)
return deepcluster.centroids, clustering_loss
The full script:

import time
import faiss
import torch
import numpy as np


def compute_features(dataloader, model, args, use_predict_fn=False, concat_vid=False, keep_dim=False):
    cargs = args
    if cargs.verbose:
        print('Compute features')
    start = time.time()
    model.eval()
    features = []
    # discard the label information in the dataloader
    for i, data_arr in enumerate(dataloader):
        pose_data = data_arr[0]
        with torch.no_grad():
            data = pose_data.to(args.device)
            if use_predict_fn:
                pose_features = model.predict(data).data.to('cpu', non_blocking=True).numpy()
                pose_features = pose_features.reshape(-1, 1)
            else:
                pose_features = model(data)
                if isinstance(pose_features, (list, tuple)):
                    pose_features = pose_features[0]
                pose_features = pose_features.data.to('cpu', non_blocking=True).numpy()

        if concat_vid:  # Concatenate each clip's video features to its pose embedding
            vid_features = data_arr[1]
            batch_features = np.concatenate([pose_features, vid_features], axis=1)
        else:
            batch_features = pose_features

        features.append(batch_features)

        # measure elapsed time
        batch_time = time.time() - start
        start = time.time()

        if cargs.verbose and (i % 200) == 0:
            print('{0} / {1}\t'
                  'Time: {batch_time:.3f}'
                  .format(i, len(dataloader), batch_time=batch_time))
    features = np.concatenate(features)
    if keep_dim:
        n, c, t, v = data.size()
        features = features.reshape(features.shape[0], -1, v)
    return features


class Kmeans:
    def __init__(self, k, knn=1):
        self.k = k
        self.knn = knn
        self.centroids = None
        self.labels = None
        self.images_lists = []
        self.dists = None

    def cluster(self, data,  verbose=False):
        """Performs k-means clustering.
            Args:
            x_data (np.array N * dim): data to cluster
        """
        start = time.time()
        # cluster the data
        labels, loss, self.centroids, self.dists = run_kmeans(data, self.k, verbose, self.knn)
        self.labels = labels
        self.images_lists = [[] for i in range(self.k)]
        for i in range(len(data)):
            self.images_lists[labels[i]].append(i)

        if verbose:
            print('k-means time: {0:.0f} s'.format(time.time() - start))

        return loss, self.dists


def run_kmeans(x, nmb_clusters, verbose=False, knn=1):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    nmb_clusters = int(nmb_clusters)
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 30
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    dists, labels = index.search(x, knn)
    losses = faiss.vector_to_array(clus.obj)  # per-iteration objective (clus.obj in older faiss; newer releases expose clus.iteration_stats instead)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(nmb_clusters, d)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in labels], losses[-1], centroids, dists



if __name__ == "__main__":
    data = np.random.randn(560900, 1152).astype('float32')  # faiss requires float32
    labels, loss, centroids, dists = run_kmeans(data, 10, verbose=True, knn=1)
    print(123)
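
The Kmeans wrapper defined above can be driven the same way; a short usage sketch, reusing the float32 data array from the __main__ block:

    km = Kmeans(k=10, knn=1)
    loss, dists = km.cluster(data, verbose=True)
    print(km.centroids.shape)        # (10, 1152)
    print(len(km.images_lists[0]))   # number of points assigned to cluster 0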

https://pypi.tuna.tsinghua.edu.cn/simple/faiss-gpu/



video: Milvus Q&A #12: new release, a Postgres vector search plugin, easier to use than Faiss?

Python faiss.Clustering code examples

  • Classification is a Strong Baseline for Deep Metric Learning (BMVC '19)
    Deep metric learning aims to learn a function that maps image pixels to embedding feature vectors modeling the similarity between images. The two main applications of metric learning are content-based image retrieval and face verification. For retrieval tasks, most current state-of-the-art (SOTA) approaches rely on non-parametric, triplet-based training. For face verification, however, recent SOTA approaches have adopted classification-based parametric training. In this paper, we study the effectiveness of classification-based approaches on image retrieval datasets. We evaluate several standard retrieval and clustering datasets, such as CAR-196, CUB-200-2011, Stanford Online Products, and In-Shop, and find that the classification-based approach is competitive across different feature dimensions and base feature networks. We further analyze the performance effects of subsampling classes for scalable classification-based training, and of binarization, which enables efficient storage and computation for practical applications.

  • kmeans_pytorch cannot run properly when there are many data points and many cluster centers

  • https://github.com/DeMoriarty/fast_pytorch_kmeans

# pip install fast-pytorch-kmeans


from fast_pytorch_kmeans import KMeans
import torch
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

kmeans = KMeans(n_clusters=8, mode='euclidean', verbose=1)
x = torch.randn(100000, 64, device='cuda').float()
labels = kmeans.fit_predict(x)


# Traceback (most recent call last):
#   File "/---/ubuntu/---/---/gepc-master/fasttest.py", line 12, in <module>
#     labels = kmeans.fit_predict(x)
#   File "/home/ubuntu/anaconda3/envs/ngepc/lib/python3.6/site-packages/fast_pytorch_kmeans/kmeans.py", line 215, in fit_predict
#     self.num_points_in_clusters[matched_clusters] += counts
# RuntimeError: expected device cuda:0 and dtype Float but got device cuda:0 and dtype Long
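
The traceback shows num_points_in_clusters (a float tensor) being incremented in place with a Long counts tensor; newer releases of fast_pytorch_kmeans appear to have fixed this, so upgrading the package (or casting counts to float inside kmeans.py) is the usual workaround.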