Accelerating comparison and lookup in face recognition: matrix blocking and nmslib

Matrix blocking:

First, a quick note on the dot product. For two n-dimensional vectors a and b it is defined as a·b = a1*b1 + a2*b2 + … + an*bn.

In fact, this dot product is exactly the matrix product of a 1*n row matrix with an n*1 column matrix, so dot products can always be recast as matrix multiplication, just as deep-learning frameworks implement convolution as matrix multiplication.

To be clear, for a single dot product this conversion gains nothing over computing it directly. The benefit appears when a large number of dot products has to be evaluated: stacking the vectors as the rows of two matrices turns all of them into one matrix multiplication (see the sketch after the next paragraph).

Next, an optimization of the norm computation. Suppose there are m vectors a and n vectors b, and every a has to be compared with every b by cosine distance. Computed pair by pair, each of the m*n comparisons divides by the norm of a and the norm of b, so the norms are evaluated m*n times. If instead each a and each b is L2-normalized up front, only m+n normalizations are needed; after normalization every norm is 1, so the cosine reduces to a plain dot product.
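The sketch below is a minimal illustration of both ideas using Eigen (the same library used in the code later in this post). The function and matrix names are mine, and it assumes the features are stored one per row in float matrices:

#include <Eigen/Dense>

// All pairwise cosine similarities in a single matrix multiplication.
// Q: m x d query features (one per row), G: n x d gallery features.
// Result: S(i, j) = cosine(Q.row(i), G.row(j)), an m x n matrix.
Eigen::MatrixXf cosine_matrix(const Eigen::MatrixXf &Q, const Eigen::MatrixXf &G)
{
	Eigen::MatrixXf Qn = Q.rowwise().normalized();   // m normalizations
	Eigen::MatrixXf Gn = G.rowwise().normalized();   // n normalizations
	return Qn * Gn.transpose();                      // one GEMM instead of m*n separate dot products
}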

 

With that background in place, on to the main topic.

Popular frameworks for accelerating matrix multiplication include multi-threaded OpenMP, Eigen (backed by MKL), Armadillo (backed by OpenBLAS), and cuBLAS (cublasSgemm) on the GPU.

I benchmarked the speedups myself. Note that the cuBLAS timings include copying the inputs from CPU to GPU and the result back from GPU to CPU; if the copy-in/copy-out time is excluded, the GPU is without question the fastest option.

 

Based on these results, I wrote a general-purpose acceleration interface that blocks the matrices internally, so it can handle very large inputs. In short: as long as the data fits in host memory, it can be passed straight into the function for multiplication. The interface computes every pairwise dot product between the rows of A and the rows of B; viewed as a row-major B_ROW x A_ROW matrix, the output C equals B * A^T.
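The listing depends on a few headers, macros, and constants that the original post does not show. Below is a minimal set of assumed definitions so the code compiles; the values of alpha, beta, and OMPEIGEN_CHOICE, as well as the macro bodies, are my guesses rather than the author's originals.

#include <iostream>
#include <cmath>
#include <cstring>
#include <cstdlib>
#include <string>
#include <chrono>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <omp.h>
#include <Eigen/Dense>

using std::cout;
using std::endl;
using std::string;
using Eigen::MatrixXf;

// GEMM scalars: cuBLAS computes C = alpha * op(A) * op(B) + beta * C;
// a plain product is assumed here.
static const float alpha = 1.0f;
static const float beta = 0.0f;

// Row-count threshold for switching from the OpenMP loop to Eigen.
// The author's actual value is not given; 64 is only a placeholder.
#define OMPEIGEN_CHOICE 64

// Simple CUDA error check (stand-in for the author's macro).
#define CUDA_SAFE_CALL(call)                                               \
	do {                                                                   \
		cudaError_t err = (call);                                          \
		if (err != cudaSuccess)                                            \
			cout << "CUDA error: " << cudaGetErrorString(err) << endl;     \
	} while (0)

// Timing macros used in matrix_mul_NN (stand-ins for the author's originals).
static std::chrono::steady_clock::time_point __tic_t;
#define __TIC__() (__tic_t = std::chrono::steady_clock::now())
#define __TOC__() (cout << "elapsed "                                      \
	<< std::chrono::duration_cast<std::chrono::milliseconds>(              \
	       std::chrono::steady_clock::now() - __tic_t).count()             \
	<< " ms" << endl)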

// Computes all pairwise dot products between the rows of A (A_ROW x A_COL)
// and the rows of B (B_ROW x B_COL, with B_COL == A_COL).
// The output C is a B_ROW x A_ROW row-major matrix: C[b * A_ROW + a] = dot(A_a, B_b).
// mode selects GPU (cuBLAS) or CPU (OpenMP / Eigen); when A has more than
// maxRow rows it is processed in row blocks of at most maxRow rows.
int matrix_mul(float *A, int A_ROW, int A_COL, float *B, int B_ROW, int B_COL, float *C, string mode, int maxRow)
{
	if (mode == "GPU" || mode == "gpu")
	{
		cublasHandle_t handle;
		cublasStatus_t status = cublasCreate(&handle);
		if (status != CUBLAS_STATUS_SUCCESS)
		{
			if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
				cout << "CUBLAS 对象实例化出错" << endl;
			}
			return EXIT_FAILURE;
		}

			
		if (A_ROW > maxRow)
		{
			// A is too tall for one pass: process it in row blocks of maxRow rows.
			// CC collects the result in cuBLAS (column-major) layout; BB is B
			// rearranged into column-major order so cuBLAS can consume it directly.
			float *CC = (float*)malloc(sizeof(float)*A_ROW*B_ROW);
			float *BB = (float*)malloc(sizeof(float)*B_COL*B_ROW);
			for (int it = 0; it < B_ROW; it++)
			{
				for (int itt = 0; itt < B_COL; itt++)
					BB[B_ROW*itt + it] = B[B_COL*it + itt];
			}



			int ranknum = ceil((float)A_ROW / (float)maxRow);   // number of row blocks of A
			float *dev_B;
			cudaMalloc((void**)&dev_B, B_COL*B_ROW * sizeof(float));
			CUDA_SAFE_CALL(cudaMemcpy(dev_B, BB, B_COL*B_ROW * sizeof(float), cudaMemcpyHostToDevice));



			for (int i = 0; i < ranknum; i++)
			{

				float *dev_A, *dev_C;
				int lastROW;   // rows in this block: maxRow, or fewer for the final block
				if (i < ranknum - 1)
				{
					lastROW = maxRow;
				}
				else
				{
					lastROW = (A_ROW - (maxRow*i));
				}


				cudaMalloc((void**)&dev_A, lastROW*A_COL * sizeof(float));
				cudaMalloc((void**)&dev_C, lastROW*B_ROW * sizeof(float));
				CUDA_SAFE_CALL(cudaMemcpy(dev_A, A + maxRow*i*A_COL, lastROW*A_COL * sizeof(float), cudaMemcpyHostToDevice));
				CUDA_SAFE_CALL(cudaMemset(dev_C, 0, lastROW*B_ROW * sizeof(float)));

				// cuBLAS is column-major: dev_B (ld = B_ROW) reads as B, and the
				// row-major A block (ld = A_COL) reads as A_block^T, so this call
				// computes the (B_ROW x lastROW) column-major product B * A_block^T.
				cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, B_ROW, lastROW, A_COL, &alpha, dev_B, B_ROW, dev_A, A_COL, &beta, dev_C, B_ROW);
				cudaMemcpy(CC + i*maxRow*B_ROW, dev_C, lastROW*B_ROW * sizeof(float), cudaMemcpyDeviceToHost);


				cudaFree(dev_A);
				cudaFree(dev_C);

			}
			// CC is column-major (one column of length B_ROW per row of A);
			// transpose it into C's row-major layout C[b * A_ROW + a].
			for (int it = 0; it < B_ROW; it++)
			{
				for (int itt = 0; itt < A_ROW; itt++)
					C[A_ROW*it + itt] = CC[B_ROW*itt + it];
			}
			free(CC);
			free(BB);
			cudaFree(dev_B);

		}
		else
		{


			// A fits in a single pass: copy A and the column-major copy of B to
			// the GPU, run one GEMM, and copy the result back.
			float *dev_A, *dev_B, *dev_C;
			cudaMalloc((void**)&dev_A, A_ROW*A_COL * sizeof(float));
			cudaMalloc((void**)&dev_B, B_COL*B_ROW * sizeof(float));
			cudaMalloc((void**)&dev_C, A_ROW*B_ROW * sizeof(float));

			float *CC = (float*)malloc(sizeof(float)*A_ROW*B_ROW);
			float *BB = (float*)malloc(sizeof(float)*B_COL*B_ROW);
			for (int it = 0; it < B_ROW; it++)
			{
				for (int itt = 0; itt < B_COL; itt++)
				{
					BB[B_ROW*itt + it] = B[B_COL*it + itt];
				}
						
			}
			CUDA_SAFE_CALL(cudaMemcpy(dev_A, A, A_ROW*A_COL * sizeof(float), cudaMemcpyHostToDevice));
			CUDA_SAFE_CALL(cudaMemcpy(dev_B, BB, B_COL*B_ROW * sizeof(float), cudaMemcpyHostToDevice));
			CUDA_SAFE_CALL(cudaMemset(dev_C, 0, A_ROW*B_ROW * sizeof(float)));

			// Same column-major trick as above: computes B * A^T (B_ROW x A_ROW) into dev_C.
			cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, B_ROW, A_ROW, A_COL, &alpha, dev_B, B_ROW, dev_A, A_COL, &beta, dev_C, B_ROW);

			cudaMemcpy(CC, dev_C, A_ROW*B_ROW * sizeof(float), cudaMemcpyDeviceToHost);

			// Transpose the column-major result into C's row-major layout.
			for (int it = 0; it < B_ROW; it++)
			{
				for (int itt = 0; itt < A_ROW; itt++)
					C[A_ROW*it + itt] = CC[B_ROW*itt + it];
			}
			free(BB);
			free(CC);
			cudaFree(dev_A);
			cudaFree(dev_B);
			cudaFree(dev_C);

		}


		
		cublasDestroy(handle);

	}
	else if (mode == "CPU" || mode == "cpu")
	{


		// CPU path: a plain OpenMP loop when there are few query (B) rows,
		// Eigen otherwise; A is processed in row blocks when it exceeds maxRow rows.
		if (A_ROW > maxRow)
		{
			if (B_ROW < OMPEIGEN_CHOICE)
			{
				// openMP: with few B rows a parallel dot-product loop is sufficient
				for (int i = 0; i < B_ROW; i++)
				{
#pragma omp parallel for
					for (int j = 0; j < A_ROW; j++)
					{
						double result = 0.0;
						for (int k = 0; k < A_COL; k++)
							result += A[j*A_COL + k] * B[i*A_COL + k];
						C[A_ROW*i + j] = result;
					}

				}

			}else
			{
				// Eigen: process A in row blocks of maxRow rows, multiply each block
				// against B, and collect the block scores in CC_tmp (one column of
				// length B_ROW per row of A), to be transposed into C at the end.
				int ranknumA = ceil((float)A_ROW / (float)maxRow);
				float *CC_tmp = (float*)malloc(sizeof(float)*B_ROW*A_ROW);
				for (int jj = 0; jj < ranknumA; jj++)
				{
					int lastROWA;
					if (jj < ranknumA - 1)
					{
						lastROWA = maxRow;
					}
					else
					{
						lastROWA = (A_ROW - (maxRow*jj));
					}

					MatrixXf featuremat(lastROWA, A_COL);
					for (int row = 0; row < lastROWA; row++)
						for (int col = 0; col < A_COL; col++)
						{
							featuremat(row, col) = A[row*A_COL + col + jj*maxRow*A_COL];
						}
					MatrixXf vectormat(B_ROW, B_COL);
					for (int row = 0; row < B_ROW; row++)
						for (int col = 0; col < B_COL; col++)
						{
							vectormat(row, col) = B[row*B_COL + col];
						}
					MatrixXf scoremat = vectormat*featuremat.transpose();
					for (int col = 0; col < lastROWA; col++)
						for (int row = 0; row < B_ROW; row++)
						{
							CC_tmp[col*B_ROW + row + jj*maxRow*B_ROW] = scoremat(row, col);
						}
				

				}
				// Transpose CC_tmp into C's row-major layout C[b * A_ROW + a].
				for (int it = 0; it < B_ROW; it++)
				{
					for (int itt = 0; itt < A_ROW; itt++)
					{
						C[A_ROW*it + itt] = CC_tmp[B_ROW*itt + it];
					}
				}
				free(CC_tmp);
			}

		}
		else
		{
			if (B_ROW < OMPEIGEN_CHOICE)
			{
				//openMP
				for (int i = 0; i < B_ROW; i++)
				{
#pragma omp parallel for
					for (int j = 0; j < A_ROW; j++)
					{
						double result = 0.0;
						for (int k = 0; k < A_COL; k++)
							result += A[j*A_COL + k] * B[i*A_COL + k];
						C[A_ROW*i + j] = result;
					}

				}

			}
			else
			{
				//eigen
				MatrixXf featuremat(A_ROW, A_COL);
				for (int row = 0; row < A_ROW; row++)
					for (int col = 0; col < A_COL; col++)
					{
						featuremat(row, col) = A[row*A_COL + col];
					}
				MatrixXf vectormat(B_ROW, B_COL);
				for (int row = 0; row < B_ROW; row++)
					for (int col = 0; col < B_COL; col++)
					{
						vectormat(row, col) = B[row*B_COL + col];
					}
				MatrixXf scoremat = featuremat*vectormat.transpose();
				for (int col = 0; col < B_ROW; col++)
					for (int row = 0; row < A_ROW; row++)
					{
						C[col*A_ROW + row] = scoremat(row, col);
					}

			}

		}


//armadillo
		/*arma::mat featuremat( A_ROW, A_COL);
		for (int row = 0; row < A_ROW; row++)
			for (int col = 0; col < A_COL; col++)
			{
				featuremat(row, col) = A[row*A_COL + col];
			}
		arma::mat vectormat(B_ROW, B_COL);
		for (int row = 0; row < B_ROW; row++)
			for (int col = 0; col < B_COL; col++)
			{
				vectormat(row, col) = B[row*B_COL + col];
			}
		arma::mat scoremat = featuremat*vectormat.t();
		for (int col = 0; col < B_ROW; col++)
			for (int row = 0; row < A_ROW; row++)
			{
				C[col*A_ROW + row] = scoremat(row, col);
			}*/

	}
	else
	{
		cout << "wrong mode,mode must be one of GPU,gpu,CPU,cpu" << endl;
	}
	

	return 1;
}

 

 

 

 

 

// Wrapper around matrix_mul that additionally blocks B: when B has more than
// maxRow rows it is split into row chunks, each chunk is multiplied against A,
// and each chunk's scores are written to the matching range of C.
int matrix_mul_NN(float *A, int A_ROW, int A_COL, float *B, int B_ROW, int B_COL, float *C, string mode, int maxRow)
{
	if (A_COL != B_COL)
	{
		cout << "matrix A's col is not equal to matrix B's col" << endl;
		return 0;
	}
	__TIC__();

	if (B_ROW > maxRow)
	{
		int ranknumB = ceil((float)B_ROW / (float)maxRow);   // number of row blocks of B
		for (int j = 0; j < ranknumB; j++)
		{
			int lastROWB;
			if (j < ranknumB - 1)
			{
				lastROWB = maxRow;
			}
			else
			{
				lastROWB = (B_ROW - (maxRow*j));
			}
			// Copy this block of B, multiply it against A, and write the block's
			// scores into the corresponding range of C (layout: C[b * A_ROW + a]).
			float *B_tmp = (float*)malloc(sizeof(float)*B_COL*lastROWB);
			memcpy(B_tmp, B + j*B_COL*maxRow, sizeof(float)*B_COL*lastROWB);
			float *C_tmp = (float*)malloc(sizeof(float)*A_ROW*lastROWB);
			matrix_mul(A, A_ROW, A_COL, B_tmp, lastROWB, B_COL, C_tmp, mode, maxRow);
			memcpy(C + j*A_ROW*maxRow, C_tmp, sizeof(float)*A_ROW*lastROWB);
			free(B_tmp);
			free(C_tmp);

		}
	}
	else
		{
			matrix_mul(A, A_ROW, A_COL, B, B_ROW, B_COL, C, mode, maxRow);
		}

	__TOC__();
	return 1;
}
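A minimal usage sketch follows. The feature dimension, matrix sizes, mode string, and maxRow value are hypothetical numbers chosen for illustration, not values from the original post.

int main()
{
	const int DIM = 512;           // feature dimension (hypothetical)
	const int N_GALLERY = 100000;  // rows of A: gallery features (hypothetical)
	const int N_QUERY = 16;        // rows of B: query features (hypothetical)

	// calloc so the sketch runs without reading uninitialized memory;
	// in practice, fill A and B with L2-normalized face features.
	float *A = (float*)calloc((size_t)N_GALLERY * DIM, sizeof(float));
	float *B = (float*)calloc((size_t)N_QUERY * DIM, sizeof(float));
	float *C = (float*)malloc(sizeof(float) * (size_t)N_QUERY * N_GALLERY);

	// C[q * N_GALLERY + g] = dot(gallery feature g, query feature q),
	// which is the cosine similarity when the features are normalized.
	matrix_mul_NN(A, N_GALLERY, DIM, B, N_QUERY, DIM, C, "gpu", 50000);

	free(A); free(B); free(C);
	return 0;
}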


I won't belabor the details any further. The End!

 

The nmslib library:
createIndex: builds the entire index structure.

saveIndex: saves the built index to disk.

loadIndex: loads a previously built index.

addDataPoint, addDataPointBatch: add new data points dynamically (one at a time or in a batch).

knnQuery, knnQueryBatch: find the top-N most similar items, returning the top-N indices together with their distances (or similarity scores, depending on the space).

GitHub: https://github.com/nmslib/nmslib

Documentation: https://nmslib.github.io/nmslib/api.html

Installation: pip install nmslib

# coding:utf-8
 
import numpy as np
import nmslib
import datetime
from functools import wraps
 
 
def func_execute_time(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = datetime.datetime.now()
        res = func(*args, **kwargs)
        end_time = datetime.datetime.now()
        duration_ms = (end_time - start_time).total_seconds() * 1000  # full elapsed time in ms
        print("execute function %s, elapse time %.4f ms" % (func.__name__, duration_ms))
        return res
    return wrapper
 
 
def create_indexer(vec, index_thread, m, ef):
    """
    基于数据向量构建索引
    :param vec: 原始数据向量
    :param index_thread: 线程数
    :param m: 参照官网
    :param ef: 参照官网
    :return:
    """
    index = nmslib.init(method="hnsw", space="l2")
    index.addDataPointBatch(vec)
    INDEX_TIME_PARAMS = {
        "indexThreadQty": index_thread,
        "M": m,
        "efConstruction": ef,
        "post": 2
    }
    index.createIndex(INDEX_TIME_PARAMS, print_progress=True)
    index.saveIndex("data_%d_%d_%d.hnsw" % (index_thread, m, ef))
 
 
def load_indexer(index_path, ef_search=300):
    """
    加载构建好的向量索引文件
    :param index_path: 索引文件地址
    :param ef_search: 查询结果参数
    :return:
    """
    indexer = nmslib.init(method="hnsw", space="l2")
    indexer.loadIndex(index_path)
    indexer.setQueryTimeParams({"efSearch": ef_search})
    return indexer
 

@func_execute_time
def search_vec_top_n(indexer, vecs, top_n=7, threads=4):
    """
    使用构建好的索引文件完成向量查询
    :param indexer: 索引
    :param vecs: 待查询向量
    :param top_n: 返回前top_n个查询结果
    :param threads:
    :return:
    """
    neighbours = indexer.knnQuery(vecs, k=top_n)
    return neighbours
 


 
@func_execute_time
def search_batch_vec_top_n(indexer, vecs, top_n=7, threads=4):
    """
    使用构建好的索引文件完成向量查询
    :param indexer: 索引
    :param vecs: 待查询向量
    :param top_n: 返回前top_n个查询结果
    :param threads:
    :return:
    """
    neighbours = indexer.knnQueryBatch(vecs, k=top_n, num_threads=threads)
    return neighbours
 
 
if __name__ == '__main__':
    data = np.arange(300000000).reshape(1000000, 300)  # 1,000,000 vectors of dimension 300
    print(data.shape)
    create_indexer(data, 10, 5, 300)
    indexer = load_indexer("data_10_5_300.hnsw")
 
    print(search_vec_top_n(indexer, data[0].reshape(1,-1)))
    print(search_batch_vec_top_n(indexer, data[0].reshape(1,-1)))
    print(search_batch_vec_top_n(indexer, data[:10]))

 

 

 
