矩阵分解:
首选说明一下,dot点积运算,定义n维向量的点积运算为
其实,这个点积运算就是1个1*n的行矩阵和1个n*1的列矩阵的矩阵运算。因此,完全可以将点积运算转化为矩阵乘法,就好像深度学习框架中,将卷积运算转化为矩阵乘法实现一样。
而这里需要说明的就是,如果只是单纯的计算1个点积运算,这样转化为矩阵和不做这样的转化,没有任何意义。而当我们需要做大量的这样的点积运算的时候,这样的意义就尤为凸显。
再说一下,norm求模运算的优化,假设有m个a向量,n个b向量,如果这时要m个a和n个b两两算cosine距离,每一次计算都要除以a的模和b的模,一共要算m*n次,如果,给a,b向量分别归一化,再算cosine,这样只需要m+n次,为什么呢,因为a和b的模都已经变为1。
有了上面的知识,进入下面的正题。
目前流行的矩阵加速框架有,多线程openMP,eigen(mkl),armadillo(openblas),cublas(cublasSgemm)
自己测试的加速效果如下,注意这里的Cublas的时间算上了数据从cpu到gpu的时间和结果从gpu到cpu的时间,如果不计算考进考出的时间,gpu无疑是最快的加速方案。
基于上面的结果,自己撸了一个通用的加速接口,并且在内部实现了矩阵的分解,可以进行超大矩阵的运算。一句话,只要内存放的下,都可以直接传入函数接口进行矩阵运算。
int matrix_mul(float *A, int A_ROW, int A_COL, float *B, int B_ROW, int B_COL, float *C, string mode, int maxRow)
{
if (mode == "GPU" || mode == "gpu")
{
cublasHandle_t handle;
cublasStatus_t status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
cout << "CUBLAS 对象实例化出错" << endl;
}
return EXIT_FAILURE;
}
if (A_ROW > maxRow)
{
float *CC = (float*)malloc(sizeof(float)*A_ROW*B_ROW);
float *BB = (float*)malloc(sizeof(float)*B_COL*B_ROW);
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < B_COL; itt++)
BB[B_ROW*itt + it] = B[B_COL*it + itt];
}
int ranknum = ceil((float)A_ROW / (float)maxRow);
float *dev_B;
cudaMalloc((void**)&dev_B, B_COL*B_ROW * sizeof(float));
CUDA_SAFE_CALL(cudaMemcpy(dev_B, BB, B_COL*B_ROW * sizeof(float), cudaMemcpyHostToDevice));
for (int i = 0; i < ranknum; i++)
{
float *dev_A, *dev_C;
int lastROW;
if (i < ranknum - 1)
{
lastROW = maxRow;
}
else
{
lastROW = (A_ROW - (maxRow*i));
}
cudaMalloc((void**)&dev_A, lastROW*A_COL * sizeof(float));
cudaMalloc((void**)&dev_C, lastROW*B_ROW * sizeof(float));
CUDA_SAFE_CALL(cudaMemcpy(dev_A, A + maxRow*i*A_COL, lastROW*A_COL * sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemset(dev_C, 0, lastROW*B_ROW * sizeof(float)));
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, B_ROW, lastROW, A_COL, &alpha, dev_B, B_ROW, dev_A, A_COL, &beta, dev_C, B_ROW);
cudaMemcpy(CC + i*maxRow*B_ROW, dev_C, lastROW*B_ROW * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dev_A);
cudaFree(dev_C);
}
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < A_ROW; itt++)
C[A_ROW*it + itt] = CC[B_ROW*itt + it];
}
free(CC);
free(BB);
cudaFree(dev_B);
}
else
{
float *dev_A, *dev_B, *dev_C;
cudaMalloc((void**)&dev_A, A_ROW*A_COL * sizeof(float));
cudaMalloc((void**)&dev_B, B_COL*B_ROW * sizeof(float));
cudaMalloc((void**)&dev_C, A_ROW*B_ROW * sizeof(float));
float *CC = (float*)malloc(sizeof(float)*A_ROW*B_ROW);
float *BB = (float*)malloc(sizeof(float)*B_COL*B_ROW);
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < B_COL; itt++)
{
BB[B_ROW*itt + it] = B[B_COL*it + itt];
}
}
CUDA_SAFE_CALL(cudaMemcpy(dev_A, A, A_ROW*A_COL * sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpy(dev_B, BB, B_COL*B_ROW * sizeof(float), cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemset(dev_C, 0, A_ROW*B_ROW * sizeof(float)));
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, B_ROW, A_ROW, A_COL, &alpha, dev_B, B_ROW, dev_A, A_COL, &beta, dev_C, B_ROW);
cudaMemcpy(CC, dev_C, A_ROW*B_ROW * sizeof(float), cudaMemcpyDeviceToHost);
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < A_ROW; itt++)
C[A_ROW*it + itt] = CC[B_ROW*itt + it];
}
free(BB);
free(CC);
cudaFree(dev_A);
cudaFree(dev_B);
cudaFree(dev_C);
}
cublasDestroy(handle);
}
else if (mode == "CPU" || mode == "cpu")
{
if (A_ROW > maxRow)
{
if (B_ROW < OMPEIGEN_CHOICE)
{
//openMP
for (int i = 0; i < B_ROW; i++)
{
#pragma omp parallel for
for (int j = 0; j < A_ROW; j++)
{
double result = 0.0;
for (int k = 0; k < A_COL; k++)
result += A[j*A_COL + k] * B[i*A_COL + k];
C[A_ROW*i + j] = result;
}
}
}else
{
int ranknumA = ceil((float)A_ROW / (float)maxRow);
float *CC_tmp = (float*)malloc(sizeof(float)*B_ROW*A_ROW);
for (int jj = 0; jj < ranknumA; jj++)
{
int lastROWA;
if (jj < ranknumA - 1)
{
lastROWA = maxRow;
}
else
{
lastROWA = (A_ROW - (maxRow*jj));
}
MatrixXf featuremat(lastROWA, A_COL);
for (int row = 0; row < lastROWA; row++)
for (int col = 0; col < A_COL; col++)
{
featuremat(row, col) = A[row*A_COL + col + jj*maxRow*A_COL];
}
MatrixXf vectormat(B_ROW, B_COL);
for (int row = 0; row < B_ROW; row++)
for (int col = 0; col < B_COL; col++)
{
vectormat(row, col) = B[row*B_COL + col];
}
MatrixXf scoremat = vectormat*featuremat.transpose();
for (int col = 0; col < lastROWA; col++)
for (int row = 0; row < B_ROW; row++)
{
CC_tmp[col*B_ROW + row + jj*maxRow*B_ROW] = scoremat(row, col);
}
}
for (int it = 0; it < B_ROW; it++)
{
for (int itt = 0; itt < A_ROW; itt++)
{
C[A_ROW*it + itt] = CC_tmp[B_ROW*itt + it];
}
}
free(CC_tmp);
}
}
else
{
if (B_ROW < OMPEIGEN_CHOICE)
{
//openMP
for (int i = 0; i < B_ROW; i++)
{
#pragma omp parallel for
for (int j = 0; j < A_ROW; j++)
{
double result = 0.0;
for (int k = 0; k < A_COL; k++)
result += A[j*A_COL + k] * B[i*A_COL + k];
C[A_ROW*i + j] = result;
}
}
}
else
{
//eigen
MatrixXf featuremat(A_ROW, A_COL);
for (int row = 0; row < A_ROW; row++)
for (int col = 0; col < A_COL; col++)
{
featuremat(row, col) = A[row*A_COL + col];
}
MatrixXf vectormat(B_ROW, B_COL);
for (int row = 0; row < B_ROW; row++)
for (int col = 0; col < B_COL; col++)
{
vectormat(row, col) = B[row*B_COL + col];
}
MatrixXf scoremat = featuremat*vectormat.transpose();
for (int col = 0; col < B_ROW; col++)
for (int row = 0; row < A_ROW; row++)
{
C[col*A_ROW + row] = scoremat(row, col);
}
}
}
//armadillo
/*arma::mat featuremat( A_ROW, A_COL);
for (int row = 0; row < A_ROW; row++)
for (int col = 0; col < A_COL; col++)
{
featuremat(row, col) = A[row*A_COL + col];
}
arma::mat vectormat(B_ROW, B_COL);
for (int row = 0; row < B_ROW; row++)
for (int col = 0; col < B_COL; col++)
{
vectormat(row, col) = B[row*B_COL + col];
}
arma::mat scoremat = featuremat*vectormat.t();
for (int col = 0; col < B_ROW; col++)
for (int row = 0; row < A_ROW; row++)
{
C[col*A_ROW + row] = scoremat(row, col);
}*/
}
else
{
cout << "wrong mode,mode must be one of GPU,gpu,CPU,cpu" << endl;
}
return 1;
}
int matrix_mul_NN(float *A, int A_ROW, int A_COL, float *B, int B_ROW, int B_COL, float *C, string mode, int maxRow)
{
if (A_COL != B_COL)
{
cout << "matrix A's col is not equal to matrix B's row" << endl;
return 0;
}
__TIC__();
if (B_ROW > maxRow)
{
int ranknumB = ceil((float)B_ROW / (float)maxRow);
for (int j = 0; j < ranknumB; j++)
{
int lastROWB;
if (j < ranknumB - 1)
{
lastROWB = maxRow;
}
else
{
lastROWB = (B_ROW - (maxRow*j));
}
float *B_tmp = (float*)malloc(sizeof(float)*B_COL*lastROWB);
memcpy(B_tmp, B + j*B_COL*maxRow, sizeof(float)*B_COL*lastROWB);
float *C_tmp = (float*)malloc(sizeof(float)*A_ROW*lastROWB);
matrix_mul(A, A_ROW, A_COL, B_tmp, lastROWB, B_COL, C_tmp, mode, maxRow);
memcpy(C + j*A_ROW*maxRow, C_tmp, sizeof(float)*A_ROW*lastROWB);
free(B_tmp);
free(C_tmp);
}
}
else
{
matrix_mul(A, A_ROW, A_COL, B, B_ROW, B_COL, C, mode, maxRow);
}
__TOC__();
return 1;
}
细节不再赘述,The End!
nmslib库:
createIndex,建立整个树形结构,
saveIndex,保存树形结构,
loadIndex,导入已经建好的数据结构,
addDataPoint,addDataPointBatch,动态的增加新的数据
knnQuery,knnQueryBatch,实现最相似的top N数据查找,可以返回top N的索引和相似度得分。
github:https://github.com/nmslib/nmslib
文档:https://nmslib.github.io/nmslib/api.html
安装:pip install nmslib
# coding:utf-8
import numpy as np
import nmslib
import datetime
from functools import wraps
def func_execute_time(func):
@wraps(func)
def wrapper(*args, **kwargs):
start_time = datetime.datetime.now()
res = func(*args, **kwargs)
end_time = datetime.datetime.now()
duration_time = (end_time - start_time).microseconds // 1000
print("execute function %s, elapse time %.4f ms" % (func.__name__, duration_time))
return res
return wrapper
def create_indexer(vec, index_thread, m, ef):
"""
基于数据向量构建索引
:param vec: 原始数据向量
:param index_thread: 线程数
:param m: 参照官网
:param ef: 参照官网
:return:
"""
index = nmslib.init(method="hnsw", space="l2")
index.addDataPointBatch(vec)
INDEX_TIME_PARAMS = {
"indexThreadQty": index_thread,
"M": m,
"efConstruction": ef,
"post": 2
}
index.createIndex(INDEX_TIME_PARAMS, print_progress=True)
index.saveIndex("data_%d_%d_%d.hnsw" % (index_thread, m, ef))
def load_indexer(index_path, ef_search=300):
"""
加载构建好的向量索引文件
:param index_path: 索引文件地址
:param ef_search: 查询结果参数
:return:
"""
indexer = nmslib.init(method="hnsw", space="l2")
indexer.loadIndex(index_path)
indexer.setQueryTimeParams({"efSearch": ef_search})
return indexer
@func_execute_time
def search_vec_top_n(indexer, vecs, top_n=7, threads=4):
"""
使用构建好的索引文件完成向量查询
:param indexer: 索引
:param vecs: 待查询向量
:param top_n: 返回前top_n个查询结果
:param threads:
:return:
"""
neighbours = indexer.knnQuery(vecs, k=top_n)
return neighbours
@func_execute_time
def search_batch_vec_top_n(indexer, vecs, top_n=7, threads=4):
"""
使用构建好的索引文件完成向量查询
:param indexer: 索引
:param vecs: 待查询向量
:param top_n: 返回前top_n个查询结果
:param threads:
:return:
"""
neighbours = indexer.knnQueryBatch(vecs, k=top_n, num_threads=threads)
return neighbours
if __name__ == '__main__':
data = np.arange(300000000).reshape(1000000, 300)
print(data.shape)
create_indexer(data, 10, 5, 300)
indexer = load_indexer("data_10_5_300.hnsw")
print(search_vec_top_n(indexer, data[0].reshape(1,-1)))
print(search_batch_vec_top_n(indexer, data[0].reshape(1,-1)))
print(search_batch_vec_top_n(indexer, data[:10]))