Vearch(https://github.com/vearch/vearch)是一个可以在海量特征中快速检索出相似结果的弹性分布式系统。具体的介绍可以参考 Vearch 的官方文档:https://vearch.readthedocs.io/zh_CN/latest/overview.html
向量检索系统Vearch 之从零开始源码编译安装
1、安装vearch的依赖
CentOS, Ubuntu and Mac OS are all OK (recommend CentOS >= 7.2),cmake required
Go >= 1.11.2 required
Gcc >= 5 required
# faiss是vearch引擎的依赖,是必须安装的
Faiss >= v1.6.0
# RocksDB是vearch磁盘版数据的存储引擎
RocksDB == 6.2.2 (optional) .
# swig 只有在从源码编译 python sdk 的时候才需要;这里建议直接使用 pip install vearch 安装 sdk
swig >= 3
# 关于GPU,我准备单独写一篇文章介绍,这里就先略过
CUDA >= 9.0, if you want GPU support.
所以这里我们准备安装的就是 go、gcc、 faiss和rocksdb
# 首先定义一个vearch的目录,这里暂定为/home/vearch,接下来的操作都将在这个目录下进行
mkdir -p /home/vearch && cd /home/vearch
yum install golang
yum install gcc
#安装rocksdb参考https://github.com/facebook/rocksdb/blob/master/INSTALL.md
#安装faiss参考https://github.com/facebookresearch/faiss/blob/master/INSTALL.md
# 安装完rocksdb和faiss将其中的so包和include文件夹挪到对应的位置
# 创建vearch的依赖文件夹vearch_libs
mkdir vearch_libs && cd vearch_libs
# 对于rocksdb,这里使用的是rocksdb-6.2.2;克隆后需要切换到v6.2.2标签,否则编译的是master分支,生成的so版本号对不上
git clone https://github.com/facebook/rocksdb.git
cd rocksdb && git checkout v6.2.2 && make shared_lib
# 拷贝相关文件到指定文件夹
mkdir -p /home/vearch/vearch_libs/rocksdb-6.2.2-install/lib/
cp librocksdb.so librocksdb.so.6 librocksdb.so.6.2 librocksdb.so.6.2.2 /home/vearch/vearch_libs/rocksdb-6.2.2-install/lib/
cp -r include /home/vearch/vearch_libs/rocksdb-6.2.2-install/
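# 对于faiss(先回到vearch_libs目录,否则会克隆到rocksdb目录里面;并切换到v1.6.0标签,新版本已不再提供configure脚本)
cd /home/vearch/vearch_libs
git clone https://github.com/facebookresearch/faiss.git
cd faiss && git checkout v1.6.0 && ./configure --without-cuda && make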
# 拷贝相关文件到指定文件夹
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/lib
cp libfaiss.a libfaiss.so /home/vearch/vearch_libs/faiss-1.6.0-install/lib
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/
cp *.h /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/impl
cp impl/*.h /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/impl
mkdir -p /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/utils
cp utils/*.h /home/vearch/vearch_libs/faiss-1.6.0-install/include/faiss/utils
2、编译vearch
# 定义vearch的编译路径,这个最好不要改
mkdir -p /home/vearch/go/src/github/vearch
export GOPATH=/home/vearch/go
cd /home/vearch/go/src/github/vearch
git clone https://github.com/vearch/vearch.git
# 过程有点慢,需要耐心等待,或者直接下载zip包,然后本地解压
# unzip vearch_master.zip && mv vearch_master vearch && rm vearch_master.zip
cd vearch/build
# 在这里要分别编译引擎和vearch,为了方便这里写了个脚本run.sh,脚本内容就是下面这五行(去掉行首的"# ")
# export GOPATH=/home/vearch/go
# export FAISS_HOME=/home/vearch/vearch_libs/faiss-1.6.0-install/
# export ROCKSDB_HOME=/home/vearch/vearch_libs/rocksdb-6.2.2-install/
# export LD_LIBRARY_PATH=$FAISS_HOME/lib:$ROCKSDB_HOME/lib:$LD_LIBRARY_PATH
# ./build.sh
bash run.sh
如果顺利,你就完成了vearch的编译.
3、部署vearch
3.1、部署单机版vearch
1、定义部署文件夹
mkdir -p /home/vearch/deploy && cd /home/vearch/deploy
cp /home/vearch/go/src/github/vearch/vearch/build/bin/* ./
mkdir -p /home/vearch/deploy/lib && cd /home/vearch/deploy/lib
cp /home/vearch/go/src/github/vearch/vearch/build/gamma_build/libgamma.so* ./
cp /home/vearch/vearch_libs/faiss-1.6.0-install/lib/* ./
cp /home/vearch/vearch_libs/rocksdb-6.2.2-install/lib/* ./
cd /home/vearch/deploy && vim conf.toml # 回到部署目录编写配置文件,内容如下
2、编写conf.toml文件
[global]
# cluster name; a node can only join a cluster that has the same name
name = "vearch"
# path where your data is saved on disk. If you are in a production environment, you'd better set an absolute path
data = ["/home/vearch/Data/baud/datas/"]
# log path. If you are in a production environment, you'd better set an absolute path
log = "/home/vearch/Data/baud/logs/"
# default log level for all modules
level = "debug"
# master <-> ps <-> router will use this key to send or receive data
signkey = "vearch"
skip_auth = true
# if you are the master, you'd better set all the config for router and ps here; router and ps can then simply use the default config
[[masters]]
# name machine name for cluster
name = "master1"
# ip or domain
address = "127.0.0.1"
# api port for http server
api_port = 8817
# port for etcd server
etcd_port = 2378
# listen_peer_urls List of comma separated URLs to listen on for peer traffic.
# advertise_peer_urls List of this member's peer URLs to advertise to the rest of the cluster. The URLs needed to be a comma-separated list.
etcd_peer_port = 2390
# List of this member's client URLs to advertise to the public.
# The URLs needed to be a comma-separated list.
# advertise_client_urls AND listen_client_urls
etcd_client_port = 2370
skip_auth = true
[router]
# port for server
port = 9001
# skip auth for client visit data
skip_auth = true
[ps]
# port for server
rpc_port = 8081
# raft config begin
raft_heartbeat_port = 8898
raft_replicate_port = 8899
heartbeat-interval = 200 #ms
raft_retain_logs = 10000
raft_replica_concurrency = 1
raft_snap_concurrency = 1
编写完配置文件就可以直接运行了,再加上两个脚本
start.sh
#!/usr/bin/env bash
BasePath=$(cd `dirname $0`; pwd)
cd $BasePath
function getServiceStatusInfo {
pidFile=$1
filterTag=