https://github.com/nmslib/hnsw
https://github.com/nmslib/nmslib
pip install nmslib
上面的命令会自动安装依赖 pybind11,但编译时报错(错误信息见下文),没有安装成功。
改为安装指定版本后成功:
pip install nmslib==1.7
直接执行 pip install nmslib 时的报错信息如下:
nmslib.cc(16): fatal error C1083: 无法打开包括文件: “pybind11/pybind11.h”: No such file or directory
error: command 'C:\\Program Files (x86)\\Microsoft Visual Studio 14.0\\VC\\BIN\\x86_amd64\\cl.exe' failed with exit status 2
HNSW
使用demo:
Example Usage
import nmslib
import numpy
# Demo: build an HNSW index over random vectors, then query it.
NUM_VECTORS, DIMENSIONS = 10000, 100

# Random float32 matrix that serves as the data set to index.
data = numpy.random.randn(NUM_VECTORS, DIMENSIONS)
data = data.astype(numpy.float32)

# Fresh HNSW index over the cosine-similarity space, filled with the matrix.
index = nmslib.init(space='cosinesimil', method='hnsw')
index.addDataPointBatch(data)
build_params = {'post': 2}
index.createIndex(build_params, print_progress=True)

# Ten nearest neighbours of the first data point.
first_point = data[0]
ids, distances = index.knnQuery(first_point, k=10)

# Nearest neighbours of every data point, computed with a pool of 4 threads.
neighbours = index.knnQueryBatch(data, num_threads=4, k=10)
来源: Python bindings for NMSLIB — nmslib 2.0.5 documentation
pybind11 示例(demo)下载:
https://github.com/5455945/cpp_demo
测试效果:1 万条数据时,查询耗时可以忽略不计,创建索引大约需要 30 ms。
import datetime
import numpy as np
import logging
# logging.basicConfig(level=logging.INFO)
import nmslib
# Timing experiment: measure how long each nmslib HNSW indexing step takes on a
# small random data set. The experiment is repeated twice.
# NOTE(review): the original indentation was lost; every statement below is
# assumed to belong to the loop body -- confirm.
ONE_MICROSECOND = datetime.timedelta(microseconds=1)

for _ in range(2):
    # create a random matrix to index
    data = np.random.randn(200, 128).astype(np.float32)

    # Step 1: create the index, load the data and build it.
    time1 = datetime.datetime.now()
    # initialize a new index, using a HNSW index on Cosine Similarity
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.addDataPointBatch(data)
    index.createIndex({'post': 2}, print_progress=False)
    # Floor-dividing by a 1 us timedelta yields the TOTAL elapsed microseconds
    # as an int; `.microseconds` is only the sub-second field and silently
    # drops whole seconds.
    print("init", (datetime.datetime.now() - time1) // ONE_MICROSECOND)

    # Step 2: add the same batch a second time and rebuild the index.
    # NOTE(review): how nmslib treats the repeated batch is not visible here.
    time1 = datetime.datetime.now()
    index.addDataPointBatch(data)
    index.createIndex({'post': 2}, print_progress=False)
    print("add data", (datetime.datetime.now() - time1) // ONE_MICROSECOND)

    # Step 3: rebuild the index once more without adding any data.
    time1 = datetime.datetime.now()
    index.createIndex({'post': 2}, print_progress=False)
    print("createIndex", (datetime.datetime.now() - time1) // ONE_MICROSECOND)

    # query for the nearest neighbours of the first datapoint
    ids, distances = index.knnQuery(data[0], k=150)
    # Report how many neighbours came back, then the raw results.
    print("time2", len(ids))
    print(ids, distances)

    # get all nearest neighbours for all the datapoint
    # using a pool of 4 threads to compute
    # neighbours = index.knnQueryBatch(data, k=10, num_threads=4)
    # print(neighbours)