annoy学习总结

最新推荐文章于 2024-02-01 20:00:00 发布

我的心永远是冰冰哒

最新推荐文章于 2024-02-01 20:00:00 发布

阅读量230

点赞数

文章标签：学习

本文链接：https://blog.csdn.net/qq_45759229/article/details/130471478

版权

测试使用annoy案例1

from annoy import AnnoyIndex
import random

f = 40  # length of every item vector stored in the index
t = AnnoyIndex(f, 'angular')  # index over f-dimensional vectors, angular metric
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]  # one random Gaussian vector of length f
    t.add_item(i, v)

t.build(10) # 10 trees
t.save('test.ann')

# ...

# Reload the saved index into a fresh object and query it.
u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 10)) # prints the ids of the 10 nearest neighbors of item 0

结果如下
在这里插入图片描述

测试使用annoy案例2

# Index the same five 3-D points twice, once per metric, so the neighbour
# rankings produced by 'euclidean' and 'angular' can be compared.
a = AnnoyIndex(3, 'euclidean')
b = AnnoyIndex(3, 'angular')

points = [
    [1, 0, 0],
    [0, 1, 0],
    [2, 0, 0],
    [2.5, 0, 0],
    [1, 0, 0.5],
]
for index in (a, b):
    for item_id, vector in enumerate(points):
        index.add_item(item_id, vector)

# One tree per index.
for index in (a, b):
    index.build(1)

# Ids of the 4 nearest neighbours of item 0 under each metric.
for index in (a, b):
    print(index.get_nns_by_item(0, 4))

在这里插入图片描述

普通实现的MNN测试（欧式距离）

#主要的判断方式如下
import numpy as np
from sklearn.neighbors import NearestNeighbors
#import pyreadr
import numpy as np
# from sklearn.neighbors import NearestNeighbors
# x=pyreadr.read_r("x.RData")
# y=pyreadr.read_r("y.RData")
# x=x["x"].values
# y=y["y"].values
np.random.seed(1)# fix the random seed so the generated data is reproducible
x=np.random.randn(10,2)# x: 10 samples with 2 features each
y=np.random.randn(10,2)# y: 10 samples with 2 features each
def findMNN(x,y,k=10):
    """Find the mutual nearest neighbour (MNN) pairs between ``x`` and ``y``.

    A pair ``(i, j)`` is mutual when ``y[j]`` is among the ``k`` nearest
    neighbours of ``x[i]`` within ``y`` and ``x[i]`` is among the ``k``
    nearest neighbours of ``y[j]`` within ``x``.  Distances use the default
    metric of ``NearestNeighbors`` (Euclidean).

    Args:
        x: 2-D ndarray, one sample per row.
        y: 2-D ndarray, one sample per row.
        k: number of neighbours considered in each direction.

    Returns:
        ndarray with one row per mutual pair: the features of ``x[i]``
        followed by the features of ``y[j]``.  Rows are ordered by ``i`` and
        then by the rank of ``y[j]`` among the neighbours of ``x[i]``.  The
        result has zero rows when no pair is mutual.

    Side effects:
        Prints the ``(i, j)`` index pairs.
    """
    neigh_y = NearestNeighbors(n_neighbors=k).fit(y)
    # (n_x, k): for every x sample, the indices of its k nearest y samples.
    indice_y = neigh_y.kneighbors(x, return_distance=False)
    neigh_x = NearestNeighbors(n_neighbors=k).fit(x)
    # (n_y, k): for every y sample, the indices of its k nearest x samples.
    # Queried once for all of y instead of once per x sample.
    indice_x = neigh_x.kneighbors(y, return_distance=False)

    # mutual[i, r] is True when x[i] appears among the k nearest x samples of
    # its r-th nearest y sample.
    x_ids = np.arange(x.shape[0])[:, None, None]
    mutual = (indice_x[indice_y] == x_ids).any(axis=2)
    rows, ranks = np.nonzero(mutual)  # row-major order: by i, then by rank

    # Always 2-D, even with zero pairs, so the column indexing below is safe.
    mnn_indice = np.column_stack((rows, indice_y[rows, ranks]))
    print(mnn_indice)
    return np.concatenate((x[mnn_indice[:, 0]], y[mnn_indice[:, 1]]), axis=1)

# Check whether one (anchor, positive) sample pair is an MNN pair of set_x / set_y.
def quary_xy_mnn(anchor_sample,positive_sample,set_x,set_y,k=20):
    """Return True when the two samples form an MNN pair of the two sets.

    ``anchor_sample`` and ``positive_sample`` are 1-D vectors.  They are
    joined end to end and compared against every row of the pair matrix
    returned by ``findMNN(set_x, set_y, k=k)``.
    """
    mnn_rows = findMNN(set_x, set_y, k=k)
    # Both samples are 1-D, so they can only be joined along axis 0.
    candidate = np.concatenate((anchor_sample, positive_sample), axis=0)
    row_matches = (mnn_rows == candidate).all(1)
    return any(row_matches)

# Example query:
#   print(quary_xy_mnn(x[0], y[0], x, y))

# MNN pairs for k=2, using the Euclidean search defined above.
mnnset = findMNN(x, y, k=2)

# Overview figure: x samples in red, y samples in green, every sample
# labelled with its row index.
import matplotlib.pyplot as plt
plt.figure(figsize=(18, 12))
for samples, colour in ((x, "r"), (y, "g")):
    plt.scatter(samples[:, 0], samples[:, 1], color=colour, s=100)

for samples in (x, y):
    for i, (px, py) in enumerate(samples):
        plt.text(px, py, str(i), fontsize=20)
    

def connectpoints(x,y,p1,p2):
    """Draw the points with indices ``p1`` and ``p2`` and a line joining them.

    ``x`` holds the horizontal coordinates and ``y`` the vertical ones.
    Point ``p1`` is drawn in red and point ``p2`` in green.
    """
    start_x, start_y = x[p1], y[p1]
    end_x, end_y = x[p2], y[p2]
    plt.scatter(start_x, start_y, color='r', s=150)
    plt.scatter(end_x, end_y, color="g", s=150)
    plt.plot([start_x, end_x], [start_y, end_y])

# Join every MNN pair with a line.  Each row of mnnset is read as
# [x-sample coord 0, x-sample coord 1, y-sample coord 0, y-sample coord 1].
# The coordinates are passed inline so the module-level datasets `x` and `y`
# are not overwritten by temporary 2-element lists.
for pair in mnnset:
    connectpoints([pair[0], pair[2]], [pair[1], pair[3]], 0, 1)
plt.show()

在这里插入图片描述

使用annoy计算（欧式距离）

#from typing import Final
from annoy import AnnoyIndex
KNN = 1  # default number of neighbours per query sample

# Exact nearest neighbors search.
def nn(ds1, ds2, knn=KNN, metric_p=2):
    """Return the exact k-nearest-neighbour matches from ``ds1`` into ``ds2``.

    Args:
        ds1: 2-D array of query samples, one per row.
        ds2: 2-D array of samples to search in.
        knn: number of neighbours to collect per query sample.
        metric_p: forwarded to ``NearestNeighbors`` as ``p``.

    Returns:
        Set of ``(i, j)`` tuples meaning ``ds2[j]`` is one of the ``knn``
        nearest neighbours of ``ds1[i]``.
    """
    searcher = NearestNeighbors(n_neighbors=knn, p=metric_p)
    searcher.fit(ds2)
    neighbour_ids = searcher.kneighbors(ds1, return_distance=False)

    return {
        (query_id, hit)
        for query_id, hits in zip(range(ds1.shape[0]), neighbour_ids)
        for hit in hits
    }

# Approximate nearest neighbors using an Annoy index.
def nn_approx(ds1, ds2, knn=KNN, metric='euclidean', n_trees=10):
    """Return approximate k-nearest-neighbour matches from ``ds1`` into ``ds2``.

    Args:
        ds1: 2-D array of query samples, one per row.
        ds2: 2-D array of samples to index, one per row.
        knn: number of neighbours requested per query sample.
        metric: metric name passed to ``AnnoyIndex``.
        n_trees: number of trees passed to ``AnnoyIndex.build``.

    Returns:
        Set of ``(i, j)`` tuples meaning item ``j`` of ``ds2`` was returned
        as a neighbour of ``ds1[i]``.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for item_id in range(ds2.shape[0]):
        index.add_item(item_id, ds2[item_id, :])
    index.build(n_trees)

    # Search the index and collect the matches directly.  The per-query
    # results stay plain lists: stacking them into one array would fail if
    # any query came back with a different number of ids.
    # NOTE(review): annoy presumably can return fewer than knn ids (e.g. when
    # the index holds fewer items) - confirm against the annoy docs.
    match = set()
    for query_id in range(ds1.shape[0]):
        hits = index.get_nns_by_vector(ds1[query_id, :], knn, search_k=-1)
        for hit in hits:
            match.add((query_id, hit))

    return match

# Find mutual nearest neighbors.
def mnn(ds1, ds2, knn=KNN, approx=True):
    """Return the mutual nearest-neighbour pairs between ``ds1`` and ``ds2``.

    The search runs in both directions, with ``nn_approx`` when ``approx``
    is truthy and with ``nn`` otherwise.  A pair ``(i, j)`` is kept only
    when ``ds2[j]`` was matched to ``ds1[i]`` and ``ds1[i]`` was matched to
    ``ds2[j]``.

    Returns:
        Set of ``(i, j)`` tuples indexing ``ds1`` and ``ds2`` respectively.
    """
    search = nn_approx if approx else nn

    forward = search(ds1, ds2, knn=knn)   # (ds1 index, ds2 index)
    backward = search(ds2, ds1, knn=knn)  # (ds2 index, ds1 index)

    # Flip the backward matches so both sets use (ds1 index, ds2 index).
    flipped = {(i, j) for j, i in backward}
    return forward & flipped

np.random.seed(1)# fix the random seed so the generated data is reproducible
x=np.random.randn(20,2)# x: 20 samples with 2 features each
y=np.random.randn(20,2)# y: 20 samples with 2 features each
# Same data, same knn: once through the Annoy search, once through the exact search.
mnn_appro=mnn(x,y,knn=2,approx=True)
mnn_precise=mnn(x,y,knn=2,approx=False)
print("=========近似寻找一共找到{}个========".format(len(mnn_appro)))
print(mnn_appro)
print("=========精确寻找一共找到{}个========".format(len(mnn_precise)))
print(mnn_precise)
# Size of the overlap between the approximate and the exact result.
print("=========两种一共交集有{}个==========".format(len(mnn_appro & mnn_precise)))
#print(len(mnn_appro & mnn_precise))

# Author's note: the two results agreed in the recorded run; the open question
# is how to also return the distances.

在这里插入图片描述

测试cosine距离

#主要的判断方式如下
import numpy as np
from sklearn.neighbors import NearestNeighbors
#import pyreadr
import numpy as np
# from sklearn.neighbors import NearestNeighbors
# x=pyreadr.read_r("x.RData")
# y=pyreadr.read_r("y.RData")
# x=x["x"].values
# y=y["y"].values
np.random.seed(1)# fix the random seed so the generated data is reproducible
x=np.random.randn(100,2)# x: 100 samples with 2 features each
y=np.random.randn(100,2)# y: 100 samples with 2 features each
def findMNN(x,y,k=100):
    """Find the mutual nearest neighbour (MNN) pairs under cosine distance.

    A pair ``(i, j)`` is mutual when ``y[j]`` is among the ``k`` nearest
    neighbours of ``x[i]`` within ``y`` and ``x[i]`` is among the ``k``
    nearest neighbours of ``y[j]`` within ``x``, both measured with
    ``metric="cosine"``.

    Args:
        x: 2-D ndarray, one sample per row.
        y: 2-D ndarray, one sample per row.
        k: number of neighbours considered in each direction.

    Returns:
        Tuple ``(pair_matrix, pair_set)``:

        * ``pair_matrix`` - ndarray with one row per mutual pair: the
          features of ``x[i]`` followed by the features of ``y[j]``, ordered
          by ``i`` and then by the rank of ``y[j]`` among the neighbours of
          ``x[i]``.  It has zero rows when no pair is mutual.
        * ``pair_set`` - set of the ``(i, j)`` index tuples.
    """
    neigh_y = NearestNeighbors(n_neighbors=k,metric="cosine").fit(y)
    # (n_x, k): for every x sample, the indices of its k nearest y samples.
    indice_y = neigh_y.kneighbors(x, return_distance=False)
    neigh_x = NearestNeighbors(n_neighbors=k,metric="cosine").fit(x)
    # (n_y, k): for every y sample, the indices of its k nearest x samples.
    # Queried once for all of y instead of once per x sample.
    indice_x = neigh_x.kneighbors(y, return_distance=False)

    # mutual[i, r] is True when x[i] appears among the k nearest x samples of
    # its r-th nearest y sample.
    x_ids = np.arange(x.shape[0])[:, None, None]
    mutual = (indice_x[indice_y] == x_ids).any(axis=2)
    rows, ranks = np.nonzero(mutual)  # row-major order: by i, then by rank

    # Always 2-D, even with zero pairs, so the column indexing below is safe.
    mnn_indice = np.column_stack((rows, indice_y[rows, ranks]))
    mnn_self_imple = set(map(tuple, mnn_indice.tolist()))

    pair_matrix = np.concatenate((x[mnn_indice[:, 0]], y[mnn_indice[:, 1]]), axis=1)
    return (pair_matrix, mnn_self_imple)

# Check whether one (anchor, positive) sample pair is an MNN pair of set_x / set_y.
def quary_xy_mnn(anchor_sample,positive_sample,set_x,set_y,k=20):
    """Return True when the two samples form an MNN pair of the two sets.

    ``anchor_sample`` and ``positive_sample`` are 1-D vectors.  They are
    joined end to end and compared against every row of the pair matrix
    produced by ``findMNN(set_x, set_y, k=k)``.

    Returns:
        bool: True when some row of the pair matrix equals the joined vector.
    """
    result = findMNN(set_x, set_y, k=k)
    # The findMNN defined alongside this function returns
    # (pair_matrix, pair_set).  Comparing that tuple against a vector is
    # meaningless, so only the matrix is used.  A bare matrix is accepted as
    # well, in case a findMNN that returns only the matrix is in scope.
    pair_rows = result[0] if isinstance(result, tuple) else result
    # Both samples are 1-D, so they can only be joined along axis 0.
    candidate = np.concatenate((anchor_sample, positive_sample), axis=0)
    return bool((pair_rows == candidate).all(axis=1).any())

# Example query:
#   print(quary_xy_mnn(x[0], y[0], x, y))

# MNN pairs for k=2, using the cosine-distance search defined above.
mnnset, mnn_self_imple = findMNN(x, y, k=2)

# Overview figure: x samples in red, y samples in green, every sample
# labelled with its row index.
import matplotlib.pyplot as plt
plt.figure(figsize=(18, 12))
for samples, colour in ((x, "r"), (y, "g")):
    plt.scatter(samples[:, 0], samples[:, 1], color=colour, s=100)

for samples in (x, y):
    for i, (px, py) in enumerate(samples):
        plt.text(px, py, str(i), fontsize=20)
    

def connectpoints(x,y,p1,p2):
    """Draw the points with indices ``p1`` and ``p2`` and a line joining them.

    ``x`` holds the horizontal coordinates and ``y`` the vertical ones.
    Point ``p1`` is drawn in red and point ``p2`` in green.
    """
    start_x, start_y = x[p1], y[p1]
    end_x, end_y = x[p2], y[p2]
    plt.scatter(start_x, start_y, color='r', s=150)
    plt.scatter(end_x, end_y, color="g", s=150)
    plt.plot([start_x, end_x], [start_y, end_y])

# Join every MNN pair with a line.  Each row of mnnset is read as
# [x-sample coord 0, x-sample coord 1, y-sample coord 0, y-sample coord 1].
# The coordinates are passed inline so the module-level datasets `x` and `y`
# are not overwritten by temporary 2-element lists.
for pair in mnnset:
    connectpoints([pair[0], pair[2]], [pair[1], pair[3]], 0, 1)
plt.show()

结果如下
在这里插入图片描述

使用annoy来计算cosine距离



#from typing import Final
from annoy import AnnoyIndex
KNN = 1  # default number of neighbours per query sample

# Exact nearest neighbors search.
def nn(ds1, ds2, knn=KNN, metric_p="cosine"):
    """Return the exact k-nearest-neighbour matches from ``ds1`` into ``ds2``.

    Args:
        ds1: 2-D array of query samples, one per row.
        ds2: 2-D array of samples to search in.
        knn: number of neighbours to collect per query sample.
        metric_p: forwarded to ``NearestNeighbors`` as ``metric``.

    Returns:
        Set of ``(i, j)`` tuples meaning ``ds2[j]`` is one of the ``knn``
        nearest neighbours of ``ds1[i]``.
    """
    searcher = NearestNeighbors(n_neighbors=knn, metric=metric_p)
    searcher.fit(ds2)
    neighbour_ids = searcher.kneighbors(ds1, return_distance=False)

    return {
        (query_id, hit)
        for query_id, hits in zip(range(ds1.shape[0]), neighbour_ids)
        for hit in hits
    }

# Approximate nearest neighbors using an Annoy index.
def nn_approx(ds1, ds2, knn=KNN, metric='angular', n_trees=10):
    """Return approximate k-nearest-neighbour matches from ``ds1`` into ``ds2``.

    Args:
        ds1: 2-D array of query samples, one per row.
        ds2: 2-D array of samples to index, one per row.
        knn: number of neighbours requested per query sample.
        metric: metric name passed to ``AnnoyIndex``.
        n_trees: number of trees passed to ``AnnoyIndex.build``.

    Returns:
        Set of ``(i, j)`` tuples meaning item ``j`` of ``ds2`` was returned
        as a neighbour of ``ds1[i]``.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for item_id in range(ds2.shape[0]):
        index.add_item(item_id, ds2[item_id, :])
    index.build(n_trees)

    # Search the index and collect the matches directly.  The per-query
    # results stay plain lists: stacking them into one array would fail if
    # any query came back with a different number of ids.
    # NOTE(review): annoy presumably can return fewer than knn ids (e.g. when
    # the index holds fewer items) - confirm against the annoy docs.
    match = set()
    for query_id in range(ds1.shape[0]):
        hits = index.get_nns_by_vector(ds1[query_id, :], knn, search_k=-1)
        for hit in hits:
            match.add((query_id, hit))

    return match

# Find mutual nearest neighbors.
def mnn(ds1, ds2, knn=KNN, approx=True):
    """Return the mutual nearest-neighbour pairs between ``ds1`` and ``ds2``.

    The search runs in both directions, with ``nn_approx`` when ``approx``
    is truthy and with ``nn`` otherwise.  A pair ``(i, j)`` is kept only
    when ``ds2[j]`` was matched to ``ds1[i]`` and ``ds1[i]`` was matched to
    ``ds2[j]``.

    Returns:
        Set of ``(i, j)`` tuples indexing ``ds1`` and ``ds2`` respectively.
    """
    search = nn_approx if approx else nn

    forward = search(ds1, ds2, knn=knn)   # (ds1 index, ds2 index)
    backward = search(ds2, ds1, knn=knn)  # (ds2 index, ds1 index)

    # Flip the backward matches so both sets use (ds1 index, ds2 index).
    flipped = {(i, j) for j, i in backward}
    return forward & flipped

np.random.seed(1)# fix the random seed so the generated data is reproducible
x=np.random.randn(100,2)# x: 100 samples with 2 features each
y=np.random.randn(100,2)# y: 100 samples with 2 features each
# Same data, same knn: once through the Annoy search, once through the exact search.
mnn_appro=mnn(x,y,knn=2,approx=True)
mnn_precise=mnn(x,y,knn=2,approx=False)
print("=========近似寻找一共找到{}个========".format(len(mnn_appro)))
print(mnn_appro)
print("=========精确寻找一共找到{}个========".format(len(mnn_precise)))
print(mnn_precise)
# mnn_self_imple is not computed here; it must already exist from the
# earlier findMNN(x, y, k=2) call.
print("=========自己实现一共找到{}个========".format(len(mnn_self_imple)))
print(mnn_self_imple)
# Size of the overlap between all three results.
print("=========三种一共交集有{}个==========".format(len(mnn_appro & mnn_precise & mnn_self_imple)))
#print(len(mnn_appro & mnn_precise))

# Author's note: the results agreed in the recorded run; the open question
# is how to also return the distances.

在这里插入图片描述