# 测试使用annoy案例1
from annoy import AnnoyIndex
import random
# Demo 1: fill an Annoy index with random vectors, save it, reload it, query it.
f = 40  # length of every item vector stored in the index
t = AnnoyIndex(f, 'angular')
for item_id in range(1000):
    # One vector of f independent standard-normal samples per item id.
    vector = [random.gauss(0, 1) for _ in range(f)]
    t.add_item(item_id, vector)

t.build(10)  # 10 trees
t.save('test.ann')

# ...

# Reload the saved index into a fresh object and query it.
u = AnnoyIndex(f, 'angular')
u.load('test.ann')  # super fast, will just mmap the file
print(u.get_nns_by_item(0, 10))  # the 10 nearest neighbours of item 0
# 结果如下
# 测试使用annoy案例2
# Demo 2: index the same five 3-D points under two metrics and compare the
# neighbour lists of item 0.
points = [
    [1, 0, 0],
    [0, 1, 0],
    [2, 0, 0],
    [2.5, 0, 0],
    [1, 0, 0.5],
]

a = AnnoyIndex(3, 'euclidean')
b = AnnoyIndex(3, 'angular')

# Fill a completely first, then b, each with item ids 0..4.
for index in (a, b):
    for item_id, vector in enumerate(points):
        index.add_item(item_id, vector)

a.build(1)  # a single tree per index
b.build(1)

print(a.get_nns_by_item(0, 4))  # Euclidean index
print(b.get_nns_by_item(0, 4))  # angular index
# 普通实现的MNN测试(欧式距离)
#主要的判断方式如下
import numpy as np
from sklearn.neighbors import NearestNeighbors
#import pyreadr
import numpy as np
# from sklearn.neighbors import NearestNeighbors
# x=pyreadr.read_r("x.RData")
# y=pyreadr.read_r("y.RData")
# x=x["x"].values
# y=y["y"].values
# Fix the seed so both toy datasets are identical on every run.
np.random.seed(1)
# Two 10 x 2 standard-normal point clouds; x is drawn first, then y.
x, y = (np.random.randn(10, 2) for _ in range(2))
#x ndarray,y ndarray
# return ndarray ,先默认是欧式距离,
def findMNN(x, y, k=10):
    """Return the mutual-nearest-neighbour (MNN) pairs between ``x`` and ``y``.

    A pair ``(i, j)`` is mutual when ``y[j]`` is among the ``k`` nearest rows
    of ``y`` to ``x[i]`` *and* ``x[i]`` is among the ``k`` nearest rows of
    ``x`` to ``y[j]``.  ``NearestNeighbors`` is used with its default metric.

    Parameters
    ----------
    x, y : numpy.ndarray
        2-D arrays holding one sample per row.
    k : int, default 10
        Neighbourhood size.  It is capped at the number of rows of the
        dataset being searched, so a ``k`` larger than a dataset means
        "every row is a neighbour" instead of an error.

    Returns
    -------
    numpy.ndarray
        One row per mutual pair: the coordinates of ``x[i]`` followed by the
        coordinates of ``y[j]``.  Pairs are ordered by ``i``, then by the
        neighbour rank of ``j``.

    The ``(i, j)`` index pairs are also printed.
    """
    # NearestNeighbors rejects n_neighbors larger than the fitted sample
    # count, so cap k separately for each search direction.
    k_in_y = min(k, y.shape[0])
    k_in_x = min(k, x.shape[0])

    # Row i: indices into y of the neighbours of x[i].
    x_to_y = NearestNeighbors(n_neighbors=k_in_y).fit(y).kneighbors(
        x, return_distance=False)
    # Row j: indices into x of the neighbours of y[j].  This is queried once
    # for the whole of y instead of once per row of x.
    y_to_x = NearestNeighbors(n_neighbors=k_in_x).fit(x).kneighbors(
        y, return_distance=False)
    reverse_hits = [set(row.tolist()) for row in y_to_x]

    mnnset = []
    for x_idx, y_candidates in enumerate(x_to_y):
        for y_idx in y_candidates:
            if x_idx in reverse_hits[y_idx]:
                mnnset.append([x_idx, y_idx])

    # reshape keeps the (n_pairs, 2) layout even when the list is empty.
    mnn_indice = np.array(mnnset, dtype=np.intp).reshape(-1, 2)
    print(mnn_indice)  # the index pairs; only the coordinates are returned
    return np.concatenate((x[mnn_indice[:, 0]], y[mnn_indice[:, 1]]), axis=1)
#首先返回的集合mnn pair的所有集合
#给定anchor_sample,positive_sample,集合x,y,判断这一个样本对是否是mnn pair,我觉得是不是应该直接改成向量的
def quary_xy_mnn(anchor_sample, positive_sample, set_x, set_y, k=20):
    """Tell whether ``(anchor_sample, positive_sample)`` is an MNN pair.

    The MNN pairs of ``set_x`` and ``set_y`` are recomputed with ``findMNN``
    on every call, then searched for a row equal to the two samples laid end
    to end.  Returns ``True`` when such a row exists, ``False`` otherwise.
    """
    mnn_rows = findMNN(set_x, set_y, k=k)
    # Both samples are 1-D, so they are joined along axis 0 into one flat row.
    candidate = np.concatenate((anchor_sample, positive_sample), axis=0)
    # A row matches only if every one of its entries equals the candidate's.
    row_matches = (mnn_rows == candidate).all(1)
    return any(row_matches)
#使用案例,
#i=0;
#j=0;
#print(quary_xy_mnn(x[i],y[j],x,y))
# MNN pairs of the toy data with k=2.  The search here uses the Euclidean
# distance and works as expected; other distances are untested at this point.
mnnset = findMNN(x, y, k=2)
#print(mnnset)

# Overview figure: x in red, y in green, every point labelled with its row index.
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 12))
for points, colour in ((x, "r"), (y, "g")):
    plt.scatter(points[:, 0], points[:, 1], color=colour, s=100)
for points in (x, y):
    for i in range(points.shape[0]):
        plt.text(points[i, 0], points[i, 1], str(i), fontsize=20)
def connectpoints(x, y, p1, p2):
    """Highlight two points and join them with a line on the current plot.

    ``x`` and ``y`` are indexable coordinate sequences; ``p1`` and ``p2``
    select the two points.  Point ``p1`` is drawn in red, ``p2`` in green.
    Only a single pair is drawn per call.
    """
    xs = [x[p1], x[p2]]
    ys = [y[p1], y[p2]]
    plt.scatter(xs[0], ys[0], color='r', s=150)
    plt.scatter(xs[1], ys[1], color="g", s=150)
    plt.plot(xs, ys)
# Draw one segment per MNN pair.  For the 2-D data used here, columns 0-1 of
# each mnnset row hold the x-sample's coordinates and columns 2-3 the
# y-sample's.
for pair_row in mnnset:
    # Dedicated names keep the datasets x and y intact for later use.
    link_xs = [pair_row[0], pair_row[2]]  # horizontal coordinates of both ends
    link_ys = [pair_row[1], pair_row[3]]  # vertical coordinates of both ends
    connectpoints(link_xs, link_ys, 0, 1)
#plt.axis('equal')
plt.show()
# 使用annoy计算(欧式距离)
#from typing import Final
from annoy import AnnoyIndex
# Default number of neighbours; used as the default `knn` of nn, nn_approx and mnn.
KNN = 1
#Exact nearest neighbors search.
def nn(ds1, ds2, knn=KNN, metric_p=2):
    """Exact search: the ``knn`` nearest rows of ``ds2`` for every row of ``ds1``.

    ``metric_p`` is forwarded to ``NearestNeighbors`` as its ``p`` argument.
    Returns a set of ``(ds1_row_index, ds2_row_index)`` tuples.
    """
    searcher = NearestNeighbors(n_neighbors=knn, p=metric_p)
    searcher.fit(ds2)
    neighbours = searcher.kneighbors(ds1, return_distance=False)
    return {
        (query_idx, hit)
        for query_idx, hits in zip(range(ds1.shape[0]), neighbours)
        for hit in hits
    }
# Approximate nearest neighbors using Annoy (a forest of random-projection trees).
def nn_approx(ds1, ds2, knn=KNN, metric='euclidean', n_trees=10):
    """Approximate search: nearest rows of ``ds2`` for every row of ``ds1``.

    An Annoy index is built over the rows of ``ds2`` and queried once per row
    of ``ds1``.

    Parameters
    ----------
    ds1, ds2 : numpy.ndarray
        2-D arrays with one sample per row and the same number of columns.
    knn : int
        Number of neighbours requested per query.
    metric : str
        Annoy metric name.
    n_trees : int
        Number of trees passed to ``AnnoyIndex.build``.

    Returns
    -------
    set of (int, int)
        ``(ds1_row_index, ds2_row_index)`` tuples.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for item_id in range(ds2.shape[0]):
        index.add_item(item_id, ds2[item_id, :])
    index.build(n_trees)

    # Search and match in one pass.  The per-query result lists are consumed
    # directly rather than stacked into an array, so the function does not
    # depend on every query returning the same number of neighbours.
    match = set()
    for query_id in range(ds1.shape[0]):
        hits = index.get_nns_by_vector(ds1[query_id, :], knn, search_k=-1)
        match.update((query_id, hit) for hit in hits)
    return match
# Find mutual nearest neighbors.
def mnn(ds1, ds2, knn=KNN, approx=True):
    """Find the mutual nearest neighbours between ``ds1`` and ``ds2``.

    Neighbours are searched in both directions, with ``nn_approx`` when
    ``approx`` is truthy and with ``nn`` otherwise.  Returns the set of
    ``(ds1_row_index, ds2_row_index)`` tuples found in both directions.
    """
    search = nn_approx if approx else nn

    # First direction: ds1 rows looking into ds2.
    forward = search(ds1, ds2, knn=knn)
    # Second direction: ds2 rows looking into ds1.
    backward = search(ds2, ds1, knn=knn)

    # Flip the second direction into (ds1, ds2) order before intersecting.
    flipped = {(ds1_idx, ds2_idx) for ds2_idx, ds1_idx in backward}
    return forward & flipped
# Reproducible toy data: two 20 x 2 standard-normal clouds, x drawn before y.
np.random.seed(1)
x, y = (np.random.randn(20, 2) for _ in range(2))

# Same MNN search twice: approximate (Annoy) and exact (scikit-learn).
mnn_appro = mnn(x, y, knn=2, approx=True)
mnn_precise = mnn(x, y, knn=2, approx=False)

print("=========近似寻找一共找到{}个========".format(len(mnn_appro)))
print(mnn_appro)
print("=========精确寻找一共找到{}个========".format(len(mnn_precise)))
print(mnn_precise)
# Size of the overlap between the two results.
print("=========两种一共交集有{}个==========".format(len(mnn_appro & mnn_precise)))
#print(len(mnn_appro & mnn_precise))
# The results agree; the open question is how to also return the distances.
# 测试cosine距离
#主要的判断方式如下
import numpy as np
from sklearn.neighbors import NearestNeighbors
#import pyreadr
import numpy as np
# from sklearn.neighbors import NearestNeighbors
# x=pyreadr.read_r("x.RData")
# y=pyreadr.read_r("y.RData")
# x=x["x"].values
# y=y["y"].values
# Fix the seed so both toy datasets are identical on every run.
np.random.seed(1)
# Two 100 x 2 standard-normal point clouds; x is drawn first, then y.
x, y = (np.random.randn(100, 2) for _ in range(2))
#x ndarray,y ndarray
# Returns an ndarray plus the set of index pairs; this variant uses the cosine distance.
def findMNN(x, y, k=100):
    """Return the cosine-distance mutual-nearest-neighbour pairs of ``x`` and ``y``.

    A pair ``(i, j)`` is mutual when ``y[j]`` is among the ``k`` nearest rows
    of ``y`` to ``x[i]`` *and* ``x[i]`` is among the ``k`` nearest rows of
    ``x`` to ``y[j]``, both measured with ``metric="cosine"``.

    Parameters
    ----------
    x, y : numpy.ndarray
        2-D arrays holding one sample per row.
    k : int, default 100
        Neighbourhood size.  It is capped at the number of rows of the
        dataset being searched, so a ``k`` larger than a dataset means
        "every row is a neighbour" instead of an error.

    Returns
    -------
    tuple
        ``(pair_matrix, mnn_self_imple)``.  ``pair_matrix`` has one row per
        mutual pair: the coordinates of ``x[i]`` followed by those of
        ``y[j]``.  ``mnn_self_imple`` is the set of ``(i, j)`` index tuples.
    """
    # NearestNeighbors rejects n_neighbors larger than the fitted sample
    # count, so cap k separately for each search direction.
    k_in_y = min(k, y.shape[0])
    k_in_x = min(k, x.shape[0])

    # Row i: indices into y of the neighbours of x[i].
    x_to_y = NearestNeighbors(n_neighbors=k_in_y, metric="cosine").fit(y).kneighbors(
        x, return_distance=False)
    # Row j: indices into x of the neighbours of y[j].  This is queried once
    # for the whole of y instead of once per row of x.
    y_to_x = NearestNeighbors(n_neighbors=k_in_x, metric="cosine").fit(x).kneighbors(
        y, return_distance=False)
    reverse_hits = [set(row.tolist()) for row in y_to_x]

    mnnset = []
    mnn_self_imple = set()
    for x_idx, y_candidates in enumerate(x_to_y):
        for y_idx in y_candidates:
            if x_idx in reverse_hits[y_idx]:
                mnnset.append([x_idx, y_idx])
                mnn_self_imple.add((x_idx, y_idx))

    # reshape keeps the (n_pairs, 2) layout even when the list is empty.
    mnn_indice = np.array(mnnset, dtype=np.intp).reshape(-1, 2)
    pair_matrix = np.concatenate(
        (x[mnn_indice[:, 0]], y[mnn_indice[:, 1]]), axis=1)
    return (pair_matrix, mnn_self_imple)
#首先返回的集合mnn pair的所有集合
#给定anchor_sample,positive_sample,集合x,y,判断这一个样本对是否是mnn pair,我觉得是不是应该直接改成向量的
def quary_xy_mnn(anchor_sample, positive_sample, set_x, set_y, k=20):
    """Tell whether ``(anchor_sample, positive_sample)`` is an MNN pair.

    The MNN pairs of ``set_x`` and ``set_y`` are recomputed with ``findMNN``
    on every call, then searched for a row equal to the two samples laid end
    to end.

    Parameters
    ----------
    anchor_sample, positive_sample : numpy.ndarray
        1-D samples taken from ``set_x`` and ``set_y`` respectively.
    set_x, set_y : numpy.ndarray
        The two datasets, one sample per row.
    k : int, default 20
        Neighbourhood size handed to ``findMNN``.

    Returns
    -------
    bool
        ``True`` when the pair appears among the MNN pairs.
    """
    res = findMNN(set_x, set_y, k=k)
    # findMNN may return (pair_matrix, index_pair_set); only the matrix can
    # be compared row by row against the query.
    if isinstance(res, tuple):
        res = res[0]
    # Both samples are 1-D, so they are joined along axis 0 into one flat row.
    temp_test = np.concatenate((anchor_sample, positive_sample), axis=0)
    return any((res == temp_test).all(1))
#使用案例,
#i=0;
#j=0;
#print(quary_xy_mnn(x[i],y[j],x,y))
# MNN pairs of the toy data with k=2, searched with the cosine distance.
# mnnset holds the paired coordinates, mnn_self_imple the (i, j) index pairs.
mnnset, mnn_self_imple = findMNN(x, y, k=2)
#print(mnnset)

# Overview figure: x in red, y in green, every point labelled with its row index.
import matplotlib.pyplot as plt

plt.figure(figsize=(18, 12))
for points, colour in ((x, "r"), (y, "g")):
    plt.scatter(points[:, 0], points[:, 1], color=colour, s=100)
for points in (x, y):
    for i in range(points.shape[0]):
        plt.text(points[i, 0], points[i, 1], str(i), fontsize=20)
def connectpoints(x, y, p1, p2):
    """Highlight two points and join them with a line on the current plot.

    ``x`` and ``y`` are indexable coordinate sequences; ``p1`` and ``p2``
    select the two points.  Point ``p1`` is drawn in red, ``p2`` in green.
    Only a single pair is drawn per call.
    """
    xs = [x[p1], x[p2]]
    ys = [y[p1], y[p2]]
    plt.scatter(xs[0], ys[0], color='r', s=150)
    plt.scatter(xs[1], ys[1], color="g", s=150)
    plt.plot(xs, ys)
# Draw one segment per MNN pair.  For the 2-D data used here, columns 0-1 of
# each mnnset row hold the x-sample's coordinates and columns 2-3 the
# y-sample's.
for pair_row in mnnset:
    # Dedicated names keep the datasets x and y intact for later use.
    link_xs = [pair_row[0], pair_row[2]]  # horizontal coordinates of both ends
    link_ys = [pair_row[1], pair_row[3]]  # vertical coordinates of both ends
    connectpoints(link_xs, link_ys, 0, 1)
#plt.axis('equal')
plt.show()
# 结果如下
# 使用annoy来计算cosine距离
#from typing import Final
from annoy import AnnoyIndex
# Default number of neighbours; used as the default `knn` of nn, nn_approx and mnn.
KNN = 1
#Exact nearest neighbors search.
def nn(ds1, ds2, knn=KNN, metric_p="cosine"):
    """Exact search: the ``knn`` nearest rows of ``ds2`` for every row of ``ds1``.

    ``metric_p`` is forwarded to ``NearestNeighbors`` as its ``metric``
    argument.  Returns a set of ``(ds1_row_index, ds2_row_index)`` tuples.
    """
    searcher = NearestNeighbors(n_neighbors=knn, metric=metric_p)
    searcher.fit(ds2)
    neighbours = searcher.kneighbors(ds1, return_distance=False)
    return {
        (query_idx, hit)
        for query_idx, hits in zip(range(ds1.shape[0]), neighbours)
        for hit in hits
    }
# Approximate nearest neighbors using Annoy (a forest of random-projection trees).
def nn_approx(ds1, ds2, knn=KNN, metric='angular', n_trees=10):
    """Approximate search: nearest rows of ``ds2`` for every row of ``ds1``.

    An Annoy index is built over the rows of ``ds2`` and queried once per row
    of ``ds1``.

    Parameters
    ----------
    ds1, ds2 : numpy.ndarray
        2-D arrays with one sample per row and the same number of columns.
    knn : int
        Number of neighbours requested per query.
    metric : str
        Annoy metric name.
    n_trees : int
        Number of trees passed to ``AnnoyIndex.build``.

    Returns
    -------
    set of (int, int)
        ``(ds1_row_index, ds2_row_index)`` tuples.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for item_id in range(ds2.shape[0]):
        index.add_item(item_id, ds2[item_id, :])
    index.build(n_trees)

    # Search and match in one pass.  The per-query result lists are consumed
    # directly rather than stacked into an array, so the function does not
    # depend on every query returning the same number of neighbours.
    match = set()
    for query_id in range(ds1.shape[0]):
        hits = index.get_nns_by_vector(ds1[query_id, :], knn, search_k=-1)
        match.update((query_id, hit) for hit in hits)
    return match
# Find mutual nearest neighbors.
def mnn(ds1, ds2, knn=KNN, approx=True):
    """Find the mutual nearest neighbours between ``ds1`` and ``ds2``.

    Neighbours are searched in both directions, with ``nn_approx`` when
    ``approx`` is truthy and with ``nn`` otherwise.  Returns the set of
    ``(ds1_row_index, ds2_row_index)`` tuples found in both directions.
    """
    search = nn_approx if approx else nn

    # First direction: ds1 rows looking into ds2.
    forward = search(ds1, ds2, knn=knn)
    # Second direction: ds2 rows looking into ds1.
    backward = search(ds2, ds1, knn=knn)

    # Flip the second direction into (ds1, ds2) order before intersecting.
    flipped = {(ds1_idx, ds2_idx) for ds2_idx, ds1_idx in backward}
    return forward & flipped
# Reproducible toy data: two 100 x 2 standard-normal clouds, x drawn before y.
np.random.seed(1)
x, y = (np.random.randn(100, 2) for _ in range(2))

# Same MNN search twice: approximate (Annoy) and exact (scikit-learn).
mnn_appro = mnn(x, y, knn=2, approx=True)
mnn_precise = mnn(x, y, knn=2, approx=False)

print("=========近似寻找一共找到{}个========".format(len(mnn_appro)))
print(mnn_appro)
print("=========精确寻找一共找到{}个========".format(len(mnn_precise)))
print(mnn_precise)
# mnn_self_imple is the index-pair set produced by the findMNN call earlier
# in this section.
print("=========自己实现一共找到{}个========".format(len(mnn_self_imple)))
print(mnn_self_imple)
# Size of the overlap between all three results.
print("=========三种一共交集有{}个==========".format(len(mnn_appro & mnn_precise & mnn_self_imple)))
#print(len(mnn_appro & mnn_precise))
# The results agree; the open question is how to also return the distances.