一些有用的官方参考
- 术语表:https://milvus.io/cn/docs/v2.0.0/glossary.md
- 布尔表达式语法规则:https://milvus.io/cn/docs/v2.0.0/boolean.md
- Field Schema (数据)支持类型:https://milvus.io/cn/docs/v2.0.0/field_schema.md
- Build an Index (索引)支持类型 No_1:https://milvus.io/cn/docs/v2.0.0/build_index.md
- Build an Index (索引)支持类型 No_2:https://milvus.io/cn/docs/v2.0.0/index_selection.md
- 从 DataFrame 生成一个 collection schema 并创建一个 collection:https://milvus.io/cn/docs/v2.0.0/collection_schema.md
- 混合搜索查询:https://milvus.io/cn/docs/v2.0.0/hybridsearch.md
- 性能优化:https://milvus.io/cn/docs/v1.1.0/performance_faq.md
存储向量 server 服务
import datetime
import pandas as pd
import random
import time
from pymilvus import (
connections,
utility,
FieldSchema, CollectionSchema, DataType,
Collection,
)
# Name of the Milvus collection that stores the item embeddings.
mil_name = "item_dssm_embedding"
# Number of "hash_<n>" partitions created for the collection.
hash_num = 64


def _parse_embedding(raw):
    """Convert one ``item_embedding`` CSV cell into a list of floats.

    A CSV cell read by pandas arrives as a single string, so iterating it
    directly would yield individual characters and ``float('.')`` would fail.
    String cells are therefore tokenised first: optional surrounding brackets
    are dropped and commas and/or whitespace are treated as separators.
    Non-string iterables are converted element by element.

    NOTE(review): the serialized format of the embedding column is not
    visible from this script - confirm it against the real CSV.
    """
    if isinstance(raw, str):
        raw = raw.strip().strip("[]()").replace(",", " ").split()
    return [float(value) for value in raw]


# The CSV has no header row, so the column names are supplied explicitly.
df = pd.read_csv(
    '/home/q/milvus_test/data/item_daily.csv',
    names=['item_name', 'item_city', 'item_index', 'item_embedding'],
)
df['item_embedding'] = df['item_embedding'].apply(_parse_embedding)

# Encode every distinct city as an integer category id, numbered in order of
# first appearance.
item_city_list = df['item_city'].drop_duplicates().tolist()
item_city_dict = {city: cate for cate, city in enumerate(item_city_list)}
df['item_city_cate'] = df['item_city'].apply(lambda x: item_city_dict[x])
# Register the "default" connection to the Milvus server on localhost.
connections.connect(alias="default", host="localhost", port="19530")
connections.list_connections()  # return value is not used

# Start from a clean state: drop the collection if a previous run left it.
if utility.has_collection(mil_name):
    utility.drop_collection(mil_name)
# Collection layout: a caller-supplied INT64 primary key, the integer city
# category, and a 128-dimensional float vector.
fields = [
    FieldSchema(
        name="item_index",
        dtype=DataType.INT64,
        is_primary=True,
        auto_id=False,
    ),
    FieldSchema(name="item_city_cate", dtype=DataType.INT64),
    FieldSchema(name="item_embedding", dtype=DataType.FLOAT_VECTOR, dim=128),
]
# The collection name is reused as the schema description.
schema = CollectionSchema(fields=fields, description=mil_name)
my_mil = Collection(name=mil_name, schema=schema)
# Create partitions hash_0 .. hash_<hash_num - 1>; each partition's
# description is the same text as its name.
for partition_no in range(hash_num):
    partition_label = 'hash_' + str(partition_no)
    my_mil.create_partition(
        partition_name=partition_label,
        description=partition_label,
    )
# Cast both id columns to int before they are sent to the INT64 fields.
for int_column in ('item_index', 'item_city_cate'):
    df[int_column] = df[int_column].astype(int)

# Column-oriented data, in the same order as the schema fields.
item_index_list = df['item_index'].tolist()
item_city_cate_list = df['item_city_cate'].tolist()
item_embedding_list = df['item_embedding'].tolist()

# Cut the three columns into aligned batches of at most `hash_num` rows;
# every batch is [indexes, city categories, embeddings].
column_lists = (item_index_list, item_city_cate_list, item_embedding_list)
entities_list = [
    [column[start:start + hash_num] for column in column_lists]
    for start in range(0, len(item_index_list), hash_num)
]
# Insert the batches one by one, printing the batch number as progress output.
for i, each_entities in enumerate(entities_list):
    print(i)
    # The target partition is derived from the batch number so that it always
    # names one of the existing hash_0 .. hash_<hash_num - 1> partitions.
    # NOTE(review): the previous code used an undefined name (`city_hash`)
    # here; batches are not grouped by city, so round-robin assignment is
    # used - confirm this matches the intended partitioning scheme.
    partition_idx = i % hash_num
    insert_result = my_mil.insert(
        data=each_entities,
        partition_name='hash_' + str(partition_idx),
    )
# Index definition for the vector field: IVF_FLAT with the L2 metric and
# nlist=128.
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}
my_mil.create_index(field_name="item_embedding", index_params=index)
# Load the collection once the index has been built.
my_mil.load()
# Disabled alternative (a bare string literal, so it never executes):
# insert all three columns with a single insert() call, then load the
# collection. The inner comments read "insert data" and "load data".
'''
# 插入数据
entities = [
df['item_index'].tolist(),
df['item_city_cate'].tolist(),
df['item_embedding'].tolist()
]
# 导入数据
insert_result = my_mil.insert(entities)
my_mil.load()
'''
# Disabled alternative (bare string literal): build the collection directly
# from the DataFrame. The inner comments record the CannotInferSchemaException
# that was hit and note that the DataFrame columns must first be converted
# from `object` to concrete types such as list or int.
'''
# 或者直接从 data frame 中插入
# pymilvus.orm.exceptions.CannotInferSchemaException: <CannotInferSchemaException: (code=0, message=Cannot infer schema from empty dataframe.)>
# 需要把 df 的各个列的格式由 object 转成 list、int 等
my_mil, insert_result = Collection.construct_from_dataframe(name=mil_name, dataframe=df, primary_field='item_index', auto_id=False)
# 导入数据
my_mil.load()
'''
# Disabled cleanup snippet (bare string literal): delete the entities that
# match a boolean expression, then drop the whole collection.
'''
# 按照条件删除对应的 entities
ids = [105, 213]
expr = f"item_city_cate in [{ids[0]}, {ids[1]}]"
my_mil.delete(expr)
# 删除 collection
utility.drop_collection(mil_name)
'''
查询向量 client 服务
import datetime
import pandas as pd
import random
import time
from pymilvus import (
connections,
utility,
FieldSchema, CollectionSchema, DataType,
Collection,
)
# Name of the collection to search; also used as the connection alias.
mil_name = "item_dssm_embedding"
connections.connect(alias=mil_name, host="localhost", port="19530")
connections.list_connections()  # return value is not used
# NOTE(review): `my_mil` is whatever connections.get_connection() returns for
# this alias, not a Collection object; the search/query calls in this script
# therefore pass `collection_name` explicitly.
my_mil = connections.get_connection(mil_name)
# 3000 random 128-dimensional vectors; only the last two are used as query
# vectors.
entities = [[random.random() for _ in range(128)] for _ in range(3000)]
vectors_to_search = entities[-2:]

# Search parameters. The metric name is written in upper case ("L2") so that
# it matches the spelling used when the index on `item_embedding` was built.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}
# Timed vector search restricted to partition 'hash_10', with limit=3 and
# the city category returned alongside each hit.
start_time = time.time()
result = my_mil.search(
    collection_name=mil_name,
    data=vectors_to_search,
    anns_field="item_embedding",
    partition_names=['hash_10'],
    param=search_params,
    limit=3,
    output_fields=["item_city_cate"],
)
cost_time = time.time() - start_time

# `result` holds one group of hits per query vector.
for hits in result:
    for hit in hits:
        city_cate = hit.entity.get('item_city_cate')
        print(f"hit: {hit} , item_city_cate field: {city_cate} ")
# Timed scalar query: fetch the category and embedding of every entity whose
# city category is greater than 104, then show the first row returned.
start_time = time.time()
result = my_mil.query(
    collection_name=mil_name,
    expr="item_city_cate > 104",
    output_fields=["item_city_cate", "item_embedding"],
)
cost_time = time.time() - start_time
first_row = result[0]
print(f"query result:\n- {first_row} ")
# Hybrid search: vector search over the whole collection, filtered by a
# boolean expression on the city category.
start_time = time.time()
result = my_mil.search(
    collection_name=mil_name,
    data=vectors_to_search,
    anns_field="item_embedding",
    param=search_params,
    limit=3,
    expression="item_city_cate > 504",
    output_fields=["item_city_cate"],
)
for hits in result:
    for hit in hits:
        city_cate = hit.entity.get('item_city_cate')
        print(f"hit: {hit} , item_city_cate field: {city_cate} ")
# The elapsed time is taken after the printing loop, so it includes it.
cost_time = time.time() - start_time
# Filtered search for a single query vector (the first of the two).
result = my_mil.search(
    collection_name=mil_name,
    data=[vectors_to_search[0]],
    anns_field="item_embedding",
    param=search_params,
    limit=3,
    expression="item_city_cate > 504",
    output_fields=["item_city_cate"],
)

# Flatten every hit into a plain dict holding its id, distance and category.
result_list = [
    {
        '=index': hit.id,
        'distance': hit.distance,
        'item_city_cate': hit.entity.get('item_city_cate'),
    }
    for hits in result
    for hit in hits
]
# Appears to be sample contents of `result_list` from one run, kept for
# reference as a bare string literal (it has no runtime effect).
'''
[{'=index': 4674, 'distance': 47.85670471191406, 'item_city_cate': 544},
{'=index': 1936, 'distance': 48.49840545654297, 'item_city_cate': 774},
{'=index': 5095, 'distance': 48.49840545654297, 'item_city_cate': 774}]
'''