Milvus 2 使用

一些有用的官方参考
  1. 术语表:https://milvus.io/cn/docs/v2.0.0/glossary.md
  2. 布尔表达式语法规则:https://milvus.io/cn/docs/v2.0.0/boolean.md
  3. Field Schema (数据)支持类型:https://milvus.io/cn/docs/v2.0.0/field_schema.md
  4. Build an Index (索引)支持类型 No_1:https://milvus.io/cn/docs/v2.0.0/build_index.md
  5. Build an Index (索引)支持类型 No_2:https://milvus.io/cn/docs/v2.0.0/index_selection.md
  6. 从 DataFrame 生成一个 collection schema 并创建一个 collection:https://milvus.io/cn/docs/v2.0.0/collection_schema.md
  7. 混合搜索查询:https://milvus.io/cn/docs/v2.0.0/hybridsearch.md
  8. 性能优化:https://milvus.io/cn/docs/v1.1.0/performance_faq.md
存储向量 server 服务
import datetime
import pandas as pd
import random
import time

from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

mil_name = "item_dssm_embedding"
hash_num = 64


def _parse_embedding(raw):
    """Convert one raw ``item_embedding`` cell into a list of floats.

    A cell read from CSV arrives as a string, so iterating over it directly
    would yield single characters (and ``float('[')`` raises ValueError).
    Strings are therefore tokenized first; values that are already
    sequences are converted element by element.
    """
    if isinstance(raw, str):
        # NOTE(review): assumes the vector is serialized as numbers separated
        # by commas and/or whitespace, optionally wrapped in [] or () --
        # confirm against the actual CSV.
        tokens = raw.strip().strip('[]()').replace(',', ' ').split()
        return [float(token) for token in tokens]
    return [float(value) for value in raw]


# Load the item vectors; the file has no header row, so column names are given here.
df = pd.read_csv('/home/q/milvus_test/data/item_daily.csv', names=['item_name', 'item_city', 'item_index', 'item_embedding'])

# Turn every embedding cell into a list of floats.
df['item_embedding'] = df['item_embedding'].apply(_parse_embedding)

# Give every distinct city an integer category id (numbered in order of first
# appearance); the id is used as the filter field in hybrid searches.
item_city_list = df['item_city'].drop_duplicates().tolist()
item_city_dict = {city: cate for cate, city in enumerate(item_city_list)}

df['item_city_cate'] = df['item_city'].apply(item_city_dict.__getitem__)

# Connect to the Milvus server under the default alias.
connections.connect("default", host="localhost", port="19530")
connections.list_connections()

# Start from scratch: drop the collection if an earlier run left one behind.
collection_exists = utility.has_collection(mil_name)
if collection_exists:
    utility.drop_collection(mil_name)

# Schema fields: the item index (primary key, supplied by us), the item's city
# category id, and the item's embedding vector.
item_index_field = FieldSchema(name="item_index", dtype=DataType.INT64, is_primary=True, auto_id=False)
item_city_cate_field = FieldSchema(name="item_city_cate", dtype=DataType.INT64)
item_embedding_field = FieldSchema(name="item_embedding", dtype=DataType.FLOAT_VECTOR, dim=128)
fields = [item_index_field, item_city_cate_field, item_embedding_field]

# Create the collection from the schema.
schema = CollectionSchema(fields, mil_name)
my_mil = Collection(mil_name, schema)

# One partition per hash bucket, named hash_0 .. hash_<hash_num - 1>.
# (Original note: the number of partitions should be limited to 4096.)
for bucket in range(hash_num):
    partition_label = 'hash_' + str(bucket)
    my_mil.create_partition(partition_name=partition_label, description=partition_label)

df['item_index'] = df['item_index'].astype(int)
df['item_city_cate'] = df['item_city_cate'].astype(int)

# Route each row to a partition by hashing its city category into one of the
# hash_num buckets. The original loop referenced an undefined `city_hash`
# (NameError on the first insert); this defines it per group of rows.
# NOTE(review): "city category modulo hash_num" is the assumed routing rule --
# confirm it matches how clients pick partition_names when searching.
partition_ids = (df['item_city_cate'] % hash_num).rename('city_hash')

# Rows sent per insert call. Inserts are chunked to avoid the error
#   grpc: received message larger than max (1364009470 vs. 536870912)
# (the original note cites https://milvus.io/cn/docs/v1.1.0/storage_operation.md:
# a single insert must not exceed 256 MB).
insert_batch_size = 10000

batch_no = 0
for city_hash, partition_df in df.groupby(partition_ids):
    partition_name = 'hash_' + str(city_hash)
    for start in range(0, len(partition_df), insert_batch_size):
        batch_df = partition_df.iloc[start:start + insert_batch_size]
        # Column order must follow the field order of the collection schema.
        each_entities = [
            batch_df['item_index'].tolist(),
            batch_df['item_city_cate'].tolist(),
            batch_df['item_embedding'].tolist(),
        ]
        print(batch_no)  # progress indicator
        insert_result = my_mil.insert(data=each_entities, partition_name=partition_name)
        batch_no += 1

# Index definition for the vector field: IVF_FLAT, L2 metric, nlist of 128.
index = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=128),
)

my_mil.create_index("item_embedding", index)

# Load the collection so that it can be searched.
my_mil.load()


# Disabled alternative 1 (kept as a bare string, never executed): insert all
# three columns of df with a single insert call, then load the collection.
'''
# 插入数据
entities = [
    df['item_index'].tolist(),
    df['item_city_cate'].tolist(),
    df['item_embedding'].tolist()
]

# 导入数据
insert_result = my_mil.insert(entities)
my_mil.load()
'''




# Disabled alternative 2 (never executed): build the collection directly from
# the DataFrame via Collection.construct_from_dataframe. The notes inside
# record a CannotInferSchemaException and say the object-typed columns must
# first be converted to list / int.
'''
# 或者直接从 data frame 中插入
# pymilvus.orm.exceptions.CannotInferSchemaException: <CannotInferSchemaException: (code=0, message=Cannot infer schema from empty dataframe.)>
# 需要把 df 的各个列的格式由 object 转成 list、int 等
my_mil, insert_result = Collection.construct_from_dataframe(name=mil_name, dataframe=df, primary_field='item_index', auto_id=False)

# 导入数据
my_mil.load()
'''


# Disabled clean-up snippet (never executed): delete the entities matching a
# boolean expression on item_city_cate, then drop the whole collection.
'''
# 按照条件删除对应的 entities
ids = [105, 213]
expr = f"item_city_cate in [{ids[0]}, {ids[1]}]"
my_mil.delete(expr)

# 删除 collection
utility.drop_collection(mil_name)
'''

查询向量 client 服务
import datetime
import pandas as pd
import random
import time

from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

mil_name = "item_dssm_embedding"

# Register a connection whose alias is the collection name, then fetch the
# handle registered under that alias; all searches below go through it.
connections.connect(mil_name, host="localhost", port="19530")
connections.list_connections()
my_mil = connections.get_connection(mil_name)
# NOTE(review): this is the connection-level handle, not a Collection object;
# confirm the installed pymilvus version still exposes get_connection and its
# search/query signatures (collection_name=..., expression=...).

# Query vectors: 3000 random 128-dimensional vectors are generated and the
# last two are used as the vectors to search for.
entities = [[random.random() for _ in range(128)] for _ in range(3000)]
vectors_to_search = entities[-2:]

# Search parameters. The metric is spelled "L2" to match the metric the index
# on item_embedding was built with.
search_params = {
    "metric_type": "L2",
    "params": {"nprobe": 10},
}

# 1. search: find the stored vectors closest to each query vector, looking
#    only inside partition hash_10 and returning the top 3 per query.
start_time = time.time()
search_kwargs = dict(
    collection_name=mil_name,
    data=vectors_to_search,
    anns_field="item_embedding",
    partition_names=['hash_10'],
    param=search_params,
    limit=3,
    output_fields=["item_city_cate"],
)
result = my_mil.search(**search_kwargs)
cost_time = time.time() - start_time

# Print every hit together with its item_city_cate field.
for hits in result:
    for hit in hits:
        city_cate = hit.entity.get('item_city_cate')
        print(f"hit: {hit}, item_city_cate field: {city_cate}")


# 2. query: fetch the entities that satisfy the boolean expression.
start_time = time.time()
result = my_mil.query(collection_name=mil_name,
                      expr="item_city_cate > 104",
                      output_fields=["item_city_cate", "item_embedding"])
cost_time = time.time() - start_time

# Show the first match; guard against an empty result so that indexing
# result[0] cannot raise IndexError when nothing matches.
if result:
    print(f"query result:\n-{result[0]}")
else:
    print("query result: no entity matched the expression")




# 3. filtered search: find the stored vectors closest to each query vector,
#    restricted to entities that satisfy the boolean expression.
start_time = time.time()
result = my_mil.search(collection_name=mil_name,
                       data=vectors_to_search,
                       anns_field="item_embedding",
                       param=search_params,
                       limit=3,
                       expression="item_city_cate > 504",
                       output_fields=["item_city_cate"])
# Stop the clock right after the search returns, so the measurement covers the
# search only (as in the other timed sections) and not the printing below.
cost_time = time.time() - start_time

for hits in result:
    for hit in hits:
        print(f"hit: {hit}, item_city_cate field: {hit.entity.get('item_city_cate')}")



# Search for a single vector and return its top-3 nearest matches among the
# entities that satisfy the boolean expression.
single_query = [vectors_to_search[0]]
result = my_mil.search(
    collection_name=mil_name,
    data=single_query,
    anns_field="item_embedding",
    param=search_params,
    limit=3,
    expression="item_city_cate > 504",
    output_fields=["item_city_cate"],
)

# Flatten the hits into a list of plain dicts: id, distance and city category.
result_list = [
    {
        '=index': hit.id,
        'distance': hit.distance,
        'item_city_cate': hit.entity.get('item_city_cate'),
    }
    for hits in result
    for hit in hits
]


'''
[{'=index': 4674, 'distance': 47.85670471191406, 'item_city_cate': 544},
{'=index': 1936, 'distance': 48.49840545654297, 'item_city_cate': 774}, 
{'=index': 5095, 'distance': 48.49840545654297, 'item_city_cate': 774}]
'''


  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值