Using Weaviate for Vector Storage
1 Introduction
There are many open-source vector databases available today. They fall into two broad categories: embedded vector databases (the database lives inside the application process) and client-server vector databases (the client and the server are separate). Well-known examples include Chroma and LanceDB on the embedded side, and Milvus, Faiss, Qdrant, and Weaviate on the client-server side. This article uses Weaviate.
Weaviate is an AI-native vector database that emphasizes flexible and efficient interaction with knowledge graphs. It supports single-node and cluster deployments, and offers both keyword search and semantic search.
Chroma stands out for ease of use, suits scenarios without a dedicated vector database server, and is presented as an ideal choice for audio and video search.
Milvus emphasizes the balance between storage efficiency and query performance, and is suited to vector embeddings, efficient similarity search, and AI applications.
Faiss offers GPU-accelerated, high-speed retrieval and excels at fast nearest-neighbor search over very large datasets; it is suited to fast, dense vector similarity search and clustering.
Embedded vector databases
(1) Chroma
# GitHub repository
https://github.com/chroma-core/chroma
(2) LanceDB
# GitHub repository
https://github.com/lancedb/lancedb
Client-server vector databases
(1) Milvus
# GitHub repository
https://github.com/milvus-io/milvus
(2) Faiss
# GitHub repository
https://github.com/facebookresearch/faiss
(3) Weaviate
# GitHub repository
https://github.com/weaviate/weaviate
# Official documentation
https://weaviate.io/developers/weaviate
# Quickstart guide
https://weaviate.io/developers/weaviate/quickstart
2 Installing with Docker
See the official documentation:
https://weaviate.io/developers/weaviate/installation/docker-compose
Parameter descriptions
# Environment variables:
# 1 Enable authentication
# Require authentication when connecting to the database
AUTHENTICATION_APIKEY_ENABLED: 'true'
# API keys allowed to connect; multiple keys correspond to multiple users
AUTHENTICATION_APIKEY_ALLOWED_KEYS: 'jane-secret-key,ian-secret-key'
# Users allowed to connect; this list maps one-to-one to the keys above
AUTHENTICATION_APIKEY_USERS: 'jane@doe.com,ian-smith'
# 2 Enable authorization
# Configure per-user permissions via the admin list
AUTHORIZATION_ADMINLIST_ENABLED: 'true'
# Users with admin (read and write) access; separate multiple users with commas
AUTHORIZATION_ADMINLIST_USERS: 'jane@doe.com'
# Users with read-only access; separate multiple users with commas
AUTHORIZATION_ADMINLIST_READONLY_USERS: 'ian-smith'
Installing Weaviate with docker run
docker run -itd --name=weaviate \
--restart=always \
-p 8080:8080 \
-p 50051:50051 \
-e "AUTHENTICATION_APIKEY_ENABLED=true" \
-e "AUTHENTICATION_APIKEY_ALLOWED_KEYS=jane-secret-key,ian-secret-key" \
-e "AUTHENTICATION_APIKEY_USERS=jane@doe.com,ian-smith" \
-e "AUTHORIZATION_ADMINLIST_ENABLED=true" \
-e "AUTHORIZATION_ADMINLIST_USERS=jane@doe.com" \
-e "AUTHORIZATION_ADMINLIST_READONLY_USERS=ian-smith" \
cr.weaviate.io/semitechnologies/weaviate:1.25.0
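Once the container is running, you can check that the instance is reachable on the mapped HTTP port via the /v1/.well-known/ready endpoint. A minimal sketch, assuming the command above was run on the local machine and the requests package is installed (the Authorization header carries one of the API keys configured above; depending on your setup the readiness endpoint may also respond without it):
import requests

# Readiness probe against the HTTP port mapped in the docker run command above
resp = requests.get(
    "http://localhost:8080/v1/.well-known/ready",
    headers={"Authorization": "Bearer jane-secret-key"}
)
# 200 means Weaviate is up and ready to accept requests
print(resp.status_code)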
Key/user/permission mapping
No. | API key | User | Permissions |
---|---|---|---|
1 | jane-secret-key | jane@doe.com | Admin: read and write |
2 | ian-secret-key | ian-smith | Read-only |
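This mapping can be checked directly from the Python client (installation is covered in section 3). A minimal sketch, assuming the client runs on the same host as the container so connect_to_local reaches the default ports 8080/50051; the key names are the ones configured above:
import weaviate
from weaviate.auth import AuthApiKey

# jane-secret-key maps to jane@doe.com, an admin-list user: reads and writes are allowed
admin_client = weaviate.connect_to_local(auth_credentials=AuthApiKey("jane-secret-key"))
print(admin_client.is_ready())
admin_client.close()

# ian-secret-key maps to ian-smith, a read-only user: queries succeed,
# but schema changes and inserts are rejected by the server
readonly_client = weaviate.connect_to_local(auth_credentials=AuthApiKey("ian-secret-key"))
print(readonly_client.collections.list_all())  # read operation, allowed
readonly_client.close()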
3 Using Weaviate
3.1 Environment setup
⚠️ The installation below targets version 4 of the Python client, which requires Weaviate 1.23 or higher.
# Install the weaviate client
pip install -U weaviate-client -i https://pypi.tuna.tsinghua.edu.cn/simple
# Install sentence-transformers
pip install -U sentence-transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
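To confirm the embedding model loads correctly, you can encode a sentence and check the vector dimensionality. A quick sketch (the model is downloaded from Hugging Face on first use unless a local path is given, as in the next section):
from sentence_transformers import SentenceTransformer

# Load the model; replace with a local path such as "E:/model/all-MiniLM-L6-v2" if already downloaded
model = SentenceTransformer("all-MiniLM-L6-v2")
vector = model.encode("A quick smoke test sentence.")
# all-MiniLM-L6-v2 produces 384-dimensional embeddings
print(len(vector))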
3.2 Using Weaviate from Python
# Embedding model
from sentence_transformers import SentenceTransformer
# Weaviate client (v4)
import weaviate
from weaviate.auth import AuthApiKey
from weaviate.classes.config import Property, DataType
from weaviate.collections.classes.config import Configure
from weaviate.collections.classes.grpc import MetadataQuery

# Load the SentenceTransformer model from a local path
model = SentenceTransformer("E:/model/all-MiniLM-L6-v2")

ip_addr = "192.168.108.200"
# Connect to a self-hosted instance
client = weaviate.connect_to_custom(
    http_host=ip_addr,
    http_port=8080,
    http_secure=False,
    grpc_host=ip_addr,
    grpc_port=50051,
    grpc_secure=False,
    # One of the keys from AUTHENTICATION_APIKEY_ALLOWED_KEYS
    # Note: only the key is required here, not the user name
    auth_credentials=AuthApiKey("jane-secret-key")
)

collection_name = "article"
def create_collection():
    """
    Create the collection
    :return:
    """
    client.collections.create(
        # Weaviate capitalizes collection names, so "article" is stored as "Article"
        name="Article",
        # Text properties
        properties=[
            Property(name="title", data_type=DataType.TEXT),
            Property(name="summary", data_type=DataType.TEXT)
        ],
        # Named-vector configuration
        vectorizer_config=[
            # Modules such as text2vec_openai or text2vec_cohere need an external API connection,
            # e.g. vectorizer_config=Configure.Vectorizer.text2vec_openai(),
            # and the docs note that this style will be removed later.
            # Vectors are generated locally with SentenceTransformer, so "none" is used here.
            Configure.NamedVectors.none(name="title"),
            # Every vector supplied at insert time needs a matching named vector,
            # so "summary" is declared as well
            Configure.NamedVectors.none(name="summary")
        ]
    )
def delete_collection():
    """
    Delete the collection
    :return:
    """
    # Delete the collection
    client.collections.delete(collection_name)

def query_collection_info():
    # Readiness status
    print(client.is_ready())
    # Connection status
    print(client.is_connected())
    # List all collections
    print(client.collections.list_all())
    # Get the collection
    article_collection = client.collections.get(collection_name)
    # Fetch all objects
    # include_vector=True also returns the stored vectors
    print(article_collection.query.fetch_objects(include_vector=True))
    # Iterate over the objects
    for item in article_collection.iterator(include_vector=True):
        print(item.properties)
        print(item.vector)
def insert_collection():
    """
    Insert a single object
    :return:
    """
    # Get the collection
    article_collection = client.collections.get(collection_name)
    # Insert one object; the generated UUID is returned
    uuid = article_collection.data.insert(
        properties={
            "title": "A delicious Riesling",
            "summary": "This wine is a delicious Riesling which pairs well with seafood."
        },
        # Supply the named vectors
        vector={
            "title": model.encode("A delicious Riesling").tolist(),
            "summary": model.encode("This wine is a delicious Riesling which pairs well with seafood.").tolist()
        }
    )
    print(uuid)
def batch_collection():
    """
    Insert objects in batch
    :return:
    """
    data_list = [
        {"title": "A man is eating food.", "summary": ""},
        {"title": "A man is eating a piece of bread.", "summary": ""},
        {"title": "The girl is carrying a baby.", "summary": ""},
        {"title": "A man is riding a horse.", "summary": ""},
        {"title": "A woman is playing violin.", "summary": ""},
        {"title": "Two men pushed carts through the woods.", "summary": ""},
        {"title": "A man is riding a white horse on an enclosed ground.", "summary": ""},
        {"title": "A monkey is playing drums.", "summary": ""},
        {"title": "A cheetah is running behind its prey.", "summary": ""}
    ]
    # Embed the titles
    title_embedding_list = list()
    for item in data_list:
        # Encode the text into an embedding
        title_embedding_list.append(model.encode(item.get("title")).tolist())
    # Get the collection
    collection = client.collections.get(collection_name)
    # Add the objects to the database
    with collection.batch.dynamic() as batch:
        # Insert the objects one by one
        for i, data_row in enumerate(data_list):
            batch.add_object(
                properties=data_row,
                vector={
                    "title": title_embedding_list[i]
                }
            )
def query_vector_collection():
    """
    Query by vector
    :return:
    """
    article_collection = client.collections.get(collection_name)
    # query.near_text would require a configured vectorizer module (e.g. an external API).
    # Vectors are generated locally with SentenceTransformer, so query.near_vector is used instead.
    response = article_collection.query.near_vector(
        # Query vector
        near_vector=model.encode("A man is eating pasta.").tolist(),
        # The collection defines several named vectors, so the target vector must be specified
        target_vector="title",
        # Maximum number of results
        limit=10,
        # Return the distance in the result metadata
        return_metadata=MetadataQuery(distance=True)
    )
    # Print the results
    for o in response.objects:
        print(o.properties)
        print(o.metadata.distance)
def query_keyword_collection():
    """
    Query by keyword. Keyword search is also known as "BM25 (Best Match 25)" or "sparse vector" search.
    :return:
    """
    article_collection = client.collections.get(collection_name)
    # Keyword (BM25) query
    response = article_collection.query.bm25(
        query="food",
        return_metadata=MetadataQuery(score=True),
        limit=3
    )
    for o in response.objects:
        print(o.properties)
        # BM25 returns a score rather than a distance
        print(o.metadata.score)
if __name__ == '__main__':
    # delete_collection()
    # 1 Create the collection
    # create_collection()
    # 2 Insert a single object
    # insert_collection()
    # 3 Insert objects in batch
    # batch_collection()
    # 4 Inspect the collection
    # query_collection_info()
    # 5 Query by vector
    query_vector_collection()
    # 6 Query by keyword
    query_keyword_collection()
    # Close the client
    client.close()
Execution screenshot