向量数据库weaviate,Python Client v4一些简单使用
1.连接向量数据库weaviate
注意连接方法很多,可以参考:https://weaviate.io/developers/weaviate/connections/connect-custom
import weaviate
from weaviate.auth import AuthApiKey
# Connect to a locally running Weaviate instance, authenticating with an API key.
api_key = AuthApiKey("test-secret-key")
client = weaviate.connect_to_local(auth_credentials=api_key)
# Custom connection: spell out the HTTP and gRPC endpoints explicitly.
client = weaviate.connect_to_custom(
    skip_init_checks=False,
    http_host="127.0.0.1",
    http_port=8080,
    http_secure=False,
    grpc_host="127.0.0.1",
    grpc_port=50051,
    grpc_secure=False,
    # Must match one of the keys in AUTHENTICATION_APIKEY_ALLOWED_KEYS.
    # Note: only the key itself is needed here, not a user name.
    auth_credentials=AuthApiKey("test-secret-key"),
)
try:
    # Prints whether the server reports itself as ready.
    print(client.is_ready())
finally:
    # Always release the connection, even if the readiness check raises.
    # close() has no meaningful return value, so it is no longer printed.
    client.close()
如果没有安装,可以参考:向量数据库weaviate安装和部署
2.Python Client v4检查collection是否存在
注意:下面的检查函数是我自己封装的;实际上Python Client v4也提供了client.collections.exists(collection_name),可以直接判断collection是否存在
def check_index_exists(client: WeaviateClient, collection_name: str) -> bool:
    """
    Check whether a collection (index) exists.

    Delegates to the client's built-in ``collections.exists`` rather than
    comparing the raw name against the keys of ``collections.list_all()``.
    Weaviate stores collection names with a capitalised first letter, so a
    plain string comparison can miss a collection that was created from a
    lower-case name; the built-in check avoids that and does not need to
    fetch every collection definition.

    :param client: connected Weaviate client
    :param collection_name: name of the collection to look for
    :return: True if the collection exists; False if it does not or if the
        check itself fails
    """
    try:
        return client.collections.exists(collection_name)
    except Exception as e:
        # Best-effort check: report the problem and treat it as "not found".
        print(f"检查索引异常: {e}")
        return False
3.Python Client v4 collection不存在时,创建collection
注意:对象的属性(text)以文本形式存储;vectorizer设为none,向量需要自己生成后随数据一起上传,检索时按向量匹配
def create_collection(client: WeaviateClient, collection_name: str):
    """
    Create a collection named ``collection_name`` from a raw schema dict.

    The schema declares a single ``text`` property, an HNSW vector index
    using cosine distance, and ``vectorizer`` set to ``"none"``. An
    ``UnexpectedStatusCodeException`` raised by the create call is printed
    instead of being propagated.

    :param client: connected Weaviate client
    :param collection_name: name of the collection to create
    """
    inverted_index = {
        "bm25": {
            "b": 0.75,
            "k1": 1.2,
        },
        "stopwords": {
            "preset": "en",
            "additions": ["example"],
            "removals": ["a", "the"],
        },
        "indexTimestamps": True,
        "indexNullState": True,
        "indexPropertyLength": True,
    }
    vector_index = {
        "distance": "cosine",
        "efConstruction": 200,
        "maxConnections": 64,
    }
    sharding = {
        "desiredCount": 1,  # Adjust according to your cluster size
        "virtualPerPhysical": 128,
        "strategy": "hash",
        "key": "_id",
        "function": "murmur3",
    }
    text_property = {
        "name": "text",
        "description": "The text content",
        "dataType": ["text"],
        "tokenization": "word",
        "indexFilterable": True,
        "indexSearchable": True,
    }
    schema = {
        "class": collection_name,
        "description": "A collection for product information",
        "invertedIndexConfig": inverted_index,
        "vectorizer": "none",  # Assuming you want to upload your own vectors
        "vectorIndexType": "hnsw",
        "vectorIndexConfig": vector_index,
        "shardingConfig": sharding,
        "replicationConfig": {
            "factor": 1,  # Number of copies for replication
        },
        "multiTenancyConfig": {
            "enabled": False,
        },
        "properties": [text_property],
    }

    try:
        client.collections.create_from_dict(schema)
    except weaviate.exceptions.UnexpectedStatusCodeException as e:
        print(f"创造索引异常: {e}")
    else:
        # Only reached when the create call did not raise.
        print(f"创造索引 '{collection_name}'成功 .")
官网提供的远不止如此,可以参考:https://weaviate.io/developers/weaviate/client-libraries/python
4.Python Client v4 删除collection
def delete_collection(self, collection_name: str):
    """
    Delete the collection named ``collection_name``.

    Failures are reported on stdout rather than raised.

    :param collection_name: name of the collection to delete
    """
    try:
        # Use the instance's own client (like the other methods of this
        # class) instead of relying on a module-level ``client`` variable.
        self.client.collections.delete(collection_name)
    except Exception as e:
        # Report the name that was actually passed in, which may differ
        # from the instance's default collection.
        print(f"删除索引{collection_name}异常:{e}")
官网还提供了根据uuid删除单个对象和按条件批量删除对象的方法
# Delete a single object from the collection by its UUID.
# NOTE: `new_uuid` is not defined in this snippet; it must already hold the
# UUID of an existing object before this runs.
questions = client.collections.get("JeopardyQuestion")
deleted = questions.data.delete_by_id(uuid=new_uuid)
from weaviate.classes.query import Filter

# Batch delete: remove every object whose "question" property equals
# "Test Question".
questions = client.collections.get("JeopardyQuestion")
test_question_filter = Filter.by_property(name="question").equal("Test Question")
response = questions.data.delete_many(where=test_question_filter)
5.Python Client v4 向collection插入数据
注意:这里是分词后存储的数据
def save_documents(self, documents: List[Document]):
    """
    Embed each document and insert it into the instance's collection.

    For every document, the page content is cleaned with
    ``self.clean_text``, embedded with
    ``self.embedding_function.embed_query`` and stored as the ``text``
    property together with that vector. A failed insert is printed and the
    loop moves on to the next document.

    :param documents: documents whose ``page_content`` should be stored
    """
    target = self.client.collections.get(self.collection_name)
    for document in documents:
        text = self.clean_text(document.page_content)
        embedding = self.embedding_function.embed_query(text)
        try:
            object_id = target.data.insert(
                properties={"text": text},
                vector=embedding,
            )
            # Only the first 30 characters are echoed to keep the log short.
            print(f"文档添加内容: {text[:30]}...,uuid: {object_id}")
        except Exception as e:
            print(f"添加文档异常: {e}")
如果只想测试功能,只需关注循环里面的代码即可:self.clean_text用于处理多余的空格和\n\n,可以不用关注;doc.page_content是文本内容,可以自己随便输入一些文字
注意:官网还提供插入大量数据的方法
# Bulk insert: build five objects and pass them to a single insert_many call.
questions = client.collections.get("JeopardyQuestion")
# Produces "Test Question 1" through "Test Question 5".
properties = [{"question": f"Test Question {i+1}"} for i in range(5)]
response = questions.data.insert_many(properties)
6.Python Client v4 从collection查询数据
def query_vector_collection(self, query: str, k: int) -> List[Document]:
    """
    Run a vector search for ``query`` and return the matching documents.

    The query text is embedded with
    ``self.embedding_function.embed_query`` and passed to a ``near_vector``
    search on the instance's collection with ``limit=k``.

    :param query: text to search for
    :param k: value passed as the search ``limit``
    :return: one ``Document`` per returned object, built from its ``text``
        property
    """
    query_embedding = self.embedding_function.embed_query(query)
    target = self.client.collections.get(self.collection_name)
    result = target.query.near_vector(
        near_vector=query_embedding,
        return_metadata=MetadataQuery(distance=True),
        limit=k,
    )
    # Distance is requested above, so each hit also exposes it as
    # ``hit.metadata.distance``; only the text is returned here.
    documents = []
    for hit in result.objects:
        documents.append(Document(page_content=hit.properties['text']))
    return documents
除了向量查询,官网还提供了bm25(关键词)和near_text(语义)两种查询方法,并且都可以添加更多的参数,详情可以参考官网:
https://weaviate.io/developers/weaviate/client-libraries/python
# Keyword (BM25) search for "animal", limited to 2 results.
questions = client.collections.get("JeopardyQuestion")
response = questions.query.bm25(query="animal", limit=2)

for obj in response.objects:
    print(obj.properties)  # Object properties
# near_text search for "animal", limited to 2 results.
# NOTE(review): near_text presumably needs a vectorizer configured on the
# collection so the query text can be embedded server-side — confirm.
questions = client.collections.get("JeopardyQuestion")
response = questions.query.near_text(query="animal", limit=2)

for obj in response.objects:
    print(obj.properties)  # Object properties