安装
pip install chromadb
你的本地电脑需要有完整的c++的环境
Building wheels for collected packages: hnswlib
Building wheel for hnswlib (pyproject.toml) ... error
error: subprocess-exited-with-error
× Building wheel for hnswlib (pyproject.toml) did not run successfully.
│ exit code: 1
╰─> [12 lines of output]
running bdist_wheel
不然在编译hnswlib库的时候会报错
解决方法:安装 Visual Studio,并在安装器中勾选"使用 C++ 的桌面开发"工作负载
使用教程
http://www.bimant.com/blog/chroma-vector-db/
获取chroma client对象
# Quickstart: create an in-memory Chroma client and run a first query.
import chromadb

client = chromadb.Client()

# A "collection" is Chroma's dataset/table abstraction.
collection = client.create_collection(name="my_collection")

# Add raw documents; Chroma computes the embeddings automatically.
collection.add(
    documents=["This is a document", "This is another document"],
    metadatas=[{"source": "my_source"}, {"source": "my_source"}],
    ids=["id1", "id2"],
)

# If you already computed embeddings yourself, pass them in directly:
# collection.add(
#     embeddings=[[1.2, 2.3, 4.5], [6.7, 8.2, 9.2]],
#     documents=["This is a document", "This is another document"],
#     metadatas=[{"source": "my_source"}, {"source": "my_source"}],
#     ids=["id1", "id2"],
# )

# Query the collection for the two nearest documents.
results = collection.query(
    query_texts=["This is a query document"],
    n_results=2,
)
print(results)
langchain的python版本要大于3.8
ERROR: Package 'langchain' requires a different Python: 3.7.16 not in '<4.0,>=3.8.1'
注意:如果 Python 的版本低于 3.8,pip 只能安装到很旧的 langchain 版本,网上教程里的很多功能都会不可用。这是一个常见的坑,务必先确认 Python 版本。
chroma向量数据库持久化存储,chromadb为我们提供chroma的客户端
# Ingestion script: load local .txt documents, embed them with OpenAI,
# and persist the vectors to disk with Chroma.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI, VectorDBQA
from langchain.chains import RetrievalQA
# Document loaders
import os
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import TokenTextSplitter

# Directory Chroma persists to. Without a persist_directory, Chroma keeps
# everything in memory and the data is lost when the process exits.
persist_directory = "./public"

# Load every .txt file under the directory (recursively).
loader = DirectoryLoader(persist_directory, glob='**/*.txt')
docs = loader.load()
# # Alternatively, load a single document:
# loader = TextLoader('./public/五种风险管理策略.txt', encoding='utf8')
# docs = loader.load()

# BUGFIX: use setdefault so an OPENAI_API_KEY already exported in the
# environment is not clobbered with an empty string (which would make
# every OpenAI call fail with an auth error). Fill in the fallback value
# only for local testing.
os.environ.setdefault("OPENAI_API_KEY", "")
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

# Split documents into chunks so each embedding request stays under the
# OpenAI API token limit.
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=0)
doc_texts = text_splitter.split_documents(docs)

# Embed the chunks and store the vectors in Chroma.
vectordb = Chroma.from_documents(doc_texts, embeddings, persist_directory=persist_directory)
# Flush the index to disk.
vectordb.persist()
# After this runs, the persist directory contains .parquet files — that is
# Chroma's on-disk representation of the persisted vectors.
向量化搜索以及调用OpenAI回答
# Search script: reload the persisted Chroma store, run a local vector
# similarity search, and answer the question with OpenAI.
import os  # BUGFIX: os was used below but never imported in this snippet

import streamlit as st
from streamlit_chat import message
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders.csv_loader import CSVLoader
# BUGFIX: Chroma was used below but never imported (only FAISS was).
from langchain.vectorstores import FAISS, Chroma
import tempfile

# Same directory the ingestion script persisted the vectors to.
persist_directory = "./public"
# BUGFIX: setdefault instead of overwriting an already-exported key with "".
os.environ.setdefault("OPENAI_API_KEY", "")
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

question = "什么是风险管理?"

# Reload the persisted vector store from disk.
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
# Local similarity search: top 4 chunks nearest to the question.
docs = vectordb.similarity_search(question, k=4)
print(docs)

# FIX: the original re-embedded the search results into a second
# in-memory Chroma instance (extra OpenAI embedding calls for vectors we
# already have). The persisted store can serve as the retriever directly.
retriever = vectordb.as_retriever(search_kwargs={"k": 4})

# Conversational chain: retrieval + GPT-3.5, returning source documents.
chain = ConversationalRetrievalChain.from_llm(
    ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo"),
    retriever=retriever,
    return_source_documents=True,
)
chat_history = []
# Call the OpenAI API with the retrieved context to answer the question.
result = chain({"question": question, "chat_history": chat_history})
print(result)
# This completes LangChain + Chroma + OpenAI over local vector data.