参考:【官方教程】ChatGLM + LangChain 实践培训_哔哩哔哩_bilibili
基于LangChain+LLM的本地知识库问答:从企业单文档问答到批量文档问答_langchain 本地知识库-CSDN博客
依据单个文档进行知识问答的大致流程:
加载本地文本 ----> 文本拆分 ----> 匹配文本(字符匹配、语义检索) ----> 构建prompt ----> LLM生成回答
基于本地知识库问答的实现原理:
1-6步骤:通过embedding模型构建本地向量数据库
7-10步骤:将用户提问内容进行embedding 然后与向量数据库进行匹配
11-13步骤:构建prompt模板
14-15步骤:大模型输出
实例:
from langchain.document_loaders import UnstructuredFileLoader # 加载文档的加载器
from langchain.text_splitter import CharacterTextSplitter # 分词器
# from langchain.embeddings.openai import OpenAIEmbeddings # embedding
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS # 向量数据库
import time
# --- Load the source document and build the vector index ---
# Load the raw text file into LangChain Document objects.
filepath = 'test.txt'
fileloader = UnstructuredFileLoader(filepath)
docs = fileloader.load()
# Split the text into overlapping chunks so each piece fits the embedding
# model comfortably and retrieval stays fine-grained; the 100-char overlap
# keeps sentences that straddle a boundary searchable from both chunks.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=100)
docs = text_splitter.split_documents(docs)
# Debug output. BUG FIX: the label says "length" (长度) but the original
# printed type(docs); report the actual number of chunks instead.
print('docs的长度:', len(docs))
# NOTE: this prints the *type* of the first chunk (a Document), not its text.
print('docs的第一个数据:', type(docs[0]))
# Build the vector store: embed every chunk with a local BGE model and
# index the vectors with FAISS for similarity search.
# Alternative: embeddings = OpenAIEmbeddings(openai_api_key='你的OpenAI的key')
embeddings = HuggingFaceBgeEmbeddings(model_name='bge-large-zh-v1.5')
vector_store = FAISS.from_documents(docs, embeddings)
# --- Load the LLM and its tokenizer ---
# BUG FIX: use AutoModelForCausalLM rather than AutoModel. AutoModel loads
# the bare transformer without the language-modeling head, so .generate()
# on an instruct model like Llama3 would not produce proper next-token
# predictions.
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained('Llama3-Chinese-8B-Instruct', trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained('Llama3-Chinese-8B-Instruct', trust_remote_code=True)
model.eval()  # inference mode: disables dropout / training-only behavior
# --- Retrieve the chunks most relevant to the question ---
query = "天空是什么颜色的?"
# similarity_search embeds the query and returns the k most similar chunks
# (k defaults to 4 in LangChain's FAISS wrapper — confirm for your version;
# the original comment claimed 3).
related_docs = vector_store.similarity_search(query)
print('related_docs:', related_docs)
# BUG FIX: the original joined *all* chunks (`for docs in docs`, which also
# shadowed `docs`), so the retrieval step had no effect on the prompt.
# Build the context from the retrieved chunks only.
context_string = ' '.join(doc.page_content for doc in related_docs)
# --- Build the prompt and generate the answer ---
# Prompt template: "Known information: {context} Answer based on it: {query}".
prompt = tokenizer.encode(f"已知信息:{context_string}根据已知信息回答问题:\n{query}", return_tensors='pt')
s_time = time.time()
# BUG FIX: max_length counts *prompt* tokens too, so a long retrieved
# context could leave no room for the answer. max_new_tokens bounds only
# the generated continuation.
generated_text = model.generate(prompt, max_new_tokens=500, pad_token_id=tokenizer.eos_token_id)
print('comsuming time:', time.time() - s_time)
# NOTE: decoding the full sequence includes the prompt text in `response`;
# slice generated_text[0][prompt.shape[1]:] if only the answer is wanted.
response = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print("-" * 100)
print(response)
print('长度是:', len(response))
print("-" * 100)