初级菜鸟学Langchain实录!
本文记录利用GLM/Azure-openai检索数据库/文档生成答案的代码脚本!
这周我在学习Langchain做数据库和文档的检索增强生成。实不相瞒,我已经在工作中处于能力增长的停滞期,我得自己寻求机会去锻炼写代码的能力!这周学习Langchain框架,让我有了一点点如何快速看文档api写代码产出的经验认识。【我知道这个对于大佬来说很简单,对于我来说不是那么简单,甚至需要一点点工程直觉,后面熟练工了就越做越快。】
Part 1 没有用Langchain做table和text的RAG
这里就讲简单的retrieve的过程。
Table表格
方法1: 读入表格 markdown格式,嵌入template。
from zhipuai import ZhipuAI
import pandas as pd
import sqlalchemy

# Read the whole table from MySQL and render it as Markdown so it can be
# embedded into the prompt as plain-text context.
engine = sqlalchemy.create_engine('mysql+pymysql://{role}:{password}@localhost:{port}/{db}')
sql = '''select * from major_list'''
df = pd.read_sql(sql, engine)
tb = df.to_markdown()
# print(tb)

client = ZhipuAI(api_key=api_key)  # fill in your own API key

# NOTE(fix): rows of a Markdown table are separated by the newline escape
# "\n"; the original template said they were separated by the letter "n".
prompt_template = """请阅读下表的Markdown格式,然后根据表格回答问题。一行中的表单元格用“|”分隔,不同行用“\\n”分隔
问题:
{question}
表格:
{context}
"""
text2chatglm = prompt_template.format_map({
    'question': "设计学类有什么专业?授予学位为什么?学制多久?",
    'context': tb
})
这里举例为GLM的例子,Azure openai同理。
方法2:直接使用function call。
绕过langchain 定义数据库读取的方式。这里我只尝试了GLM的官方方法。
from zhipuai import ZhipuAI
import json
import pandas as pd
import sqlalchemy

# Load the full major_list table once up front; the tool function defined
# below filters this DataFrame per query.
engine = sqlalchemy.create_engine('mysql+pymysql://{role}:{password}@localhost:{port}/{db}')
df = pd.read_sql('select * from major_list', engine)
tb = df.to_markdown()
def get_major_name(maj_category: str = None, maj_name: str = None,
                   maj_code: str = None, degree_category: str = None,
                   years: str = None, added_years: str = None):
    """Filter the module-level ``df`` (the major_list table) by any field.

    Every parameter is optional: the model only supplies the arguments it
    extracted from the user's question, so each defaults to None and a None
    value simply skips that filter.  Returns ``{"major_list": <markdown>}``,
    the payload sent back to the model in the tool message.

    Fixes vs the original snippet:
    * assigning to ``df`` inside the function made it a local name, so the
      first read raised UnboundLocalError — filter a local reference instead;
    * the parameter was misspelled ``degree_catgory`` and could never match
      the ``degree_category`` key declared in the tool schema;
    * missing defaults crashed when the model omitted an argument.
    """
    result = df  # local reference; never rebind the global
    # keyword argument -> DataFrame column (names match the tool schema)
    filters = {
        'maj_category': maj_category,
        'maj_name': maj_name,
        'maj_code': maj_code,
        'degree_category': degree_category,
        'years': years,
        'added_years': added_years,
    }
    for column, value in filters.items():
        if value is not None:
            result = result[result[column] == value]
    return {"major_list": result.to_markdown()}
def parse_function_call(model_response, messages):
    """Execute the tool call requested by the model, if any.

    Reads the tool-call arguments out of ``model_response``, dispatches to
    the matching local function, appends the function result to ``messages``
    as a "tool" message, then calls the model again so it can phrase the
    function result in natural language for the user.

    Relies on the module-level ``client`` and ``tools`` objects.
    """
    if model_response.choices[0].message.tool_calls:
        tool_call = model_response.choices[0].message.tool_calls[0]
        args = tool_call.function.arguments
        function_result = {}
        if tool_call.function.name == "get_major_name":
            # FIX: the original called get_flight_number(), which does not
            # exist in this script; the declared tool is get_major_name.
            function_result = get_major_name(**json.loads(args))
        messages.append({
            "role": "tool",
            "content": f"{json.dumps(function_result)}",
            "tool_call_id": tool_call.id
        })
        response = client.chat.completions.create(
            model="glm-4",  # model name to call
            messages=messages,
            tools=tools,
        )
        print(response.choices[0].message)
        messages.append(response.choices[0].message.model_dump())
client = ZhipuAI(api_key)  # fill in your own API key
# Tool schema advertised to the model.  FIX: the function name must match
# both the local Python function and the name checked in
# parse_function_call(); the original declared "get_flight_number", which
# was inconsistent with the rest of the script.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_major_name",
            "description": "根据专业大类或专业名或专业代码或学位授予分类或学制或增设年份查询专业信息",
            "parameters": {
                "type": "object",
                "properties": {
                    "maj_category": {
                        "description": "专业大类",
                        "type": "string"
                    },
                    "maj_name": {
                        "description": "专业名字",
                        "type": "string"
                    },
                    "maj_code": {
                        "description": "专业代码",
                        "type": "string"
                    },
                    "degree_category": {
                        "description": "学位授予分类",
                        "type": "string"
                    },
                    "years": {
                        "description": "学制",
                        "type": "string"
                    },
                    "added_years": {
                        "description": "增设时间",
                        "type": "string"
                    }
                }
            },
        }
    }
]
# Seed the conversation: one system instruction plus the user's question.
messages = [
    {"role": "system", "content": "不要假设或猜测传入函数的参数值。如果用户的描述不明确,请要求用户提供必要信息"},
    {"role": "user", "content": "帮我查询工艺美术这个专业所有信息"},
]
# First round: the model decides whether a tool call is needed.
response = client.chat.completions.create(
    model="glm-4",
    messages=messages,
    tools=tools,
)
print(response.choices[0].message)
messages.append(response.choices[0].message.model_dump())
# Second round: execute the tool call and feed the result back.
parse_function_call(response, messages)
这里的parse_function_call是参照GLM官方文档示例实现的工具函数。我的理解是大模型从用户输入的语言提炼出函数所需的参数变量,然后进行传参完成数据库查询。
Text文字
过程朴素。
读入文字,clean--> tokenize--> vectorize 成embedding -->计算相似度 -->取前几的答案为输入
详情见https://github.com/yuanzhoulvpi2017/DocumentSearch 脚本简单易懂,不再赘述。
Part2 用Langchain做table和text的RAG
Table表格
法1:运用Agent:
from langchain.agents import tool
from langchain_community.utilities.sql_database import SQLDatabase
from langchain_community.agent_toolkits import create_sql_agent

# FIX: the original bound the database to `Db` but then passed `db`
# (Python is case-sensitive), which raised NameError.
db = SQLDatabase.from_uri('mysql+pymysql://{role}:{password}@localhost:{port}/{db}')
llm = get_glm(0.01)  # user-defined helper returning a GLM chat model — TODO confirm
# "openai-tools" agent type lets the LLM drive SQL generation via tool calls.
agent_executor = create_sql_agent(llm, db=db, agent_type="openai-tools", verbose=True)
output_res = agent_executor.invoke(
    {"input": "2023年浙江大学招收北京地区学生高考分数线是多少分?"}
)['output']
法2:运用Chain:
from langchain_experimental.sql import SQLDatabaseChain

# FIXES vs the original snippet:
#  * `Db` / `db` / `Db_chain` casing was inconsistent (NameError);
#  * the final call used full-width quotes around “prompt” (SyntaxError).
db = SQLDatabase.from_uri('mysql+pymysql://{role}:{password}@localhost:{port}/{db}')
db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)
db_chain.run("prompt")  # replace "prompt" with the actual question text
Text文字
RetrievalQA.from_chain_type
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Build the retrieval corpus: load the document, cut it into fixed-size
# chunks, embed the chunks and index them in Chroma.
documents = TextLoader("../../state_of_the_union.txt").load()
splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunks = splitter.split_documents(documents)
vector_store = Chroma.from_documents(chunks, OpenAIEmbeddings())

# "stuff" chain type: every retrieved chunk is stuffed into a single prompt.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)
query = "What did the president say about Ketanji Brown Jackson"
qa.run(query)
Part3 使用Agent把文本多种文档组合起来!
这里只示范文本和数据库表格等等,别的我觉得就是差不多类似的写法!
主要用chain。Agent套来套去也可以,就是容易眼花。
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.tools import BaseTool
# from langchain import LLMMathChain, SerpAPIWrapper
import os
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_text_splitters import CharacterTextSplitter
from pydantic.v1 import BaseModel, Field
from langchain_experimental.sql import SQLDatabaseChain
from langchain_community.retrievers import BM25Retriever
from langchain.utilities import SQLDatabase

# FIX: the original `llm = AzureChatOpenAI() or llm=zhupuai()` is a
# SyntaxError — pick one concrete chat model here.
llm = AzureChatOpenAI()  # or any other chat model, e.g. a ZhipuAI-backed one

# SQL side: a chain that translates natural language to SQL and runs it.
db = SQLDatabase.from_uri('mysql+pymysql://{role}:{password}@localhost:{port}/{db}')
db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)

# Text side: a BM25 (keyword) retriever over the PDF chunks.
loader = PyPDFLoader("./CollegesandUniversities.pdf")
pages = loader.load_and_split()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(pages)
retriever = BM25Retriever.from_documents(docs)
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

# Tool list for the agent; each Tool.func must be a *callable* that takes a
# query string.  FIX: the original passed the RetrievalQA chain object
# itself as `func`; pass its .run method instead, mirroring db_chain.run.
tools = [
    Tool(
        name="txt_search",
        func=qa_chain.run,
        description="useful for when you need to answer questions about universities."
    ),
    Tool(
        name="db_search",
        func=db_chain.run,
        description="useful for when you need to answer questions about major."
    )
]
# Initialize the agent (zero-shot ReAct: picks a tool by its description).
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
# Run the agent
res = agent({"input": "Do you know the name of the major whose major code is 020309T?"})["output"]
print("-----------------------------------")
print(res)
最后
https://gitee.com/cyz6668/langchain-simple-rag 整理好了,欢迎踩踩