在本篇文章中,我们将演示如何使用Gradient和LlamaIndex对Llama2-7b模型进行微调,以提高其在Text-to-SQL任务上的表现。本例中,我们使用sql-create-context数据集。该数据集混合了WikiSQL和Spider,每条样本包含输入查询、上下文和真实的SQL语句。
环境准备
首先,安装必要的包:
%pip install llama-index-llms-gradient
%pip install llama-index-finetuning
加载数据
我们从Hugging Face加载sql-create-context数据集,并将其保存为本地的JSON Lines文件。
from datasets import load_dataset
from pathlib import Path
import json
def load_jsonl(data_dir):
    """Load a JSON-lines file through the `datasets` JSON loader.

    Args:
        data_dir: Path to the JSON-lines file. Despite the name, the value is
            passed to `load_dataset` as `data_files`, i.e. it is treated as a
            file path rather than a directory.

    Returns:
        Whatever `load_dataset("json", ...)` returns; callers in this file
        index the result with `["train"]`.
    """
    # Normalise to forward slashes so the same path string is used everywhere.
    return load_dataset("json", data_files=Path(data_dir).as_posix())
def save_jsonl(data_dicts, out_path):
    """Write an iterable of JSON-serialisable records as JSON lines.

    Args:
        data_dicts: Iterable of records; each one is serialised with
            `json.dumps` and written on its own line.
        out_path: Destination file path. The file is truncated if it exists.
    """
    with open(out_path, "w") as sink:
        sink.writelines(json.dumps(record) + "\n" for record in data_dicts)
def load_data_sql(data_dir: str = "data_sql"):
    """Download `b-mc2/sql-create-context` and dump it as JSON lines.

    Each row is rewritten to the keys used by the rest of this file:
    `question` -> `input`, `context` -> `context`, `answer` -> `output`.

    Args:
        data_dir: Output location. Despite the name, this is opened as a
            *file* (its parent directory is created if missing).
    """
    hf_dataset = load_dataset("b-mc2/sql-create-context")
    splits = {"train": hf_dataset["train"]}

    target = Path(data_dir)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Only the "train" split is exported; every split would be written to the
    # same target file, opened in truncating "w" mode.
    for split_rows in splits.values():
        with open(target, "w") as sink:
            for row in split_rows:
                record = {
                    "input": row["question"],
                    "context": row["context"],
                    "output": row["answer"],
                }
                sink.write(json.dumps(record) + "\n")
# Download the dataset and dump it as JSON lines into the local file
# "data_sql", which the later split step reads back.
load_data_sql(data_dir="data_sql")
划分训练和验证集
将数据集划分为训练集和验证集。
from math import ceil
def get_train_val_splits(data_dir: str = "data_sql", val_ratio: float = 0.1, seed: int = 42, shuffle: bool = True):
    """Split the dumped JSON-lines data into training and validation sets.

    Args:
        data_dir: Path of the JSON-lines file to load (passed to `load_jsonl`).
        val_ratio: Fraction of rows reserved for validation; the validation
            size is rounded up with `ceil`.
        seed: Random seed forwarded to `train_test_split`.
        shuffle: Whether rows are shuffled before splitting.

    Returns:
        A `(train, validation)` tuple of dataset objects.
    """
    data = load_jsonl(data_dir)
    train_rows = data["train"]
    val_set_size = ceil(val_ratio * len(train_rows))
    train_val = train_rows.train_test_split(
        test_size=val_set_size, shuffle=shuffle, seed=seed
    )
    # `train_test_split` already applies `shuffle` and `seed`. Re-shuffling the
    # results without a seed (as was done before) made the row order
    # non-reproducible and ignored `shuffle=False`, so the splits are returned
    # as produced.
    return train_val["train"], train_val["test"]
# Split the dumped file with the default ratio and seed, then keep raw copies
# of both splits on disk as JSON lines.
raw_train_data, raw_val_data = get_train_val_splits(data_dir="data_sql")
save_jsonl(raw_train_data, "train_data_raw.jsonl")
save_jsonl(raw_val_data, "val_data_raw.jsonl")
映射数据集字典到提示
定义函数将数据集字典映射到提示格式,以便我们可以将其发送到Gradient的微调端点。
def _generate_prompt_sql(input, context, dialect="sqlite", output=""):
    """Build the text-to-SQL prompt for a single example.

    The prompt consists of a fixed instruction, followed by the SQL dialect,
    the natural-language question and the schema context, and finally a
    `### Response:` header.

    Args:
        input: Natural-language question to translate into SQL.
        context: Schema context the query must run against.
        dialect: SQL dialect name inserted into the prompt.
        output: Ground-truth SQL. When truthy, a complete training example is
            returned (response filled in and closed with `</s>`). When empty
            or `None`, an open-ended inference prompt is returned.

    Returns:
        The formatted prompt string.
    """
    system_message = (
        "You are a powerful text-to-SQL model. Your job is to answer questions "
        "about a database. You are given a question and context regarding one "
        "or more tables.\n"
        "You must output the SQL query that answers the question.\n"
    )
    # The question, schema and dialect must be part of the prompt; previously
    # this section was built but never inserted, so the model never saw them.
    user_message = (
        f"### Dialect:\n{dialect}\n\n"
        f"### Input:\n{input}\n\n"
        f"### Context:\n{context}"
    )
    prompt = (
        f"<s>### Instruction:\n{system_message}\n{user_message}"
        "\n\n### Response:\n"
    )
    if output:
        return f"{prompt}{output}</s>"
    # For inference the sequence is left open: appending `</s>` here would end
    # the sequence before the model has produced an answer.
    return prompt
def generate_prompt(data_point):
    """Turn one raw dataset row into a fine-tuning record.

    Args:
        data_point: Mapping with the keys `input`, `context` and `output`.

    Returns:
        A dict with a single key `inputs` holding the prompt built by
        `_generate_prompt_sql` for the sqlite dialect.
    """
    question = data_point["input"]
    schema = data_point["context"]
    answer = data_point["output"]
    return {
        "inputs": _generate_prompt_sql(
            question, schema, dialect="sqlite", output=answer
        )
    }
# Build one {"inputs": prompt} record per example. The closing brace must come
# before the `for` clause: with the brace after it, the expression is a dict
# comprehension inside a list literal, which collapses the whole split into a
# single record containing only the last prompt.
train_data = [
    {"inputs": d["inputs"]} for d in raw_train_data.map(generate_prompt)
]
save_jsonl(train_data, "train_data.jsonl")
val_data = [
    {"inputs": d["inputs"]} for d in raw_val_data.map(generate_prompt)
]
save_jsonl(val_data, "val_data.jsonl")
运行微调
使用Gradient的微调端点进行模型微调。
from llama_index.llms.gradient import GradientBaseModelLLM
from llama_index.finetuning import GradientFinetuneEngine
import os
# Credentials are set as environment variables before the Gradient objects
# below are created (presumably the Gradient client reads them - confirm
# against the Gradient documentation).
os.environ["GRADIENT_ACCESS_TOKEN"] = "YOUR_GRADIENT_ACCESS_TOKEN" # replace with your actual API key
os.environ["GRADIENT_WORKSPACE_ID"] = "YOUR_GRADIENT_WORKSPACE_ID" # replace with your actual workspace ID
# The same base model slug is used for the baseline LLM and the fine-tuning engine.
base_model_slug = "llama2-7b-chat"
base_llm = GradientBaseModelLLM(base_model_slug=base_model_slug, max_tokens=300)
# Fine-tune on the prompt file written earlier ("train_data.jsonl").
finetune_engine = GradientFinetuneEngine(
base_model_slug=base_model_slug,
name="text_to_sql",
data_path="train_data.jsonl",
verbose=True,
max_steps=200,
batch_size=4,
)
# Each iteration calls finetune() once; epochs controls how many times.
epochs = 1
for i in range(epochs):
    print(f"** EPOCH {i} **")
    finetune_engine.finetune()
# Retrieve the fine-tuned model from the engine for evaluation below.
ft_llm = finetune_engine.get_finetuned_model(max_tokens=300)
评估模型
我们在一些样本数据点上进行评估。
def get_text2sql_completion(llm, raw_datapoint):
    """Ask `llm` to complete the text-to-SQL prompt for one raw example.

    Args:
        llm: Object exposing a `complete(prompt)` method.
        raw_datapoint: Mapping with the keys `input` and `context`.

    Returns:
        The completion converted to `str`.
    """
    question, schema = raw_datapoint["input"], raw_datapoint["context"]
    # `output=None` requests the inference form of the prompt (no answer).
    prompt = _generate_prompt_sql(
        question, schema, dialect="sqlite", output=None
    )
    return str(llm.complete(prompt))
# Spot-check the fine-tuned model on one validation example (index 2).
test_datapoint = raw_val_data[2]
print(get_text2sql_completion(ft_llm, test_datapoint))
参考资料
错误示例及解决办法
- API Key错误:确保已经正确设置了GRADIENT_ACCESS_TOKEN和GRADIENT_WORKSPACE_ID环境变量。
- 路径错误:确保数据保存的路径正确,避免路径错误导致的数据读取失败。
如果你觉得这篇文章对你有帮助,请点赞,关注我的博客,谢谢!