用于树组织检索的递归抽象处理
RAPTOR: RECURSIVE ABSTRACTIVE PROCESSING FOR TREE-ORGANIZED RETRIEVAL
gitee地址: lengyanju8/raptor
思想,二叉树->多叉树
用来增强RAG的检索效果
两步骤,构造树,搜索树
from raptor import RetrievalAugmentation
先看这个py
import logging
import pickle
from .cluster_tree_builder import ClusterTreeBuilder, ClusterTreeConfig
from .EmbeddingModels import BaseEmbeddingModel
from .QAModels import BaseQAModel, GPT3TurboQAModel
from .SummarizationModels import BaseSummarizationModel
from .tree_builder import TreeBuilder, TreeBuilderConfig
from .tree_retriever import TreeRetriever, TreeRetrieverConfig
from .tree_structures import Node, Tree
# Define a dictionary to map supported tree builders to their respective configs
supported_tree_builders = {"cluster": (ClusterTreeBuilder, ClusterTreeConfig)}
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
# Default configuration for the whole retrieval-augmentation pipeline.
class RetrievalAugmentationConfig:
    """Bundles and validates the configuration for the whole RAPTOR pipeline:
    the tree-builder config, the tree-retriever config and the QA model.
    """

    def __init__(
        self,
        tree_builder_config=None,
        tree_retriever_config=None,  # Change from default instantiation
        qa_model=None,
        embedding_model=None,
        summarization_model=None,
        tree_builder_type="cluster",
        # New parameters for TreeRetrieverConfig and TreeBuilderConfig
        # TreeRetrieverConfig arguments
        tr_tokenizer=None,
        tr_threshold=0.5,
        tr_top_k=5,
        tr_selection_mode="top_k",
        tr_context_embedding_model="OpenAI",
        tr_embedding_model=None,
        tr_num_layers=None,
        tr_start_layer=None,
        # TreeBuilderConfig arguments
        tb_tokenizer=None,
        tb_max_tokens=100,
        tb_num_layers=5,
        tb_threshold=0.5,
        tb_top_k=5,
        tb_selection_mode="top_k",
        tb_summarization_length=100,
        tb_summarization_model=None,
        tb_embedding_models=None,
        tb_cluster_embedding_model="OpenAI",
    ):
        # Validate tree_builder_type: must be one of the registered builders.
        if tree_builder_type not in supported_tree_builders:
            raise ValueError(
                f"tree_builder_type must be one of {list(supported_tree_builders.keys())}"
            )
        # Validate qa_model: when provided it must be a BaseQAModel instance.
        if qa_model is not None and not isinstance(qa_model, BaseQAModel):
            raise ValueError("qa_model must be an instance of BaseQAModel")
        # Validate embedding_model: when provided it must be a BaseEmbeddingModel.
        if embedding_model is not None and not isinstance(
            embedding_model, BaseEmbeddingModel
        ):
            raise ValueError(
                "embedding_model must be an instance of BaseEmbeddingModel"
            )
        # A valid embedding_model was supplied: it becomes the single model used
        # by both the builder ("EMB") and the retriever.
        elif embedding_model is not None:
            # embedding_model and tb_embedding_models are mutually exclusive.
            if tb_embedding_models is not None:
                raise ValueError(
                    "Only one of 'tb_embedding_models' or 'embedding_model' should be provided, not both."
                )
            tb_embedding_models = {"EMB": embedding_model}
            tr_embedding_model = embedding_model
            tb_cluster_embedding_model = "EMB"
            tr_context_embedding_model = "EMB"
        # Validate summarization_model.
        if summarization_model is not None and not isinstance(
            summarization_model, BaseSummarizationModel
        ):
            raise ValueError(
                "summarization_model must be an instance of BaseSummarizationModel"
            )
        # summarization_model and tb_summarization_model are mutually exclusive;
        # a valid summarization_model is forwarded to the tree builder.
        elif summarization_model is not None:
            if tb_summarization_model is not None:
                raise ValueError(
                    "Only one of 'tb_summarization_model' or 'summarization_model' should be provided, not both."
                )
            tb_summarization_model = summarization_model
        # Set TreeBuilderConfig
        tree_builder_class, tree_builder_config_class = supported_tree_builders[
            tree_builder_type
        ]
        # No explicit builder config: assemble a default one from the tb_* arguments.
        if tree_builder_config is None:
            tree_builder_config = tree_builder_config_class(
                tokenizer=tb_tokenizer,
                max_tokens=tb_max_tokens,
                num_layers=tb_num_layers,
                threshold=tb_threshold,
                top_k=tb_top_k,
                selection_mode=tb_selection_mode,
                summarization_length=tb_summarization_length,
                summarization_model=tb_summarization_model,
                embedding_models=tb_embedding_models,
                cluster_embedding_model=tb_cluster_embedding_model,
            )
        # An explicitly passed builder config must be an instance of the expected class.
        elif not isinstance(tree_builder_config, tree_builder_config_class):
            raise ValueError(
                f"tree_builder_config must be a direct instance of {tree_builder_config_class} for tree_builder_type '{tree_builder_type}'"
            )
        # Set TreeRetrieverConfig
        # No explicit retriever config: assemble a default one from the tr_* arguments.
        if tree_retriever_config is None:
            tree_retriever_config = TreeRetrieverConfig(
                tokenizer=tr_tokenizer,
                threshold=tr_threshold,
                top_k=tr_top_k,
                selection_mode=tr_selection_mode,
                context_embedding_model=tr_context_embedding_model,
                embedding_model=tr_embedding_model,
                num_layers=tr_num_layers,
                start_layer=tr_start_layer,
            )
        elif not isinstance(tree_retriever_config, TreeRetrieverConfig):
            raise ValueError(
                "tree_retriever_config must be an instance of TreeRetrieverConfig"
            )
        # Assign the created configurations to the instance.
        self.tree_builder_config = tree_builder_config
        self.tree_retriever_config = tree_retriever_config
        # QA model: the one passed in, or GPT3TurboQAModel() by default.
        self.qa_model = qa_model or GPT3TurboQAModel()
        self.tree_builder_type = tree_builder_type

    # Render the full configuration (builder config, retriever config, QA model,
    # builder type) as a human-readable string.
    def log_config(self):
        config_summary = """
        RetrievalAugmentationConfig:
            {tree_builder_config}
            {tree_retriever_config}
            QA Model: {qa_model}
            Tree Builder Type: {tree_builder_type}
        """.format(
            tree_builder_config=self.tree_builder_config.log_config(),
            tree_retriever_config=self.tree_retriever_config.log_config(),
            qa_model=self.qa_model,
            tree_builder_type=self.tree_builder_type,
        )
        return config_summary
# Retrieval-augmentation pipeline: tree building, retrieval and QA.
class RetrievalAugmentation:
    """
    A Retrieval Augmentation class that combines the TreeBuilder and TreeRetriever classes.
    Enables adding documents to the tree, retrieving information, and answering questions.
    """

    def __init__(self, config=None, tree=None):
        """
        Initializes a RetrievalAugmentation instance with the specified configuration.
        Args:
            config (RetrievalAugmentationConfig): The configuration for the RetrievalAugmentation instance.
            tree: The tree instance or the path to a pickled tree file.
        """
        # Fall back to the default configuration when none is given.
        if config is None:
            config = RetrievalAugmentationConfig()
        if not isinstance(config, RetrievalAugmentationConfig):
            raise ValueError(
                "config must be an instance of RetrievalAugmentationConfig"
            )
        # Check if tree is a string (indicating a path to a pickled tree).
        if isinstance(tree, str):
            try:
                # NOTE(review): pickle.load on an untrusted file can execute
                # arbitrary code - only load trees from trusted sources.
                with open(tree, "rb") as file:
                    self.tree = pickle.load(file)
                # The unpickled object must actually be a Tree.
                if not isinstance(self.tree, Tree):
                    raise ValueError("The loaded object is not an instance of Tree")
            except Exception as e:
                raise ValueError(f"Failed to load tree from {tree}: {e}")
        # Otherwise accept an in-memory Tree, or None (no tree yet).
        elif isinstance(tree, Tree) or tree is None:
            self.tree = tree
        else:
            raise ValueError(
                "tree must be an instance of Tree, a path to a pickled Tree, or None"
            )
        tree_builder_class = supported_tree_builders[config.tree_builder_type][0]
        self.tree_builder = tree_builder_class(config.tree_builder_config)
        self.tree_retriever_config = config.tree_retriever_config
        self.qa_model = config.qa_model
        # A retriever can only be created once a tree exists.
        if self.tree is not None:
            self.retriever = TreeRetriever(self.tree_retriever_config, self.tree)
        else:
            self.retriever = None
        logging.info(
            f"Successfully initialized RetrievalAugmentation with Config {config.log_config()}"
        )

    def add_documents(self, docs):
        """
        Adds documents to the tree and creates a TreeRetriever instance.
        Args:
            docs (str): The input text to add to the tree.
        """
        # An existing tree would be overwritten; ask the user to confirm first.
        if self.tree is not None:
            user_input = input(
                "Warning: Overwriting existing tree. Did you mean to call 'add_to_existing' instead? (y/n): "
            )
            if user_input.lower() == "y":
                # NOTE(review): append mode is not implemented - answering 'y'
                # aborts without modifying the existing tree.
                # self.add_to_existing(docs)
                return
        # Rebuild the tree from scratch and refresh the retriever.
        self.tree = self.tree_builder.build_from_text(text=docs)
        self.retriever = TreeRetriever(self.tree_retriever_config, self.tree)

    def retrieve(
        self,
        question,
        start_layer: int = None,
        num_layers: int = None,
        top_k: int = 10,
        max_tokens: int = 3500,  # maximum number of context tokens to gather
        collapse_tree: bool = True,  # search the flattened ("collapsed") tree
        return_layer_information: bool = True,  # also return per-node layer info
    ):
        """
        Retrieves the context relevant to a question using the TreeRetriever instance.
        Args:
            question (str): The question to answer.
            start_layer (int): The layer to start from. Defaults to the retriever's start_layer.
            num_layers (int): The number of layers to traverse. Defaults to the retriever's num_layers.
            top_k (int): Maximum number of nodes to select. Defaults to 10.
            max_tokens (int): The maximum number of tokens. Defaults to 3500.
            collapse_tree (bool): Whether to search all nodes as a flat pool. Defaults to True.
            return_layer_information (bool): Whether to also return layer info. Defaults to True.
        Returns:
            str: The context from which the answer can be found (plus layer
            information when return_layer_information is True).
        Raises:
            ValueError: If the TreeRetriever instance has not been initialized.
        """
        # A retriever only exists after a tree has been built or loaded.
        if self.retriever is None:
            raise ValueError(
                "The TreeRetriever instance has not been initialized. Call 'add_documents' first."
            )
        return self.retriever.retrieve(
            question,
            start_layer,
            num_layers,
            top_k,
            max_tokens,
            collapse_tree,
            return_layer_information,
        )

    def answer_question(
        self,
        question,
        top_k: int = 10,
        start_layer: int = None,
        num_layers: int = None,
        max_tokens: int = 3500,
        collapse_tree: bool = True,
        return_layer_information: bool = False,
    ):
        """
        Retrieves information and answers a question using the TreeRetriever instance.
        Args:
            question (str): The question to answer.
            start_layer (int): The layer to start from. Defaults to the retriever's start_layer.
            num_layers (int): The number of layers to traverse. Defaults to the retriever's num_layers.
            max_tokens (int): The maximum number of tokens. Defaults to 3500.
            return_layer_information (bool): Whether to also return layer info. Defaults to False.
        Returns:
            str: The answer to the question (plus layer information when requested).
        Raises:
            ValueError: If the TreeRetriever instance has not been initialized.
        """
        # Always retrieve with layer information; only surface it if requested.
        context, layer_information = self.retrieve(
            question, start_layer, num_layers, top_k, max_tokens, collapse_tree, True
        )
        answer = self.qa_model.answer_question(context, question)
        if return_layer_information:
            return answer, layer_information
        return answer

    def save(self, path):
        """Pickles the current tree to `path`; raises ValueError if no tree exists."""
        if self.tree is None:
            raise ValueError("There is no tree to save.")
        with open(path, "wb") as file:
            pickle.dump(self.tree, file)
        logging.info(f"Tree successfully saved to {path}")
构造树
import copy
import logging
import os
from abc import abstractclassmethod, abstractmethod
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Dict, List, Optional, Set, Tuple
import openai
import tiktoken
from tenacity import retry, stop_after_attempt, wait_random_exponential
from .EmbeddingModels import BaseEmbeddingModel, OpenAIEmbeddingModel
from .SummarizationModels import (BaseSummarizationModel,
GPT3TurboSummarizationModel)
from .tree_structures import Node, Tree
from .utils import (distances_from_embeddings, get_children, get_embeddings,
get_node_list, get_text,
indices_of_nearest_neighbors_from_distances, split_text)
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
# Configuration object for building the tree.
class TreeBuilderConfig:
    """Validated configuration (with defaults) for TreeBuilder."""

    def __init__(
        self,
        tokenizer=None,
        max_tokens=None,
        num_layers=None,
        threshold=None,
        top_k=None,
        selection_mode=None,
        summarization_length=None,
        summarization_model=None,
        embedding_models=None,
        cluster_embedding_model=None,
    ):
        # Default tokenizer: tiktoken's cl100k_base encoding.
        if tokenizer is None:
            tokenizer = tiktoken.get_encoding("cl100k_base")
        self.tokenizer = tokenizer
        # max_tokens defaults to 100 tokens per chunk.
        if max_tokens is None:
            max_tokens = 100
        # max_tokens must be a positive integer.
        if not isinstance(max_tokens, int) or max_tokens < 1:
            raise ValueError("max_tokens must be an integer and at least 1")
        self.max_tokens = max_tokens
        # num_layers defaults to 5 - a deeper tree rarely helps.
        if num_layers is None:
            num_layers = 5
        # num_layers must be a positive integer.
        if not isinstance(num_layers, int) or num_layers < 1:
            raise ValueError("num_layers must be an integer and at least 1")
        self.num_layers = num_layers
        # threshold: default 0.5, must lie in [0, 1].
        if threshold is None:
            threshold = 0.5
        if not isinstance(threshold, (int, float)) or not (0 <= threshold <= 1):
            raise ValueError("threshold must be a number between 0 and 1")
        self.threshold = threshold
        # top_k: default 5, positive integer.
        if top_k is None:
            top_k = 5
        if not isinstance(top_k, int) or top_k < 1:
            raise ValueError("top_k must be an integer and at least 1")
        self.top_k = top_k
        # selection_mode: how neighbors are picked ('top_k' or 'threshold').
        if selection_mode is None:
            selection_mode = "top_k"
        if selection_mode not in ["top_k", "threshold"]:
            raise ValueError("selection_mode must be either 'top_k' or 'threshold'")
        self.selection_mode = selection_mode
        # Target length (in tokens) of layer summaries.
        if summarization_length is None:
            summarization_length = 100
        self.summarization_length = summarization_length
        # Summarization model: defaults to GPT3TurboSummarizationModel.
        if summarization_model is None:
            summarization_model = GPT3TurboSummarizationModel()
        if not isinstance(summarization_model, BaseSummarizationModel):
            raise ValueError(
                "summarization_model must be an instance of BaseSummarizationModel"
            )
        self.summarization_model = summarization_model
        # Embedding models: mapping of name -> BaseEmbeddingModel instance.
        if embedding_models is None:
            embedding_models = {"OpenAI": OpenAIEmbeddingModel()}
        if not isinstance(embedding_models, dict):
            raise ValueError(
                "embedding_models must be a dictionary of model_name: instance pairs"
            )
        # Every registered embedding model must implement the base interface.
        for model in embedding_models.values():
            if not isinstance(model, BaseEmbeddingModel):
                raise ValueError(
                    "All embedding models must be an instance of BaseEmbeddingModel"
                )
        self.embedding_models = embedding_models
        # The model used for clustering must be one of the registered models.
        if cluster_embedding_model is None:
            cluster_embedding_model = "OpenAI"
        if cluster_embedding_model not in self.embedding_models:
            raise ValueError(
                "cluster_embedding_model must be a key in the embedding_models dictionary"
            )
        self.cluster_embedding_model = cluster_embedding_model

    def log_config(self):
        """Return a human-readable dump of this configuration."""
        config_log = """
        TreeBuilderConfig:
            Tokenizer: {tokenizer}
            Max Tokens: {max_tokens}
            Num Layers: {num_layers}
            Threshold: {threshold}
            Top K: {top_k}
            Selection Mode: {selection_mode}
            Summarization Length: {summarization_length}
            Summarization Model: {summarization_model}
            Embedding Models: {embedding_models}
            Cluster Embedding Model: {cluster_embedding_model}
        """.format(
            tokenizer=self.tokenizer,
            max_tokens=self.max_tokens,
            num_layers=self.num_layers,
            threshold=self.threshold,
            top_k=self.top_k,
            selection_mode=self.selection_mode,
            summarization_length=self.summarization_length,
            summarization_model=self.summarization_model,
            embedding_models=self.embedding_models,
            cluster_embedding_model=self.cluster_embedding_model,
        )
        return config_log
class TreeBuilder:
    """
    The TreeBuilder class is responsible for building a hierarchical text abstraction
    structure, known as a "tree," using summarization models and
    embedding models.
    """

    def __init__(self, config) -> None:
        """Initializes the tokenizer, maximum tokens, number of layers, top-k value, threshold, and selection mode."""
        self.tokenizer = config.tokenizer
        self.max_tokens = config.max_tokens
        self.num_layers = config.num_layers
        self.top_k = config.top_k
        self.threshold = config.threshold
        self.selection_mode = config.selection_mode
        self.summarization_length = config.summarization_length
        self.summarization_model = config.summarization_model
        self.embedding_models = config.embedding_models
        self.cluster_embedding_model = config.cluster_embedding_model
        logging.info(
            f"Successfully initialized TreeBuilder with Config {config.log_config()}"
        )

    def create_node(
        self, index: int, text: str, children_indices: Optional[Set[int]] = None
    ) -> Tuple[int, Node]:
        """Creates a new node with the given index, text, and (optionally) children indices.
        Args:
            index (int): The index of the new node.
            text (str): The text associated with the new node.
            children_indices (Optional[Set[int]]): A set of indices representing the children of the new node.
                If not provided, an empty set will be used.
        Returns:
            Tuple[int, Node]: A tuple containing the index and the newly created node.
        """
        if children_indices is None:
            children_indices = set()
        # Embed the text once with every configured embedding model.
        embeddings = {
            model_name: model.create_embedding(text)
            for model_name, model in self.embedding_models.items()
        }
        # See tree_structures.py for the Node definition.
        return (index, Node(text, index, children_indices, embeddings))

    def create_embedding(self, text) -> List[float]:
        """
        Generates embeddings for the given text using the clustering embedding model.
        Args:
            text (str): The text for which to generate embeddings.
        Returns:
            List[float]: The generated embeddings.
        """
        return self.embedding_models[self.cluster_embedding_model].create_embedding(
            text
        )

    def summarize(self, context, max_tokens=150) -> str:
        """
        Generates a summary of the input context using the specified summarization model.
        Args:
            context (str): The context to summarize.
            max_tokens (int, optional): The maximum number of tokens in the generated summary. Defaults to 150.
        Returns:
            str: The generated summary.
        """
        return self.summarization_model.summarize(context, max_tokens)

    def get_relevant_nodes(self, current_node, list_nodes) -> List[Node]:
        """
        Retrieves the top-k most relevant nodes to the current node from the list of nodes
        based on cosine distance in the embedding space.
        Args:
            current_node (Node): The current node.
            list_nodes (List[Node]): The list of nodes.
        Returns:
            List[Node]: The top-k most relevant nodes.
        """
        # Embeddings of every candidate node under the clustering model.
        embeddings = get_embeddings(list_nodes, self.cluster_embedding_model)
        distances = distances_from_embeddings(
            current_node.embeddings[self.cluster_embedding_model], embeddings
        )
        # Candidate indices ordered by proximity (argsort over distances).
        indices = indices_of_nearest_neighbors_from_distances(distances)
        if self.selection_mode == "threshold":
            # Keep only candidates whose distance exceeds the threshold.
            best_indices = [
                index for index in indices if distances[index] > self.threshold
            ]
        elif self.selection_mode == "top_k":
            # Keep the top_k nearest candidates.
            best_indices = indices[: self.top_k]
        nodes_to_add = [list_nodes[idx] for idx in best_indices]
        return nodes_to_add

    def multithreaded_create_leaf_nodes(self, chunks: List[str]) -> Dict[int, Node]:
        """Creates leaf nodes using multithreading from the given list of text chunks.
        Args:
            chunks (List[str]): A list of text chunks to be turned into leaf nodes.
        Returns:
            Dict[int, Node]: A dictionary mapping node indices to the corresponding leaf nodes.
        """
        with ThreadPoolExecutor() as executor:
            # Submit one create_node task per chunk; keep the (index, text) pair
            # as the mapped value for reference while the futures complete.
            future_nodes = {
                executor.submit(self.create_node, index, text): (index, text)
                for index, text in enumerate(chunks)
            }
            leaf_nodes = {}
            # Collect results as they finish; each future yields (index, node).
            for future in as_completed(future_nodes):
                index, node = future.result()
                leaf_nodes[index] = node
        return leaf_nodes

    def build_from_text(self, text: str, use_multithreading: bool = True) -> Tree:
        """Builds a golden tree from the input text, optionally using multithreading.
        Args:
            text (str): The input text.
            use_multithreading (bool, optional): Whether to use multithreading when creating leaf nodes.
                Default: True.
        Returns:
            Tree: The golden tree structure.
        """
        chunks = split_text(text, self.tokenizer, self.max_tokens)
        logging.info("Creating Leaf Nodes")
        if use_multithreading:
            leaf_nodes = self.multithreaded_create_leaf_nodes(chunks)
        else:
            leaf_nodes = {}
            # Loop variable renamed from `text` so it no longer shadows the parameter.
            for index, chunk in enumerate(chunks):
                # create_node returns (index, node); the index is already known here.
                _, node = self.create_node(index, chunk)
                leaf_nodes[index] = node
        # Layer 0 holds the leaves.
        layer_to_nodes = {0: list(leaf_nodes.values())}
        logging.info(f"Created {len(leaf_nodes)} Leaf Embeddings")
        logging.info("Building All Nodes")
        all_nodes = copy.deepcopy(leaf_nodes)
        # Delegate layer-by-layer construction to the concrete subclass.
        root_nodes = self.construct_tree(all_nodes, all_nodes, layer_to_nodes)
        tree = Tree(all_nodes, root_nodes, leaf_nodes, self.num_layers, layer_to_nodes)
        return tree

    # Fixed: was @abstractclassmethod (deprecated since Python 3.3) on what is an
    # instance method - @abstractmethod is the correct marker. Subclasses
    # (e.g. ClusterTreeBuilder) override this with a regular instance method.
    @abstractmethod
    def construct_tree(
        self,
        current_level_nodes: Dict[int, Node],
        all_tree_nodes: Dict[int, Node],
        layer_to_nodes: Dict[int, List[Node]],
        use_multithreading: bool = True,
    ) -> Dict[int, Node]:
        """
        Constructs the hierarchical tree structure layer by layer by iteratively summarizing groups
        of relevant nodes and updating the current_level_nodes and all_tree_nodes dictionaries at each step.
        Args:
            current_level_nodes (Dict[int, Node]): The current set of nodes.
            all_tree_nodes (Dict[int, Node]): The dictionary of all nodes.
            layer_to_nodes (Dict[int, List[Node]]): Mapping of layer number to its nodes.
            use_multithreading (bool): Whether to use multithreading to speed up the process.
        Returns:
            Dict[int, Node]: The final set of root nodes.
        """
        pass
        # Reference implementation kept for subclass authors:
        # logging.info("Using Transformer-like TreeBuilder")
        # def process_node(idx, current_level_nodes, new_level_nodes, all_tree_nodes, next_node_index, lock):
        #     relevant_nodes_chunk = self.get_relevant_nodes(
        #         current_level_nodes[idx], current_level_nodes
        #     )
        #     node_texts = get_text(relevant_nodes_chunk)
        #     summarized_text = self.summarize(
        #         context=node_texts,
        #         max_tokens=self.summarization_length,
        #     )
        #     logging.info(
        #         f"Node Texts Length: {len(self.tokenizer.encode(node_texts))}, Summarized Text Length: {len(self.tokenizer.encode(summarized_text))}"
        #     )
        #     next_node_index, new_parent_node = self.create_node(
        #         next_node_index,
        #         summarized_text,
        #         {node.index for node in relevant_nodes_chunk}
        #     )
        #     with lock:
        #         new_level_nodes[next_node_index] = new_parent_node
        # for layer in range(self.num_layers):
        #     logging.info(f"Constructing Layer {layer}: ")
        #     node_list_current_layer = get_node_list(current_level_nodes)
        #     next_node_index = len(all_tree_nodes)
        #     new_level_nodes = {}
        #     lock = Lock()
        #     if use_multithreading:
        #         with ThreadPoolExecutor() as executor:
        #             for idx in range(0, len(node_list_current_layer)):
        #                 executor.submit(process_node, idx, node_list_current_layer, new_level_nodes, all_tree_nodes, next_node_index, lock)
        #                 next_node_index += 1
        #             executor.shutdown(wait=True)
        #     else:
        #         for idx in range(0, len(node_list_current_layer)):
        #             process_node(idx, node_list_current_layer, new_level_nodes, all_tree_nodes, next_node_index, lock)
        #     layer_to_nodes[layer + 1] = list(new_level_nodes.values())
        #     current_level_nodes = new_level_nodes
        #     all_tree_nodes.update(new_level_nodes)
        # return new_level_nodes
检索树
import logging
import os
from typing import Dict, List, Set
import tiktoken
from tenacity import retry, stop_after_attempt, wait_random_exponential
from .EmbeddingModels import BaseEmbeddingModel, OpenAIEmbeddingModel
from .Retrievers import BaseRetriever
from .tree_structures import Node, Tree
from .utils import (distances_from_embeddings, get_children, get_embeddings,
get_node_list, get_text,
indices_of_nearest_neighbors_from_distances,
reverse_mapping)
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
# Configuration object for the tree retriever.
class TreeRetrieverConfig:
    """Validated configuration (with defaults) for TreeRetriever.

    Compared to TreeBuilderConfig there is no max_tokens / summarization_* /
    cluster_embedding_model; instead it adds context_embedding_model,
    num_layers and start_layer.
    """

    def __init__(
        self,
        tokenizer=None,
        threshold=None,
        top_k=None,
        selection_mode=None,
        context_embedding_model=None,
        embedding_model=None,
        num_layers=None,
        start_layer=None,
    ):
        # Default tokenizer matches the one used to build the tree.
        if tokenizer is None:
            tokenizer = tiktoken.get_encoding("cl100k_base")
        self.tokenizer = tokenizer
        # threshold: default 0.5, must lie in [0, 1].
        if threshold is None:
            threshold = 0.5
        # Fixed: accept ints as well as floats (e.g. threshold=1 or 0),
        # consistent with TreeBuilderConfig's validation.
        if not isinstance(threshold, (int, float)) or not (0 <= threshold <= 1):
            raise ValueError("threshold must be a number between 0 and 1")
        self.threshold = threshold
        # top_k: default 5, positive integer.
        if top_k is None:
            top_k = 5
        if not isinstance(top_k, int) or top_k < 1:
            raise ValueError("top_k must be an integer and at least 1")
        self.top_k = top_k
        # selection_mode: how candidate nodes are picked.
        if selection_mode is None:
            selection_mode = "top_k"
        if not isinstance(selection_mode, str) or selection_mode not in [
            "top_k",
            "threshold",
        ]:
            raise ValueError(
                "selection_mode must be a string and either 'top_k' or 'threshold'"
            )
        self.selection_mode = selection_mode
        # Name of the embedding stored on each node to compare against.
        if context_embedding_model is None:
            context_embedding_model = "OpenAI"
        if not isinstance(context_embedding_model, str):
            raise ValueError("context_embedding_model must be a string")
        self.context_embedding_model = context_embedding_model
        # Model used to embed incoming queries.
        if embedding_model is None:
            embedding_model = OpenAIEmbeddingModel()
        if not isinstance(embedding_model, BaseEmbeddingModel):
            raise ValueError(
                "embedding_model must be an instance of BaseEmbeddingModel"
            )
        self.embedding_model = embedding_model
        # num_layers / start_layer stay None when not supplied; TreeRetriever
        # derives defaults from the tree itself in that case.
        if num_layers is not None:
            if not isinstance(num_layers, int) or num_layers < 0:
                raise ValueError("num_layers must be an integer and at least 0")
        self.num_layers = num_layers
        if start_layer is not None:
            if not isinstance(start_layer, int) or start_layer < 0:
                raise ValueError("start_layer must be an integer and at least 0")
        self.start_layer = start_layer

    def log_config(self):
        """Return a human-readable dump of this configuration."""
        config_log = """
        TreeRetrieverConfig:
            Tokenizer: {tokenizer}
            Threshold: {threshold}
            Top K: {top_k}
            Selection Mode: {selection_mode}
            Context Embedding Model: {context_embedding_model}
            Embedding Model: {embedding_model}
            Num Layers: {num_layers}
            Start Layer: {start_layer}
        """.format(
            tokenizer=self.tokenizer,
            threshold=self.threshold,
            top_k=self.top_k,
            selection_mode=self.selection_mode,
            context_embedding_model=self.context_embedding_model,
            embedding_model=self.embedding_model,
            num_layers=self.num_layers,
            start_layer=self.start_layer,
        )
        return config_log
# Tree retriever.
class TreeRetriever(BaseRetriever):
    """Answers queries against a built Tree, either over the collapsed
    (flattened) tree or by walking layer by layer from start_layer."""

    def __init__(self, config, tree) -> None:
        if not isinstance(tree, Tree):
            raise ValueError("tree must be an instance of Tree")
        # num_layers may cover every layer plus the leaf layer, hence the +1.
        if config.num_layers is not None and config.num_layers > tree.num_layers + 1:
            raise ValueError(
                "num_layers in config must be less than or equal to tree.num_layers + 1"
            )
        # The starting layer must exist in the tree.
        if config.start_layer is not None and config.start_layer > tree.num_layers:
            raise ValueError(
                "start_layer in config must be less than or equal to tree.num_layers"
            )
        self.tree = tree
        # Defaults: traverse every layer, starting from the topmost layer.
        self.num_layers = (
            config.num_layers if config.num_layers is not None else tree.num_layers + 1
        )
        self.start_layer = (
            config.start_layer if config.start_layer is not None else tree.num_layers
        )
        if self.num_layers > self.start_layer + 1:
            raise ValueError("num_layers must be less than or equal to start_layer + 1")
        self.tokenizer = config.tokenizer
        self.top_k = config.top_k
        self.threshold = config.threshold
        self.selection_mode = config.selection_mode
        self.embedding_model = config.embedding_model
        self.context_embedding_model = config.context_embedding_model
        # node index -> layer number, used when reporting layer information.
        self.tree_node_index_to_layer = reverse_mapping(self.tree.layer_to_nodes)
        logging.info(
            f"Successfully initialized TreeRetriever with Config {config.log_config()}"
        )

    def create_embedding(self, text: str) -> List[float]:
        """
        Generates embeddings for the given text using the specified embedding model.
        Args:
            text (str): The text for which to generate embeddings.
        Returns:
            List[float]: The generated embeddings.
        """
        return self.embedding_model.create_embedding(text)

    def retrieve_information_collapse_tree(
        self, query: str, top_k: int, max_tokens: int
    ) -> "Tuple[List[Node], str]":
        """
        Retrieves the most relevant information from the tree based on the query,
        treating all nodes as one flat ("collapsed") pool.
        Args:
            query (str): The query text.
            top_k (int): Maximum number of nodes to select.
            max_tokens (int): The maximum number of context tokens.
        Returns:
            Tuple[List[Node], str]: The selected nodes and the context built from them.
        """
        # Embed the query once.
        query_embedding = self.create_embedding(query)
        selected_nodes = []
        # All tree nodes, ordered by node index.
        node_list = get_node_list(self.tree.all_nodes)
        # Per-node embeddings and their distances to the query.
        embeddings = get_embeddings(node_list, self.context_embedding_model)
        distances = distances_from_embeddings(query_embedding, embeddings)
        # Candidate indices ordered by proximity to the query.
        indices = indices_of_nearest_neighbors_from_distances(distances)
        total_tokens = 0
        # Greedily take the nearest nodes until top_k or the token budget is hit.
        for idx in indices[:top_k]:
            node = node_list[idx]
            node_tokens = len(self.tokenizer.encode(node.text))
            if total_tokens + node_tokens > max_tokens:
                break
            selected_nodes.append(node)
            total_tokens += node_tokens
        # Concatenate the selected nodes' text into the context.
        context = get_text(selected_nodes)
        return selected_nodes, context

    def retrieve_information(
        self, current_nodes: List[Node], query: str, num_layers: int
    ) -> "Tuple[List[Node], str]":
        """
        Retrieves the most relevant information from the tree based on the query,
        walking down `num_layers` layers starting from `current_nodes`.
        Args:
            current_nodes (List[Node]): A List of the current nodes.
            query (str): The query text.
            num_layers (int): The number of layers to traverse.
        Returns:
            Tuple[List[Node], str]: The selected nodes and the context built from them.
        """
        query_embedding = self.create_embedding(query)
        selected_nodes = []
        node_list = current_nodes
        for layer in range(num_layers):
            # Embeddings and distances of this layer's candidates.
            embeddings = get_embeddings(node_list, self.context_embedding_model)
            distances = distances_from_embeddings(query_embedding, embeddings)
            indices = indices_of_nearest_neighbors_from_distances(distances)
            if self.selection_mode == "threshold":
                # Keep only candidates whose distance exceeds the threshold.
                best_indices = [
                    index for index in indices if distances[index] > self.threshold
                ]
            elif self.selection_mode == "top_k":
                # Keep the top_k nearest candidates.
                best_indices = indices[: self.top_k]
            nodes_to_add = [node_list[idx] for idx in best_indices]
            selected_nodes.extend(nodes_to_add)
            # Descend: the next layer's candidates are the children of this
            # layer's selected nodes (skipped on the last traversed layer).
            if layer != num_layers - 1:
                child_nodes = []
                for index in best_indices:
                    child_nodes.extend(node_list[index].children)
                # dict.fromkeys deduplicates while preserving insertion order.
                child_nodes = list(dict.fromkeys(child_nodes))
                node_list = [self.tree.all_nodes[i] for i in child_nodes]
        context = get_text(selected_nodes)
        return selected_nodes, context

    def retrieve(
        self,
        query: str,
        start_layer: int = None,
        num_layers: int = None,
        top_k: int = 10,
        max_tokens: int = 3500,
        collapse_tree: bool = True,
        return_layer_information: bool = False,
    ) -> str:
        """
        Queries the tree and returns the most relevant information.
        Args:
            query (str): The query text.
            start_layer (int): The layer to start from. Defaults to self.start_layer.
            num_layers (int): The number of layers to traverse. Defaults to self.num_layers.
            max_tokens (int): The maximum number of tokens. Defaults to 3500.
            collapse_tree (bool): Whether to retrieve information from all nodes. Defaults to True.
        Returns:
            str: The result of the query, or (context, layer_information) when
            return_layer_information is True.
        """
        # Validate inputs.
        if not isinstance(query, str):
            raise ValueError("query must be a string")
        if not isinstance(max_tokens, int) or max_tokens < 1:
            raise ValueError("max_tokens must be an integer and at least 1")
        if not isinstance(collapse_tree, bool):
            raise ValueError("collapse_tree must be a boolean")
        # Set defaults
        start_layer = self.start_layer if start_layer is None else start_layer
        num_layers = self.num_layers if num_layers is None else num_layers
        if not isinstance(start_layer, int) or not (
            0 <= start_layer <= self.tree.num_layers
        ):
            raise ValueError(
                "start_layer must be an integer between 0 and tree.num_layers"
            )
        if not isinstance(num_layers, int) or num_layers < 1:
            raise ValueError("num_layers must be an integer and at least 1")
        if num_layers > (start_layer + 1):
            raise ValueError("num_layers must be less than or equal to start_layer + 1")
        # Two retrieval strategies: collapsed tree, or layered traversal.
        if collapse_tree:
            logging.info(f"Using collapsed_tree")
            selected_nodes, context = self.retrieve_information_collapse_tree(
                query, top_k, max_tokens
            )
        else:
            # Start from the configured layer and walk downwards.
            layer_nodes = self.tree.layer_to_nodes[start_layer]
            selected_nodes, context = self.retrieve_information(
                layer_nodes, query, num_layers
            )
        # Optionally report which layer each selected node came from.
        if return_layer_information:
            layer_information = []
            for node in selected_nodes:
                layer_information.append(
                    {
                        "node_index": node.index,
                        "layer_number": self.tree_node_index_to_layer[node.index],
                    }
                )
            return context, layer_information
        return context
树的结构,跟二叉树或者多叉树差不多
from typing import Dict, List, Set
# Tree structure definitions.
# A single node.
class Node:
    """One node of the RAPTOR hierarchy.

    Holds the node's text, its integer index in the tree's node table, the
    indices of its child nodes, and a mapping of embedding-model name to
    the embedding vector of the text.
    """

    def __init__(self, text: str, index: int, children: Set[int], embeddings) -> None:
        self.index = index            # position of this node in the node table
        self.text = text              # raw text carried by this node
        self.embeddings = embeddings  # model name -> embedding of `text`
        self.children = children      # indices of this node's children
class Tree:
    """The full RAPTOR tree: node tables plus per-layer bookkeeping."""

    def __init__(
        self, all_nodes, root_nodes, leaf_nodes, num_layers, layer_to_nodes
    ) -> None:
        # Depth of the hierarchy and layer-number -> [Node] mapping.
        self.num_layers = num_layers
        self.layer_to_nodes = layer_to_nodes
        # Index -> Node mappings for every node, the roots, and the leaves.
        self.all_nodes = all_nodes
        self.root_nodes = root_nodes
        self.leaf_nodes = leaf_nodes
QA_model
import logging
import os
from openai import OpenAI
import getpass # 用来输入密码的
from abc import ABC, abstractmethod
import torch
from tenacity import retry, stop_after_attempt, wait_random_exponential
from transformers import T5ForConditionalGeneration, T5Tokenizer
class BaseQAModel(ABC):
    """Abstract interface that every question-answering model implements."""

    @abstractmethod
    def answer_question(self, context, question):
        """Return an answer to `question` grounded in `context`."""
# GPT-3 QA model using the legacy completions endpoint.
class GPT3QAModel(BaseQAModel):
    def __init__(self, model="text-davinci-003"):
        """
        Initializes the GPT-3 completions QA model.

        Args:
            model (str, optional): Completion model name. Defaults to
                "text-davinci-003".
        """
        self.model = model
        # Requires OPENAI_API_KEY in the environment; raises KeyError if absent.
        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    # Randomized exponential backoff (1-20s), up to 6 attempts.
    # NOTE(review): the except below swallows every exception, so @retry never
    # actually fires here; see GPT3TurboQAModel for the split-attempt pattern.
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def answer_question(self, context, question, max_tokens=150, stop_sequence=None):
        """
        Answers `question` using `context` via the completions endpoint.

        Args:
            context (str): Supporting text for the answer.
            question (str): The question to answer.
            max_tokens (int, optional): Max tokens of the answer. Defaults to 150.
            stop_sequence (str, optional): Sequence at which generation stops.
                Defaults to None.

        Returns:
            str: The generated answer, or "" if the request failed.
        """
        try:
            response = self.client.completions.create(
                # Typo fix in prompt: "folloing" -> "following".
                prompt=f"using the following information {context}. Answer the following question in less than 5-7 words, if possible: {question}",
                temperature=0,
                max_tokens=max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                stop=stop_sequence,
                model=self.model,
            )
            return response.choices[0].text.strip()
        except Exception as e:
            # Log instead of print so failures land in the configured log stream.
            logging.error(e)
            return ""
class GPT3TurboQAModel(BaseQAModel):
    def __init__(self, model="gpt-3.5-turbo"):
        """
        Initializes the chat-completion QA model.

        Args:
            model (str, optional): Chat model name. Defaults to "gpt-3.5-turbo".
        """
        self.model = model
        # Requires OPENAI_API_KEY in the environment; raises KeyError if absent.
        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    # Single attempt, retried with randomized exponential backoff (1-20s),
    # giving up after 6 tries.
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def _attempt_answer_question(
        self, context, question, max_tokens=150, stop_sequence=None
    ):
        """
        Makes one chat-completion call answering `question` from `context`.

        Args:
            context (str): Supporting text for the answer.
            question (str): The question to answer.
            max_tokens (int, optional): Accepted for interface symmetry; not
                forwarded to the API in this implementation.
            stop_sequence (str, optional): Accepted for interface symmetry; not
                forwarded to the API in this implementation.

        Returns:
            str: The model's answer, stripped of surrounding whitespace.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are Question Answering Portal"},
                {
                    "role": "user",
                    "content": f"Given Context: {context} Give the best full answer amongst the option to question {question}",
                },
            ],
            temperature=0,
        )
        return response.choices[0].message.content.strip()

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def answer_question(self, context, question, max_tokens=150, stop_sequence=None):
        """
        Answers `question` from `context`, retrying transient API failures.

        Returns:
            str: The answer, or "" if every attempt failed.
        """
        try:
            return self._attempt_answer_question(
                context, question, max_tokens=max_tokens, stop_sequence=stop_sequence
            )
        except Exception as e:
            # Bug fix: previously returned the exception object itself, which
            # breaks callers that expect a string answer. Log and return "".
            logging.error(e)
            return ""
class GPT4QAModel(BaseQAModel):
    def __init__(self, model="gpt-4"):
        """
        Initializes the GPT-4 chat-completion QA model.

        Args:
            model (str, optional): Chat model name. Defaults to "gpt-4".
        """
        self.model = model
        # Requires OPENAI_API_KEY in the environment; raises KeyError if absent.
        self.client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    # Single attempt, retried with randomized exponential backoff (1-20s),
    # giving up after 6 tries.
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def _attempt_answer_question(
        self, context, question, max_tokens=150, stop_sequence=None
    ):
        """
        Makes one chat-completion call answering `question` from `context`.

        Args:
            context (str): Supporting text for the answer.
            question (str): The question to answer.
            max_tokens (int, optional): Accepted for interface symmetry; not
                forwarded to the API in this implementation.
            stop_sequence (str, optional): Accepted for interface symmetry; not
                forwarded to the API in this implementation.

        Returns:
            str: The model's answer, stripped of surrounding whitespace.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are Question Answering Portal"},
                {
                    "role": "user",
                    "content": f"Given Context: {context} Give the best full answer amongst the option to question {question}",
                },
            ],
            temperature=0,
        )
        return response.choices[0].message.content.strip()

    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def answer_question(self, context, question, max_tokens=150, stop_sequence=None):
        """
        Answers `question` from `context`, retrying transient API failures.

        Returns:
            str: The answer, or "" if every attempt failed.
        """
        try:
            return self._attempt_answer_question(
                context, question, max_tokens=max_tokens, stop_sequence=stop_sequence
            )
        except Exception as e:
            # Bug fix: previously returned the exception object itself, which
            # breaks callers that expect a string answer. Log and return "".
            logging.error(e)
            return ""
# Local seq2seq QA model (UnifiedQA on T5) — runs without any API calls.
class UnifiedQAModel(BaseQAModel):
    def __init__(self, model_name="allenai/unifiedqa-v2-t5-3b-1363200"):
        """Load the UnifiedQA checkpoint and tokenizer, on GPU when available."""
        # Prefer CUDA if present, otherwise fall back to CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Load model weights onto the chosen device.
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(
            self.device
        )
        # Matching tokenizer for the checkpoint.
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

    def run_model(self, input_string, **generator_args):
        """Generate from `input_string` and return the decoded candidate texts."""
        encoded = self.tokenizer.encode(input_string, return_tensors="pt")
        generated = self.model.generate(encoded.to(self.device), **generator_args)
        return self.tokenizer.batch_decode(generated, skip_special_tokens=True)

    def answer_question(self, context, question):
        """Answer `question` from `context` and return the top decoded string."""
        # The " \\n " (literal backslash-n) separator is the input convention
        # this checkpoint expects between question and context.
        prompt = question + " \\n " + context
        return self.run_model(prompt)[0]
EmbeddingModels.py:向量嵌入(embedding)模型定义
import logging
from abc import ABC, abstractmethod
from openai import OpenAI
from sentence_transformers import SentenceTransformer
from tenacity import retry, stop_after_attempt, wait_random_exponential
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
# Abstract interface every embedding backend implements.
class BaseEmbeddingModel(ABC):
    @abstractmethod
    def create_embedding(self, text):
        """Return an embedding vector for `text`."""
# Reference implementation: embeddings via the OpenAI API.
class OpenAIEmbeddingModel(BaseEmbeddingModel):
    def __init__(self, model="text-embedding-ada-002"):
        # OpenAI() reads its API key from the environment.
        self.client = OpenAI()
        self.model = model

    # Randomized exponential backoff (1-20s), up to 6 attempts.
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def create_embedding(self, text):
        """Embed `text` with the configured OpenAI embedding model."""
        # Flatten newlines to spaces before sending the text to the API.
        cleaned = text.replace("\n", " ")
        response = self.client.embeddings.create(input=[cleaned], model=self.model)
        return response.data[0].embedding
# Local embeddings via Sentence-Transformers (no API calls).
class SBertEmbeddingModel(BaseEmbeddingModel):
    def __init__(self, model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"):
        """Load the given Sentence-Transformers checkpoint."""
        self.model = SentenceTransformer(model_name)

    def create_embedding(self, text):
        """Encode `text` into a dense vector."""
        return self.model.encode(text)
SummarizationModels.py:用于生成摘要(summarization)的模型定义
import logging
import os
from abc import ABC, abstractmethod
from openai import OpenAI
from tenacity import retry, stop_after_attempt, wait_random_exponential
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.INFO)
# Abstract interface every summarization backend implements.
class BaseSummarizationModel(ABC):
    @abstractmethod
    def summarize(self, context, max_tokens=150):
        """Return a summary of `context`, bounded by roughly `max_tokens` tokens."""
# Summarization via the gpt-3.5-turbo chat endpoint.
class GPT3TurboSummarizationModel(BaseSummarizationModel):
    def __init__(self, model="gpt-3.5-turbo"):
        # Only the model name is stored; the client is created per call.
        self.model = model

    # Randomized exponential backoff (1-20s), up to 6 attempts.
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def summarize(self, context, max_tokens=500, stop_sequence=None):
        """
        Summarizes `context`, asking the model to keep as many key details
        as possible.

        Args:
            context (str): Text to summarize.
            max_tokens (int, optional): Max tokens of the summary. Defaults to 500.
            stop_sequence (str, optional): Unused; kept for interface stability.

        Returns:
            str: The summary, or "" if the request ultimately failed.
        """
        try:
            client = OpenAI()
            response = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {
                        "role": "user",
                        "content": f"Write a summary of the following, including as many key details as possible: {context}:",
                    },
                ],
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content
        except Exception as e:
            # Bug fix: previously returned the exception object where callers
            # expect a string summary. Log and return "" instead.
            logging.error(e)
            return ""
# Summarization configured with the legacy "text-davinci-003" model name.
class GPT3SummarizationModel(BaseSummarizationModel):
    def __init__(self, model="text-davinci-003"):
        """
        Args:
            model (str, optional): Model name. Defaults to "text-davinci-003".
        """
        self.model = model

    # Randomized exponential backoff (1-20s), up to 6 attempts.
    @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
    def summarize(self, context, max_tokens=500, stop_sequence=None):
        """
        Summarizes `context`, asking the model to keep as many key details
        as possible.

        Args:
            context (str): Text to summarize.
            max_tokens (int, optional): Max tokens of the summary. Defaults to 500.
            stop_sequence (str, optional): Unused; kept for interface stability.

        Returns:
            str: The summary, or "" if the request ultimately failed.
        """
        # NOTE(review): "text-davinci-003" is a completions-only model, but this
        # method calls the chat endpoint, which will reject it. Pass a chat model
        # (e.g. "gpt-3.5-turbo") or switch to client.completions.create — confirm
        # the intended endpoint with the maintainers.
        try:
            client = OpenAI()
            response = client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {
                        "role": "user",
                        "content": f"Write a summary of the following, including as many key details as possible: {context}:",
                    },
                ],
                max_tokens=max_tokens,
            )
            return response.choices[0].message.content
        except Exception as e:
            # Bug fix: previously returned the exception object where callers
            # expect a string summary. Log and return "" instead.
            logging.error(e)
            return ""