Large Language Models: Reading the DocumentSearch Code

Continuing my LLM study on the application side; I will dig into the models themselves and the underlying theory later.

RAG
Basic modules: a retrieval module (embedding model + vector database) and a generation module (prompt = context + query, plus a generation model).
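In pseudocode, one query through the two modules looks roughly like this (every name below is illustrative, not the repository's API):

# Minimal RAG sketch; embed, vector_db and llm are placeholders.
def rag_answer(query, embed, vector_db, llm, top_k=5):
    query_vec = embed(query)                       # recall: embed the query
    context = vector_db.search(query_vec, top_k)   # recall: top-k similar chunks
    prompt = f"已知内容:\n{context}\n问题:{query}"  # prompt = context + query
    return llm(prompt)                             # generation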

DocumentSearch

https://github.com/yuanzhoulvpi2017/DocumentSearch

The repository is a concise demonstration of how to write your own langchain-style pipeline. The overall idea:

  • sentence transformers turn the text into vectors
  • search_top_info selects the top-n most similar chunks
  • the hits are assembled into a prompt and passed to the LLM for the final answer

The core is a single KnowLedge class; at query time you call knowledge.search_result() (see the usage sketch after the class below).

# Imports needed to make the snippet self-contained; helper functions such as
# cal_detail_in_dir, transfile, clean_text_data, chunk_text4TransOutput,
# numpy_cos_sim and cleanquestion are defined elsewhere in the repository.
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer


class KnowLedge:
    def __init__(self,
                 global_dir: str = None,
                 gen_model_name_or_path: str = "THUDM/chatglm-6b",
                 sen_embedding_model_name_or_path: str = "hfl/chinese-roberta-wwm-ext",
                 batch_top_k=5
                 ) -> None:

        self.batch_top_k = batch_top_k

        # Walk the directory, keep only .pdf/.docx files, extract and clean
        # their text, then split it into chunks.
        all_file_list = cal_detail_in_dir(global_dir)
        all_file_list = [Path(i) for i in all_file_list]
        all_file_list = [i for i in all_file_list if i.suffix in ['.pdf', '.docx']]
        all_trans_data = [transfile(i) for i in tqdm(all_file_list)]
        all_trans_data = [clean_text_data(i) for i in all_trans_data]
        all_trans_data = [i for i in all_trans_data if i.text_data.shape[0] > 0]

        all_trans_data = [chunk_text4TransOutput(i) for i in all_trans_data]

        # Embed every chunk once at startup; vectors are kept in memory
        # (one array per source file) rather than in a vector database.
        self.sv = SentenceVector(model_name_or_path=sen_embedding_model_name_or_path)

        all_vector = [self.sv.encode_fun_plus(i.text_data['chunk_text'].tolist()) for i in all_trans_data]

        self.all_trans_data = all_trans_data
        self.all_vector = all_vector

        # Generation model: ChatGLM-6B in fp16, placed on GPU 1.
        self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name_or_path, trust_remote_code=True)
        self.gen_model = AutoModel.from_pretrained(gen_model_name_or_path, trust_remote_code=True).half().cuda(1)

    def search_top_info(self, index: int, question_vector: np.ndarray) -> pd.DataFrame:
        # Cosine similarity between the question and every chunk of file `index`.
        similar_score = numpy_cos_sim(self.all_vector[index], question_vector).flatten()

        if similar_score.shape[0] < self.batch_top_k:
            # Fewer chunks than batch_top_k: keep them all.
            res = self.all_trans_data[index].text_data.reset_index(drop=True).pipe(
                lambda x: x.assign(**{
                    'score': similar_score
                })
            ).pipe(
                lambda x: x.assign(**{
                    'file_name': self.all_trans_data[index].file_name,
                    # Note: the upstream code stores file_type under the
                    # 'file_path' column.
                    'file_path': self.all_trans_data[index].file_type
                })
            )

        else:
            # np.argpartition selects the batch_top_k highest scores in O(n)
            # without fully sorting; the returned indices are unordered.
            top_k_location = np.argpartition(similar_score, kth=-self.batch_top_k)[-self.batch_top_k:]

            res = self.all_trans_data[index].text_data.reset_index(drop=True).iloc[top_k_location].pipe(
                lambda x: x.assign(**{
                    'score': similar_score[top_k_location]
                })
            ).pipe(
                lambda x: x.assign(**{
                    'file_name': self.all_trans_data[index].file_name,
                    'file_path': self.all_trans_data[index].file_type
                })
            )

        return res

    def search_result(self, question_str: str) -> Tuple[str, pd.DataFrame]:

        question_vector = self.sv.encode_fun([question_str])

        # Score every file, concatenate the per-file top-k tables, then
        # rank all chunks globally by similarity.
        search_table_info = pd.concat(
            [self.search_top_info(index, question_vector) for index in range(len(self.all_vector))]).pipe(
            lambda x: x.sort_values(by=['score'], ascending=False)
        )
        # Deduplicate chunks and keep the 30 best as context.
        search_table = search_table_info.drop_duplicates(['chunk_text']).head(30)

        search_text_list = search_table['chunk_text'].tolist()

        # Chinese prompt template: answer concisely from the given context
        # only, and say so explicitly if the answer is not in the context.
        prompt_template = """基于以下已知信息,简洁和专业的来回答用户的问题。
        如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
        问题:
        {question}
        已知内容:
        {context}
        """

        text2chatglm = prompt_template.format_map({
            'question': question_str,
            'context': '\n'.join(search_text_list)
        })

        response, history = self.gen_model.chat(self.gen_tokenizer, text2chatglm, history=[])
        torch.cuda.empty_cache()

        return response, search_table
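With the class in place, usage is a two-liner. A minimal sketch, where the document directory is an assumption and the question is taken from a comment in the original code:

# Hypothetical usage; "./docs" is an example path, not from the repository.
knowledge = KnowLedge(global_dir="./docs")
response, search_table = knowledge.search_result("大学生创业有什么补贴")
print(response)             # the generated answer
print(search_table.head())  # top supporting chunks with similarity scores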

The SentenceVector class that converts text sentences into vectors:

# This snippet uses `t` as an alias for torch (import torch as t);
# cleanquestion is another helper from the repository.
class SentenceVector:
    def __init__(self,
                 model_name_or_path: str = None,
                 device: str = "cuda:0") -> None:
        self.model_name_or_path = model_name_or_path
        self.device = device

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)

        self.model = AutoModel.from_pretrained(self.model_name_or_path)
        self.model.to(self.device)

    def encode_fun(self, texts: List[str]) -> np.ndarray:
        texts = [cleanquestion(i) for i in texts]

        # Tokenize with padding/truncation; anything beyond 64 tokens is cut
        # off, so chunks should stay short.
        inputs = self.tokenizer.batch_encode_plus(
            texts, padding=True, truncation=True, return_tensors="pt", max_length=64)
        inputs.to(device=self.device)
        with t.no_grad():
            embeddings = self.model(**inputs)

        # CLS pooling: the hidden state of the first ([CLS]) token serves as
        # the sentence embedding.
        embeddings = embeddings.last_hidden_state[:, 0]
        embeddings = embeddings.to('cpu').numpy()
        return embeddings

    def encode_fun_plus(self, texts: List[str], batch_size: int = 100) -> np.ndarray:
        # Encode in batches of 100 to bound GPU memory, then concatenate.
        embeddings = np.concatenate([self.encode_fun(
            texts[i:(i + batch_size)]) for i in tqdm(range(0, len(texts), batch_size))])
        return embeddings
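search_top_info relies on numpy_cos_sim, which the post never shows. A minimal sketch consistent with how it is called above (pairwise cosine similarity between an (n, d) chunk matrix and a (1, d) question vector), not necessarily the repository's exact code:

import numpy as np

def numpy_cos_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # L2-normalize the rows; the matrix product is then the cosine matrix.
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T  # shape (n_a, n_b)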

The LLM can be an open-source model deployed locally (offline):

self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name_or_path, trust_remote_code=True)
# fp16 weights on GPU index 1; on a single-GPU machine use .cuda(0) instead.
self.gen_model = AutoModel.from_pretrained(gen_model_name_or_path, trust_remote_code=True).half().cuda(1)

Sentence embedding model: "hfl/chinese-roberta-wwm-ext"
LLM: "THUDM/chatglm-6b"

webui

A web UI is also a must-have for the currently booming AIGC applications.

The author implements it with streamlit, as sketched below.
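The post doesn't reproduce the UI code; a minimal sketch of what a streamlit front end over the KnowLedge class could look like (the widgets, directory, and module name are assumptions, not the repository's actual implementation):

import streamlit as st

# Assumes the KnowLedge class defined above is importable, e.g.
# from knowledge import KnowLedge  # hypothetical module name

@st.cache_resource  # build the index and load the models only once
def load_knowledge():
    return KnowLedge(global_dir="./docs")  # "./docs" is an assumption

st.title("DocumentSearch")
question = st.text_input("输入问题")
if question:
    response, search_table = load_knowledge().search_result(question)
    st.write(response)          # the model's answer
    st.dataframe(search_table)  # supporting chunks and scores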
