- Fine-tuning
  - Large language models: ChatGLM-Tuning
  - Large language models: fine-tuning ChatGLM-6B
  - Large language models: Chinese ChatGLM / LLaMA fine-tuning
  - Large language models: alpaca-lora
- Local knowledge base
  - Large language models 2: document ai explained
  - Large language models: DocumentSearch explained
  - Large language models: Chinese Langchain
For learning large language models, I will keep working from the application side first, and only later dig into the models themselves and their underlying principles.
RAG
Basic modules: a retrieval (recall) module (embedding model + vector database) and a generation module (prompt = context + query, fed to a generation model).
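A minimal conceptual sketch of these two modules (the function names, the in-memory "vector store", and the llm callable are illustrative assumptions, not from any particular library):

import numpy as np
from typing import Callable, List

def retrieve(question_vec: np.ndarray, doc_vecs: np.ndarray, chunks: List[str], top_k: int = 5) -> List[str]:
    # Recall module: rank text chunks by cosine similarity to the question vector.
    sims = doc_vecs @ question_vec / (
        np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(question_vec) + 1e-8)
    return [chunks[i] for i in np.argsort(-sims)[:top_k]]

def generate(llm: Callable[[str], str], question: str, context_chunks: List[str]) -> str:
    # Generation module: stuff the retrieved context plus the query into one prompt.
    prompt = "已知内容:\n" + "\n".join(context_chunks) + "\n问题:\n" + question
    return llm(prompt)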
DocumentSearch
https://github.com/yuanzhoulvpi2017/DocumentSearch
This repo is a concise demonstration of how to write your own langchain-style code. The overall idea:
- sentence transformers turn the text into vectors
- search top info picks the top n most similar results
- the selected results are assembled into a prompt for the LLM, which produces the final answer
The core is a KnowLedge class; at usage time you call knowledge.search_result() (a usage sketch follows the class definition below).
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# cal_detail_in_dir / transfile / clean_text_data / chunk_text4TransOutput /
# numpy_cos_sim / SentenceVector are helper utilities defined elsewhere in the repo.


class KnowLedge:
    def __init__(self,
                 global_dir: str = None,
                 gen_model_name_or_path: str = "THUDM/chatglm-6b",
                 sen_embedding_model_name_or_path: str = "hfl/chinese-roberta-wwm-ext",
                 batch_top_k: int = 5
                 ) -> None:
        self.batch_top_k = batch_top_k

        # Collect all .pdf / .docx files under global_dir, extract and clean their text,
        # then split every document into chunks.
        all_file_list = cal_detail_in_dir(global_dir)
        all_file_list = [Path(i) for i in all_file_list]
        all_file_list = [i for i in all_file_list if i.suffix in ['.pdf', '.docx']]

        all_trans_data = [transfile(i) for i in tqdm(all_file_list)]
        all_trans_data = [clean_text_data(i) for i in all_trans_data]
        all_trans_data = [i for i in all_trans_data if i.text_data.shape[0] > 0]
        all_trans_data = [chunk_text4TransOutput(i) for i in all_trans_data]

        # Encode every chunk into a vector with the sentence-embedding model.
        self.sv = SentenceVector(model_name_or_path=sen_embedding_model_name_or_path)
        all_vector = [self.sv.encode_fun_plus(i.text_data['chunk_text'].tolist()) for i in all_trans_data]

        self.all_trans_data = all_trans_data
        self.all_vector = all_vector

        # Generation model: ChatGLM-6B loaded in fp16 on GPU 1.
        self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name_or_path, trust_remote_code=True)
        self.gen_model = AutoModel.from_pretrained(gen_model_name_or_path, trust_remote_code=True).half().cuda(1)

    def search_top_info(self, index: int, question_vector: np.ndarray) -> pd.DataFrame:
        # Cosine similarity between the question vector and every chunk of document `index`.
        similar_score = numpy_cos_sim(self.all_vector[index], question_vector).flatten()

        if similar_score.shape[0] < self.batch_top_k:
            # Fewer chunks than batch_top_k: keep them all.
            res = self.all_trans_data[index].text_data.reset_index(drop=True).pipe(
                lambda x: x.assign(**{
                    'score': similar_score
                })
            ).pipe(
                lambda x: x.assign(**{
                    'file_name': self.all_trans_data[index].file_name,
                    'file_path': self.all_trans_data[index].file_type
                })
            )
        else:
            # np.argpartition picks the indices of the batch_top_k highest scores.
            top_k_location = np.argpartition(similar_score, kth=-self.batch_top_k)[-self.batch_top_k:]
            res = self.all_trans_data[index].text_data.reset_index(drop=True).iloc[top_k_location].pipe(
                lambda x: x.assign(**{
                    'score': similar_score[top_k_location]
                })
            ).pipe(
                lambda x: x.assign(**{
                    'file_name': self.all_trans_data[index].file_name,
                    'file_path': self.all_trans_data[index].file_type
                })
            )
        return res

    def search_result(self, question_str: str) -> Tuple[str, pd.DataFrame]:
        # e.g. question_str = "大学生创业有什么补贴" / "做集成电路的企业,有什么补贴"
        question_vector = self.sv.encode_fun([question_str])

        # Gather the top chunks from every document and sort them by score.
        search_table_info = pd.concat(
            [self.search_top_info(index, question_vector) for index in range(len(self.all_vector))]).pipe(
            lambda x: x.sort_values(by=['score'], ascending=False)
        )
        search_table = search_table_info.drop_duplicates(['chunk_text']).head(30)
        search_text_list = search_table['chunk_text'].tolist()

        # Prompt template (in Chinese): answer concisely and professionally based only on
        # the known information below; if the answer is not contained in it, say so.
        prompt_template = """基于以下已知信息,简洁和专业的来回答用户的问题。
如果无法从中得到答案,请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息",不允许在答案中添加编造成分,答案请使用中文。
问题:
{question}
已知内容:
{context}
"""
        text2chatglm = prompt_template.format_map({
            'question': question_str,
            'context': '\n'.join(search_text_list)
        })
        response, history = self.gen_model.chat(self.gen_tokenizer, text2chatglm, history=[])
        torch.cuda.empty_cache()
        return response, search_table
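Usage is as simple as described above; a hypothetical call (the directory path is a placeholder, and the question string is the example from the repo's own comments):

knowledge = KnowLedge(
    global_dir="./docs",  # placeholder: a folder containing .pdf / .docx files
    gen_model_name_or_path="THUDM/chatglm-6b",
    sen_embedding_model_name_or_path="hfl/chinese-roberta-wwm-ext",
    batch_top_k=5,
)
response, search_table = knowledge.search_result("大学生创业有什么补贴")
print(response)             # answer generated by ChatGLM-6B
print(search_table.head())  # top supporting chunks with scores and source files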
The component that converts text sentences into vectors:
from typing import List

import numpy as np
import torch as t
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer

# cleanquestion is a small text-cleaning helper from the repo.


class SentenceVector:
    def __init__(self,
                 model_name_or_path: str = None,
                 device: str = "cuda:0") -> None:
        self.model_name_or_path = model_name_or_path
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path)
        self.model = AutoModel.from_pretrained(self.model_name_or_path)
        self.model.to(self.device)

    def encode_fun(self, texts: List[str]) -> np.ndarray:
        # Clean, tokenize and encode a batch of sentences; the [CLS] token's
        # last hidden state is used as the sentence embedding.
        texts = [cleanquestion(i) for i in texts]
        inputs = self.tokenizer.batch_encode_plus(
            texts, padding=True, truncation=True, return_tensors="pt", max_length=64)
        inputs.to(device=self.device)
        with t.no_grad():
            embeddings = self.model(**inputs)
            embeddings = embeddings.last_hidden_state[:, 0]
            embeddings = embeddings.to('cpu').numpy()
        return embeddings

    def encode_fun_plus(self, texts: List[str], batch_size: int = 100) -> np.ndarray:
        # Encode long lists of sentences in batches of `batch_size`.
        embeddings = np.concatenate([self.encode_fun(
            texts[i:(i + batch_size)]) for i in tqdm(range(0, len(texts), batch_size))])
        return embeddings
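The numpy_cos_sim helper used in search_top_info is not reproduced in this excerpt; a minimal sketch of what it presumably does (normalize each row, then take the dot product), so the snippets above are self-contained:

import numpy as np

def numpy_cos_sim(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    # result[i, j] = cosine similarity between row i of `a` and row j of `b`;
    # with `a` = chunk embeddings (n, d) and `b` = question embedding (1, d),
    # the result has shape (n, 1) and is flattened by the caller.
    a = a / (np.linalg.norm(a, axis=1, keepdims=True) + 1e-8)
    b = b / (np.linalg.norm(b, axis=1, keepdims=True) + 1e-8)
    return a @ b.T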
The LLM here can be an open-source model deployed locally (offline):
self.gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_name_or_path, trust_remote_code=True)
self.gen_model = AutoModel.from_pretrained(gen_model_name_or_path, trust_remote_code=True).half().cuda(1)
Sentence embedding model: "hfl/chinese-roberta-wwm-ext"
LLM: “THUDM/chatglm-6b”
webui
A web UI is also a must-have for the recently popular AIGC applications.
The author implements it with streamlit.
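The repo's UI code is not shown here; a minimal streamlit sketch of the same idea (widget layout, caching, and the document folder are my assumptions, not the author's code):

import streamlit as st

@st.cache_resource  # build the knowledge base only once per session
def load_knowledge():
    return KnowLedge(global_dir="./docs")  # placeholder document folder

st.title("DocumentSearch")
question = st.text_input("请输入问题")  # question input box
if st.button("搜索") and question:
    response, search_table = load_knowledge().search_result(question)
    st.write(response)            # the generated answer
    st.dataframe(search_table)    # supporting chunks with scores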