在魔搭平台notebook中配置
pip install pypdf faiss-gpu langchain langchain_community langchain_huggingface ebooklib streamlit==1.24.0
pip uninstall keras
pip install tf-keras
pip install unstructured
pip install pypandoc
pip install pandoc
更新:还需要通过系统包管理器安装 pandoc 可执行文件
sudo apt-get install pandoc
启动
streamlit run AI阅读陪伴.py --server.address 127.0.0.1 --server.port 6006
# 导入所需的库
import torch
import streamlit as st
from ebooklib import epub
from langchain_community.document_loaders import UnstructuredEPubLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Any, List, Optional
import ebooklib
from ebooklib import epub
# Download the models (or reuse the local cache) and record where they live.
# The paths are taken from snapshot_download's return value instead of being
# hard-coded, so they stay correct even if ModelScope's cache directory naming
# (e.g. "v1.5" -> "v1___5") differs between versions.
from modelscope import snapshot_download

# Embedding model used to vectorise document chunks.
# NOTE(review): bge-small-en is an English embedding model while the prompts
# are Chinese - confirm it is the intended choice for Chinese documents.
embedding_model_path = snapshot_download('AI-ModelScope/bge-small-en-v1.5', cache_dir='./')

# Yuan2 large language model.
# Alternative checkpoint: 'IEITYuan/Yuan2-2B-July-hf'
model_dir = snapshot_download('IEITYuan/Yuan2-2B-Mars-hf', cache_dir='./')
model_path = model_dir

# Model weight dtype: bfloat16 for A10; switch to torch.float16 on P100.
torch_dtype = torch.bfloat16
# LangChain-compatible wrapper around a local Yuan2 causal language model
class Yuan2_LLM(LLM):
    """LangChain ``LLM`` backed by a locally stored Yuan2 checkpoint.

    The tokenizer and model are loaded once in ``__init__`` and the model is
    moved to the GPU. ``_call`` performs greedy generation and returns only
    the text produced after the ``<sep>`` marker and before ``<eod>``.
    """

    # Filled in by __init__. Declared at class level, presumably so the LLM
    # base class accepts them as attributes - confirm against the base class.
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, mode_path: str):
        """Load the tokenizer and model from ``mode_path``.

        Args:
            mode_path: Local directory containing the Yuan2 checkpoint.
        """
        super().__init__()

        # Load the pretrained tokenizer and register Yuan2's special tokens
        print("Creat tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(
            mode_path, add_eos_token=False, add_bos_token=False, eos_token='<eod>'
        )
        self.tokenizer.add_tokens(
            [
                '<sep>', '<pad>', '<mask>', '<predict>',
                '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>',
                '<commit_before>', '<commit_msg>', '<commit_after>',
                '<jupyter_start>', '<jupyter_text>', '<jupyter_code>',
                '<jupyter_output>', '<empty_output>',
            ],
            special_tokens=True,
        )

        # Honour the module-level torch_dtype switch (bfloat16 / float16)
        # instead of hard-coding bfloat16, so the P100 setting takes effect.
        print("Creat model...")
        self.model = AutoModelForCausalLM.from_pretrained(
            mode_path, torch_dtype=torch_dtype, trust_remote_code=True
        ).cuda()

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text to complete; ``<sep>`` is appended before generation.
            stop: Optional stop strings; the response is cut at the first
                occurrence of any of them.
            run_manager: Accepted for interface compatibility; unused.
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The generated text between the last ``<sep>`` and the following
            ``<eod>``.
        """
        prompt = prompt.strip() + "<sep>"
        inputs = self.tokenizer(prompt, return_tensors="pt")["input_ids"].cuda()
        outputs = self.model.generate(inputs, do_sample=False, max_length=4096)
        output = self.tokenizer.decode(outputs[0])

        # The decoded text contains the prompt; keep only the generated part
        response = output.split("<sep>")[-1].split("<eod>")[0]

        # Apply caller-supplied stop strings (skip empty ones, which would
        # otherwise truncate the whole response)
        for stop_word in stop or ():
            if not stop_word:
                continue
            cut = response.find(stop_word)
            if cut != -1:
                response = response[:cut]
        return response

    @property
    def _llm_type(self) -> str:
        """Identifier of this LLM implementation."""
        return "Yuan2_LLM"
# Build the LLM and the embedding model; wrapped in st.cache_resource
@st.cache_resource
def get_models():
    """Create the Yuan2 LLM and the HuggingFace embedding model.

    Returns:
        A ``(llm, embeddings)`` tuple: a ``Yuan2_LLM`` loaded from
        ``model_path`` and a ``HuggingFaceEmbeddings`` loaded from
        ``embedding_model_path`` on the CUDA device.
    """
    yuan_llm = Yuan2_LLM(model_path)
    embedder = HuggingFaceEmbeddings(
        model_name=embedding_model_path,
        model_kwargs={'device': 'cuda'},
        # normalised embeddings make the similarity score a cosine similarity
        encode_kwargs={'normalize_embeddings': True},
    )
    return yuan_llm, embedder
# Prompt for the reading assistant, with {context} and {question} placeholders
chatbot_template = (
    "假设你是一个AI阅读助手,请基于背景,简要回答问题。\n"
    "背景:\n"
    "{context}\n"
    "问题:\n"
    "{question}"
)
# Retrieval-augmented question answering over a loaded document
class ChatBot:
    """Answer questions about a document using retrieval plus an LLM.

    The document text is split into chunks, embedded into a FAISS index, and
    the chunk most similar to the question is passed to a "stuff" QA chain.
    """

    def __init__(self, llm, embeddings):
        """Set up the prompt, the QA chain and the text splitter.

        Args:
            llm: Language model handed to ``load_qa_chain``.
            embeddings: Embedding model used to build the FAISS index.
        """
        # The declared variables must match the template's placeholders
        self.prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=chatbot_template,
        )
        self.chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=self.prompt)
        self.embeddings = embeddings

        # Splitter that cuts the document into chunks of up to 450 characters
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=450,
            chunk_overlap=10,
            length_function=len,
        )

    def run(self, docs, query):
        """Retrieve the most relevant chunk and generate an answer.

        Args:
            docs: Iterable of objects exposing a ``page_content`` string.
            query: The user's question.

        Returns:
            A ``(chunks, response)`` tuple: the retrieved chunks and the
            generated answer. If no text could be extracted from ``docs``,
            ``chunks`` is empty and ``response`` is an explanatory message.
        """
        # Join with a newline so words at page boundaries are not glued together
        text = '\n'.join(doc.page_content for doc in docs)

        # An index cannot be built from nothing (e.g. a scanned PDF with no
        # text layer); report this instead of failing inside FAISS.
        if not text.strip():
            return [], "未能从文档中提取到文本内容,无法回答问题。"

        # Split into chunks
        all_chunks = self.text_splitter.split_text(text=text)
        if not all_chunks:
            return [], "未能从文档中提取到文本内容,无法回答问题。"

        # Embed the chunks and store them in a vector index
        vector_store = FAISS.from_texts(all_chunks, embedding=self.embeddings)

        # Retrieve the chunk most similar to the question
        chunks = vector_store.similarity_search(query=query, k=1)

        # Generate the answer from the retrieved chunk
        response = self.chain.run(input_documents=chunks, question=query)
        return chunks, response
def _load_uploaded_documents(uploaded_file):
    """Parse an uploaded PDF or EPUB into a list of LangChain documents.

    The upload is written to a uniquely named temporary file, parsed with the
    loader matching its file extension, and the temporary file is removed.

    Args:
        uploaded_file: Object returned by ``st.file_uploader``; its ``name``
            and ``getvalue()`` are used.

    Returns:
        The documents produced by the loader's ``load()``.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.epub``.
    """
    import os
    import tempfile

    # Use the file extension: the MIME subtype for EPUB is "epub+zip" (or may
    # be generic), so it cannot be compared against "epub".
    suffix = os.path.splitext(uploaded_file.name)[1].lower()
    loaders = {'.pdf': PyPDFLoader, '.epub': UnstructuredEPubLoader}
    loader_cls = loaders.get(suffix)
    if loader_cls is None:
        raise ValueError(f"Unsupported file type: {uploaded_file.name!r}")

    # A unique temp file avoids collisions between concurrent sessions
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(uploaded_file.getvalue())
        temp_file_path = temp_file.name
    try:
        return loader_cls(temp_file_path).load()
    finally:
        os.remove(temp_file_path)


def main():
    """Run the Streamlit reading-companion page.

    Lets the user upload a PDF or EPUB, loads it into documents, and answers
    questions typed into the text box, showing both the retrieved chunk and
    the generated reply.
    """
    # Page title
    st.title('💬 Yuan2.0 你的阅读搭子')

    # Get the LLM and the embedding model
    llm, embeddings = get_models()

    # Initialise the chatbot
    chatbot = ChatBot(llm, embeddings)

    # File upload
    uploaded_file = st.file_uploader("Upload your PDF or EPUB", type=['pdf', 'epub'])
    if not uploaded_file:
        return

    # Announce loading before the work starts
    st.chat_message("assistant").write("文档加载中,请稍候...")
    try:
        docs = _load_uploaded_documents(uploaded_file)
    except ValueError as err:
        st.error(str(err))
        return

    st.chat_message("assistant").write("很高兴和您一起阅读,请输入问题...")

    # Read the user's question
    if query := st.text_input("Ask questions about your file"):
        st.chat_message("assistant").write("正在检索相关信息,请稍候...")

        # Retrieval + answer generation
        chunks, response = chatbot.run(docs, query)

        # Show the retrieved chunk and the model's reply
        st.chat_message("assistant").write(chunks)
        st.chat_message("assistant").write("正在生成回复,请稍候...")
        st.chat_message("assistant").write(response)
# Start the app only when this file is executed as a script
if __name__ == "__main__":
    main()