AI阅读陪伴

最新推荐文章于 2024-08-27 23:50:44 发布

Ghhhhhggg

最新推荐文章于 2024-08-27 23:50:44 发布

阅读量67

点赞数 2

文章标签：深度学习 python 人工智能

本文链接：https://blog.csdn.net/Ghhhhhggg/article/details/141537359

版权

在魔搭（ModelScope）平台的 Notebook 中配置环境，先安装以下依赖：

pip install pypdf faiss-gpu langchain langchain_community langchain_huggingface streamlit==1.24.0

pip uninstall keras
pip install tf-keras
pip install unstructured
pip install pypandoc
pip install pandoc

更新：解析 EPUB 文件还需要安装系统级的 pandoc：

sudo apt-get install pandoc

启动应用：

streamlit run AI阅读陪伴.py --server.address 127.0.0.1 --server.port 6006

# 导入所需的库
import torch
import streamlit as st
from ebooklib import epub
from langchain_community.document_loaders import UnstructuredEPubLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import LLMChain
from langchain.chains.question_answering import load_qa_chain
from langchain.llms.base import LLM
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.text_splitter import RecursiveCharacterTextSplitter

from typing import Any, List, Optional

import ebooklib
from ebooklib import epub

# Download the embedding model into the current directory
from modelscope import snapshot_download
model_dir = snapshot_download('AI-ModelScope/bge-small-en-v1.5', cache_dir='./')

# Download the Yuan2.0 large language model into the current directory.
# NOTE: this reassigns model_dir, so the embedding download path above is
# overwritten; the paths used below are hard-coded rather than derived from it.
from modelscope import snapshot_download
model_dir = snapshot_download('IEITYuan/Yuan2-2B-Mars-hf', cache_dir='./')
# model_dir = snapshot_download('IEITYuan/Yuan2-2B-July-hf', cache_dir='./')

# Local path of the large language model
model_path = './IEITYuan/Yuan2-2B-Mars-hf'
# path = './IEITYuan/Yuan2-2B-July-hf'

# Local path of the embedding model
# NOTE(review): presumably the on-disk directory name that the download of
# 'bge-small-en-v1.5' produces ('.' replaced by '___') - confirm on disk.
embedding_model_path = './AI-ModelScope/bge-small-en-v1___5'

# Data type used for the model weights
torch_dtype = torch.bfloat16 # A10
# torch_dtype = torch.float16 # P100


# LangChain LLM wrapper around a locally stored Yuan2.0 checkpoint
class Yuan2_LLM(LLM):
    """LangChain ``LLM`` implementation backed by a Yuan2.0 causal model.

    The tokenizer and model are loaded from a local directory when the
    instance is created, and the model is moved to the GPU with ``.cuda()``,
    so a CUDA device is required.
    """
    tokenizer: AutoTokenizer = None
    model: AutoModelForCausalLM = None

    def __init__(self, mode_path :str, torch_dtype: torch.dtype = torch.bfloat16):
        """Load the tokenizer and the model.

        Args:
            mode_path: Directory containing the pretrained checkpoint.
            torch_dtype: Data type for the model weights. Defaults to
                ``torch.bfloat16`` (the previous hard-coded value); pass
                ``torch.float16`` on GPUs without bfloat16 support.
        """
        super().__init__()

        # Load the pretrained tokenizer and register Yuan2.0's special tokens
        print("Creat tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(mode_path, add_eos_token=False, add_bos_token=False, eos_token='<eod>')
        self.tokenizer.add_tokens(['<sep>', '<pad>', '<mask>', '<predict>', '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>','<commit_before>','<commit_msg>','<commit_after>','<jupyter_start>','<jupyter_text>','<jupyter_code>','<jupyter_output>','<empty_output>'], special_tokens=True)

        # Load the pretrained model and move it to the GPU
        print("Creat model...")
        self.model = AutoModelForCausalLM.from_pretrained(mode_path, torch_dtype=torch_dtype, trust_remote_code=True).cuda()

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Prompt text; it is stripped and ``<sep>`` is appended
                before tokenization.
            stop: Optional stop sequences. The response is cut off at the
                first occurrence of any of them.
            run_manager: Unused.
            **kwargs: Unused.

        Returns:
            The decoded text after the last ``<sep>`` and before the first
            ``<eod>``, truncated at any stop sequence.
        """
        prompt = prompt.strip()
        prompt += "<sep>"
        inputs = self.tokenizer(prompt, return_tensors="pt")["input_ids"].cuda()
        # Greedy decoding; max_length is passed straight to generate()
        outputs = self.model.generate(inputs,do_sample=False,max_length=4096)
        output = self.tokenizer.decode(outputs[0])
        # The decoded output echoes the prompt, so keep only what follows
        # the final <sep> and drop everything from the end-of-document token
        response = output.split("<sep>")[-1].split("<eod>")[0]

        # Honor caller-supplied stop sequences instead of silently ignoring them
        for stop_word in stop or ():
            if stop_word:
                response = response.split(stop_word, 1)[0]

        return response

    @property
    def _llm_type(self) -> str:
        """Return the identifier LangChain uses for this LLM type."""
        return "Yuan2_LLM"



# Build the LLM and the embedding model; the result is cached by Streamlit
@st.cache_resource
def get_models():
    """Create the Yuan2.0 LLM and the sentence-embedding model.

    Returns:
        A ``(llm, embeddings)`` tuple: a ``Yuan2_LLM`` loaded from
        ``model_path`` and a ``HuggingFaceEmbeddings`` loaded from
        ``embedding_model_path`` on the CUDA device.
    """
    yuan_llm = Yuan2_LLM(model_path)

    bge_embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model_path,
        model_kwargs=dict(device='cuda'),
        # Normalized embeddings so that similarity is cosine similarity
        encode_kwargs=dict(normalize_embeddings=True),
    )
    return yuan_llm, bge_embeddings



# Prompt template for question answering; {context} receives the retrieved
# passage and {question} receives the user's question.
chatbot_template = "\n".join([
    "假设你是一个AI阅读助手，请基于背景，简要回答问题。",
    "",
    "背景：",
    "{context}",
    "",
    "问题：",
    "{question}",
])


# Retrieval-augmented question answering over a loaded document
class ChatBot:
    """Answer questions about a document using retrieval plus an LLM.

    The document text is split into chunks, embedded into a FAISS index,
    and the chunk most similar to the question is passed to the LLM as
    context.
    """

    def __init__(self, llm, embeddings):
        """Set up the prompt, the QA chain and the text splitter.

        Args:
            llm: LangChain LLM used to generate the answer.
            embeddings: Embedding model used to index and search the chunks.
        """
        # The declared variables must match the placeholders used in
        # chatbot_template ({context} and {question}).
        self.prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=chatbot_template
        )
        self.chain = load_qa_chain(llm=llm, chain_type="stuff", prompt=self.prompt)
        self.embeddings = embeddings

        # Splitter that cuts the text into chunks measured in characters
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=450,
            chunk_overlap=10,
            length_function=len
        )

    def run(self, docs, query):
        """Retrieve the most relevant chunk and generate an answer.

        Args:
            docs: Iterable of loaded documents exposing ``page_content``.
            query: The user's question.

        Returns:
            A ``(chunks, response)`` tuple: the retrieved chunk documents
            and the generated answer text.

        Raises:
            ValueError: If the documents contain no text to index.
        """
        # Concatenate the text of all pages
        text = ''.join(doc.page_content for doc in docs)

        # Split into chunks
        all_chunks = self.text_splitter.split_text(text=text)
        if not all_chunks:
            # Fail with a clear message rather than an opaque indexing error
            raise ValueError("No text could be extracted from the document.")

        # Embed the chunks and store them in a vector index
        vector_store = FAISS.from_texts(all_chunks, embedding=self.embeddings)

        # Retrieve the single most similar chunk
        chunks = vector_store.similarity_search(query=query, k=1)

        # Generate the answer from the retrieved chunk
        response = self.chain.run(input_documents=chunks, question=query)

        return chunks, response


def main():
    """Run the Streamlit app: upload a PDF or EPUB and ask questions about it."""
    # Page title
    st.title('💬 Yuan2.0 你的阅读搭子')

    # Get the LLM and the embedding model
    llm, embeddings = get_models()

    # Initialize the ChatBot
    chatbot = ChatBot(llm, embeddings)

    # File upload widget
    uploaded_file = st.file_uploader("Upload your PDF or EPUB", type=['pdf', 'epub'])

    if uploaded_file:
        # Determine the file type from the file name extension. The MIME
        # subtype is not usable here: browsers typically report EPUB as
        # "application/epub+zip", which would never compare equal to 'epub'.
        file_type = uploaded_file.name.rsplit('.', 1)[-1].lower()

        # Pick a loader that returns documents exposing page_content,
        # which is what ChatBot.run reads.
        if file_type == 'pdf':
            loader_cls = PyPDFLoader
        elif file_type == 'epub':
            loader_cls = UnstructuredEPubLoader
        else:
            st.error(f"Unsupported file type: {file_type}")
            return

        st.chat_message("assistant").write("文档加载中，请稍候...")

        # Write the upload to a temporary file because the loaders take a path.
        # getvalue() returns the full content regardless of stream position.
        temp_file_path = "temp." + file_type
        with open(temp_file_path, "wb") as temp_file:
            temp_file.write(uploaded_file.getvalue())

        docs = loader_cls(temp_file_path).load()

        # Invite the user to ask a question
        st.chat_message("assistant").write("很高兴和您一起阅读，请输入问题...")

        # Read the user's question
        if query := st.text_input("Ask questions about your  file"):

            st.chat_message("assistant").write("正在检索相关信息，请稍候...")

            # Retrieve and generate the answer
            chunks, response = chatbot.run(docs, query)

            # Show the retrieved passage and then the model's answer
            st.chat_message("assistant").write(chunks)

            st.chat_message("assistant").write("正在生成回复，请稍候...")
            st.chat_message("assistant").write(response)


if __name__ == '__main__':
    main()