ollama和RAG的本地运行_ollama rag 本地-CSDN博客

本文链接：https://blog.csdn.net/weixin_52454379/article/details/142650494

继上周刚刚入手的RAG，这周继续沿用上周的思想模块，做了一个这样已读乱回的这个样子，还请有大佬帮忙指教。参考的是GitHub上的tiny-universe: 大模型白盒子构建指南，fork自datawhalechina

然后，流程基本和上周一致，下面给出全部代码和示例。

总体文件夹

下面给出每个模块代码

1、文件读取工具模块

import os
from typing import Dict, List, Optional, Tuple, Union

import PyPDF2
import markdown
import html2text
import json
from tqdm import tqdm
import tiktoken
from bs4 import BeautifulSoup
import re

enc = tiktoken.get_encoding("cl100k_base")


class ReadFiles:
    """
    class to read files
    """

    def __init__(self, path: str) -> None:
        self._path = path
        self.file_list = self.get_files()

    def get_files(self):
        # args：dir_path，目标文件夹路径
        file_list = []
        for filepath, dirnames, filenames in os.walk(self._path):
            # os.walk 函数将递归遍历指定文件夹
            for filename in filenames:
                # 通过后缀名判断文件类型是否满足要求
                if filename.endswith(".md"):
                    # 如果满足要求，将其绝对路径加入到结果列表
                    file_list.append(os.path.join(filepath, filename))
                elif filename.endswith(".txt"):
                    file_list.append(os.path.join(filepath, filename))
                elif filename.endswith(".pdf"):
                    file_list.append(os.path.join(filepath, filename))
        return file_list

    def get_content(self, max_token_len: int = 600, cover_content: int = 150):
        docs = []
        # 读取文件内容
        for file in self.file_list:
            content = self.read_file_content(file)
            chunk_content = self.get_chunk(
                content, max_token_len=max_token_len, cover_content=cover_content)
            docs.extend(chunk_content)
        return docs

    @classmethod
    def get_chunk(cls, text: str, max_token_len: int = 600, cover_content: int = 150):
        chunk_text = []

        curr_len = 0
        curr_chunk = ''

        token_len = max_token_len - cover_content
        lines = text.splitlines()  # 假设以换行符分割文本为行

        for line in lines:
            line = line.replace(' ', '')
            line_len = len(enc.encode(line))
            if line_len > max_token_len:
                # 如果单行长度就超过限制，则将其分割成多个块
                num_chunks = (line_len + token_len - 1) // token_len
                for i in range(num_chunks):
                    start = i * token_len
                    end = start + token_len
                    # 避免跨单词分割
                    while not line[start:end].rstrip().isspace():
                        start += 1
                        end += 1
                        if start >= line_len:
                            break
                    curr_chunk = curr_chunk[-cover_content:] + line[start:end]
                    chunk_text.append(curr_chunk)
                # 处理最后一个块
                start = (num_chunks - 1) * token_len
                curr_chunk = curr_chunk[-cover_content:] + line[start:end]
                chunk_text.append(curr_chunk)

            if curr_len + line_len <= token_len:
                curr_chunk += line
                curr_chunk += '\n'
                curr_len += line_len
                curr_len += 1
            else:
                chunk_text.append(curr_chunk)
                curr_chunk = curr_chunk[-cover_content:] + line
                curr_len = line_len + cover_content

        if curr_chunk:
            chunk_text.append(curr_chunk)

        return chunk_text

    @classmethod
    def read_file_content(cls, file_path: str):
        # 根据文件扩展名选择读取方法
        if file_path.endswith('.pdf'):
            return cls.read_pdf(file_path)
        elif file_path.endswith('.md'):
            return cls.read_markdown(file_path)
        elif file_path.endswith('.txt'):
            return cls.read_text(file_path)
        else:
            raise ValueError("Unsupported file type")

    @classmethod
    def read_pdf(cls, file_path: str):
        # 读取PDF文件
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page_num in range(len(reader.pages)):
                text += reader.pages[page_num].extract_text()
            return text

    @classmethod
    def read_markdown(cls, file_path: str):
        # 读取Markdown文件
        with open(file_path, 'r', encoding='utf-8') as file:
            md_text = file.read()
            html_text = markdown.markdown(md_text)
            # 使用BeautifulSoup从HTML中提取纯文本
            soup = BeautifulSoup(html_text, 'html.parser')
            plain_text = soup.get_text()
            # 使用正则表达式移除网址链接
            text = re.sub(r'http\S+', '', plain_text)
            return text

    @classmethod
    def read_text(cls, file_path: str):
        # 读取文本文件
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()


class Documents:
    """
        获取已分好类的json格式文档
    """

    def __init__(self, path: str = '') -> None:
        self.path = path

    def get_content(self):
        with open(self.path, mode='r', encoding='utf-8') as f:
            content = json.load(f)
        return content

2、文档嵌入模块

from typing import List

import numpy as np


class LocalEmbedding:
    def __init__(self, path: str) -> None:
        self.path = path
        self._model = self.load_model()

    def load_model(self):
        import torch
        from sentence_transformers import SentenceTransformer
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        model = SentenceTransformer(self.path, device=device, trust_remote_code=True)
        return model

    def get_embedding(self, text: str) -> List[float]:
        return self._model.encode([text])[0].tolist()

    @staticmethod
    def cosine_similarity(vector1: List[float], vector2: List[float]) -> float:
        """
        calculate cosine similarity between two vectors
        """
        dot_product = np.dot(vector1, vector2)
        magnitude = np.linalg.norm(vector1) * np.linalg.norm(vector2)
        if not magnitude:
            return 0
        return dot_product / magnitude

3、文档向量化模块

import os
from typing import Dict, List, Optional, Tuple, Union
import json
from embedding import LocalEmbedding
import numpy as np
from tqdm import tqdm


class VectorStore:
    def __init__(self, document: List[str] = ['']) -> None:
        self.document = document

    def get_vector(self, EmbeddingModel: LocalEmbedding) -> List[List[float]]:

        self.vectors = []
        for doc in tqdm(self.document, desc="Calculating embeddings"):
            self.vectors.append(EmbeddingModel.get_embedding(doc))
        return self.vectors

    def persist(self, path: str = 'ollama-python-main/zdf/storage'):
        if not os.path.exists(path):
            os.makedirs(path)
        with open(f"{path}/doecment.json", 'w', encoding='utf-8') as f:
            json.dump(self.document, f, ensure_ascii=False)
        if self.vectors:
            with open(f"{path}/vectors.json", 'w', encoding='utf-8') as f:
                json.dump(self.vectors, f)

    def load_vector(self, path: str = 'ollama-python-main/zdf/storage'):
        with open(f"{path}/vectors.json", 'r', encoding='utf-8') as f:
            self.vectors = json.load(f)
        with open(f"{path}/doecment.json", 'r', encoding='utf-8') as f:
            self.document = json.load(f)

    def get_similarity(self, vector1: List[float], vector2: List[float]) -> float:
        return LocalEmbedding.cosine_similarity(vector1, vector2)

    def query(self, query: str, EmbeddingModel: LocalEmbedding, k: int = 1) -> List[str]:
        query_vector = EmbeddingModel.get_embedding(query)
        result = np.array([self.get_similarity(query_vector, vector)
                           for vector in self.vectors])
        return np.array(self.document)[result.argsort()[-k:][::-1]].tolist()

4、rerank模块

from typing import List
import numpy as np


class BaseReranker:
    """
    Base class for reranker
    """

    def __init__(self, path: str) -> None:
        self.path = path

    def rerank(self, text: str, content: List[str], k: int) -> List[str]:
        raise NotImplementedError


class BgeReranker(BaseReranker):
    """
    class for Bge reranker
    """

    def __init__(self, path: str = 'ollama-python-main/zdf/bce-reranker-base_v1') -> None:
        super().__init__(path)
        self._model, self._tokenizer = self.load_model(path)

    def rerank(self, text: str, content: List[str], k: int) -> List[str]:
        import torch
        pairs = [(text, c) for c in content]
        with torch.no_grad():
            inputs = self._tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
            inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
            scores = self._model(**inputs, return_dict=True).logits.view(-1, ).float()
            index = np.argsort(scores.tolist())[-k:][::-1]
        return [content[i] for i in index]

    def load_model(self, path: str):
        import torch
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForSequenceClassification.from_pretrained(path).to(device)
        model.eval()
        return model, tokenizer

5、大模型模块

from typing import List, Tuple, Dict
import requests

PROMPT_TEMPLATE = """
你是一个能熟悉掌握文言文的专家。请仔细阅读以下问题和提供的相关文言文段落。根据问题内容，找出答案所在的段落，并用现代汉语总结给出答案。
问题: {question}
文言文参考段落：
···
{context}
···
如果给定的文言文段落中没有足够的信息回答问题，请直接回答“文中未提及相关信息”。
请给出你的回答：
"""

class BaseModel:
    def __init__(self, path: str = '') -> None:
        self.path = path

    def chat(self, prompt: str, history: List[dict], content: str) -> str:
        pass

    def load_model(self):
        pass

class OllamaChat:
    def __init__(self, model: str = "llama3.1") -> None:
        self.model = model

    def _build_messages(self, prompt: str, content: str):
        prompt_message = PROMPT_TEMPLATE.format(question=prompt, context=content)
        messages = [
            {"role": "system", "content": "你是一个AI助手，专注于理解和回答文言文问题。"},
            {"role": "user", "content": prompt_message}
        ]
        return messages

    def chat(self, prompt: str, history: List[Dict], content: str) -> str:
        import ollama
        # 给语言模型发送请求
        response = ollama.chat(
            model=self.model,
            messages=self._build_messages(prompt, content),
            stream=True
        )
        # 解析并组装响应结果
        final_response = ''
        for chunk in response:
            if isinstance(chunk, str):
                final_response += chunk
            elif 'content' in chunk.get('message', {}):
                final_response += chunk['message']['content']

        return final_response

二、测试

1、刚做的时候是没有向量库的，我们用这个代码

from vectorstore import VectorStore
from utils import ReadFiles
from model import OllamaChat
from embedding import LocalEmbedding


# 没有保存数据库
docs = ReadFiles('ollama-python-main/zdf/data').get_content(max_token_len=600, cover_content=150) # 获得data目录下的所有文件内容并分割
vector = VectorStore(docs)
embedding = LocalEmbedding("ollama-python-main/zdf/maidalun1020/bce-embedding-base_v1") # 创建EmbeddingModel
vector.get_vector(EmbeddingModel=embedding)
vector.persist(path='ollama-python-main/zdf/storage') # 将向量和文档内容保存到storage目录下，下次再用就可以直接加载本地的数据库

question = '三国演义第二回讲述了什么内容？'

content = vector.query(question, EmbeddingModel=embedding, k=1)[0]
chat = OllamaChat(model='llama3.1')
print(chat.chat(question, [], content))


# 保存数据库之后
# vector = VectorStore()
# 
# vector.load_vector('ollama-python-main/zdf/storage') # 加载本地的数据库
# 
# question = '深度学习有哪些应用场景？'
# 
# embedding = LocalEmbedding("ollama-python-main/zdf/maidalun1020/bce-embedding-base_v1") # 创建EmbeddingModel
# 
# content = vector.query(question, EmbeddingModel=embedding, k=1)[0]
# chat = OllamaChat(model='llama3.1')
# print(chat.chat(question, [], content))

2、做完向量库入库后，用如下代码

from vectorstore import VectorStore
from utils import ReadFiles
from model import OllamaChat
from embedding import LocalEmbedding


# 没有保存数据库
docs = ReadFiles('ollama-python-main/zdf/data').get_content(max_token_len=600, cover_content=150) # 获得data目录下的所有文件内容并分割
vector = VectorStore(docs)
embedding = LocalEmbedding("ollama-python-main/zdf/maidalun1020/bce-embedding-base_v1") # 创建EmbeddingModel
vector.get_vector(EmbeddingModel=embedding)
vector.persist(path='ollama-python-main/zdf/storage') # 将向量和文档内容保存到storage目录下，下次再用就可以直接加载本地的数据库

question = '三国演义第二回讲述了什么内容？'

content = vector.query(question, EmbeddingModel=embedding, k=1)[0]
chat = OllamaChat(model='llama3.1')
print(chat.chat(question, [], content))


# 保存数据库之后
# vector = VectorStore()
# 
# vector.load_vector('ollama-python-main/zdf/storage') # 加载本地的数据库
# 
# question = '深度学习有哪些应用场景？'
# 
# embedding = LocalEmbedding("ollama-python-main/zdf/maidalun1020/bce-embedding-base_v1") # 创建EmbeddingModel
# 
# content = vector.query(question, EmbeddingModel=embedding, k=1)[0]
# chat = OllamaChat(model='llama3.1')
# print(chat.chat(question, [], content))

3、给大家看看结果哈，纯属是已读乱回，我要哭死！！