Local Deployment of a FastGPT + One-API + ChatGLM3-6B Knowledge Base

1. Set up the local model

1. ChatGLM3-6b code:
https://github.com/THUDM/ChatGLM3

2. FastGPT:
https://github.com/labring/FastGPT

3. Vector model m3e (can be deployed with Docker):
https://huggingface.co/moka-ai/m3e-base

First install Anaconda3; it can be downloaded from the Tsinghua Open Source Mirror (Index of /anaconda/archive/).

First create an environment named chatglm3-demo:

conda create -n chatglm3-demo python=3.10
conda activate chatglm3-demo

ChatGLM3-6B requires Python 3.10 or higher.
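A quick, optional check that the activated environment meets this requirement (a minimal sketch; run it with the environment's python):

import sys

# ChatGLM3-6B needs Python 3.10+; fail fast if the interpreter is older
assert sys.version_info >= (3, 10), f"Python 3.10+ required, found {sys.version}"
print(sys.version)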

cd /ChatGLM3-main/composite_demo

Switch to the demo directory to test the model. Install requirements.txt first; you can remove torch from it and install it separately:

pip install -r requirements.txt

Code Interpreter also requires a Jupyter kernel. Create a system environment variable named IPYKERNEL with the value chatglm3-demo, then register the kernel:

ipython kernel install --name chatglm3-demo --user
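To confirm that the kernel is registered and the IPYKERNEL variable is visible, a small optional check (a sketch, not part of the ChatGLM3 repo; it assumes jupyter_client is available, which ships with Jupyter):

import os
from jupyter_client.kernelspec import KernelSpecManager

# List all registered Jupyter kernels and verify chatglm3-demo is among them
specs = KernelSpecManager().find_kernel_specs()   # {kernel_name: resource_dir}
print("registered kernels:", sorted(specs))
print("IPYKERNEL =", os.environ.get("IPYKERNEL"))  # should print chatglm3-demo
assert "chatglm3-demo" in specs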

Install the matching versions of PyTorch and torchvision:

torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl

Install them from the directory where the wheel files were downloaded:

pip install torch-2.0.0+cu117-cp310-cp310-win_amd64.whl
pip install torchvision-0.15.0+cu117-cp310-cp310-win_amd64.whl

The matching versions can be found at https://download.pytorch.org/whl/

Verify that the GPU will be used:

import torch
from transformers import __version__ as transformers_version
import torchvision

print("PyTorch VER:", torch.__version__)
print("Transformers version:", transformers_version)
print("TorchVision version:", torchvision.__version__)

# Check whether a GPU is available
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU TRUE")
else:
    print("GPU FALSE")

# Check versions of other libraries
# Additional library checks can be added here

Open composite_demo/client.py and change the model path.
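The relevant lines in client.py look roughly like the snippet below (the local path is a placeholder for wherever the weights were downloaded); alternatively, leave the file untouched and set the MODEL_PATH environment variable:

import os

# Point MODEL_PATH at the local chatglm3-6b weights; D:\models\chatglm3-6b is a placeholder
MODEL_PATH = os.environ.get('MODEL_PATH', r'D:\models\chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)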

Run the command to test; install any module that is reported missing:

streamlit run main.py

If GPU memory is below 12 GB, responses are too slow; after switching to a quantized model, conversation works normally (see the sketch below).

Once normal conversation works, the code and the environment are confirmed to run.
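A minimal sketch of loading the model with int4 quantization, the same pattern the API script below uses for low-VRAM GPUs (the path is a placeholder for the local weights directory):

from transformers import AutoTokenizer, AutoModel

MODEL_PATH = r'D:\models\chatglm3-6b'  # placeholder path to the local weights
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
# .quantize(4) loads an int4-quantized model, which fits comfortably under 12 GB of VRAM
model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True).half().quantize(4).cuda()
model = model.eval()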

Switch to D:\...\ChatGLM3-main\openai_api_demo

Edit openai_api.py to point to the local chatglm3 model path.

If there are other models, put them in the same directory.

Test with Postman, using the request body below:

{
  "model": "string",
  "messages": [
    {
      "role": "user",
      "content": "你好",
      "name": "string",
      "function_call": {
        "name": "string",
        "arguments": "string"
      }
    }
  ],
  "temperature": 0.8,
  "top_p": 0.8,
  "max_tokens": 0,
  "stream": false,
  "functions": {},
  "repetition_penalty": 1.1
}

Once the test succeeds, run:

python openai_api.py
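As an alternative to Postman, the endpoint can also be exercised from Python (a sketch; it assumes the API is listening on port 8000, as in the script below):

import requests

payload = {
    "model": "chatglm3-6b",
    "messages": [{"role": "user", "content": "你好"}],
    "temperature": 0.8,
    "top_p": 0.8,
    "max_tokens": 1024,
    "stream": False,
}
# POST to the OpenAI-compatible chat endpoint exposed by openai_api.py
resp = requests.post("http://localhost:8000/v1/chat/completions", json=payload, timeout=120)
print(resp.json()["choices"][0]["message"]["content"])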

The code of openai_api.py can be replaced with the following:

# coding=utf-8
# Implements API for ChatGLM3-6B in OpenAI's format. (https://platform.openai.com/docs/api-reference/chat)
# Usage: python openai_api.py
# Visit http://localhost:8000/docs for documents.

# In OpenAI's API, max_tokens corresponds to HuggingFace's max_new_tokens, not max_length.
# For example, for the 6B model, setting max_tokens = 8192 raises an error, because after subtracting the history and the prompt the model cannot output that many tokens.

import os
import time
import json
from contextlib import asynccontextmanager
from typing import List, Literal, Optional, Union

import torch
from torch.cuda import get_device_properties
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, AutoModel
from utils import process_response, generate_chatglm3, generate_stream_chatglm3

MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM_chatglm3-6b')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH)
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class FunctionCallResponse(BaseModel):
    name: Optional[str] = None
    arguments: Optional[str] = None


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system", "function"]
    content: str = None
    name: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None
    function_call: Optional[FunctionCallResponse] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    max_tokens: Optional[int] = None
    stream: Optional[bool] = False
    functions: Optional[Union[dict, List[dict]]] = None
    # Additional parameters
    repetition_penalty: Optional[float] = 1.1


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length", "function_call"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length", "function_call"]]


class UsageInfo(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0


class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[UsageInfo] = None


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    model_card = ModelCard(id="chatglm3-6b")
    return ModelList(data=[model_card])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if len(request.messages) < 1 or request.messages[-1].role == "assistant":
        raise HTTPException(status_code=400, detail="Invalid request")

    gen_params = dict(
        messages=request.messages,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens or 1024,
        echo=False,
        stream=request.stream,
        repetition_penalty=request.repetition_penalty,
        functions=request.functions,
    )

    logger.debug(f"==== request ====\n{gen_params}")

    if request.stream:

        # Use the stream mode to read the first few characters; if it is not a function call, directly stream the output
        predict_stream_generator = predict_stream(request.model, gen_params)
        output = next(predict_stream_generator)
        if not contains_custom_function(output):
            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")

        # Obtain the result directly at one time and determine whether tools need to be called.
        logger.debug(f"First result output:\n{output}")

        function_call = None
        if output and request.functions:
            try:
                function_call = process_response(output, use_tool=True)
            except:
                logger.warning("Failed to parse tool call")

        # CallFunction
        if isinstance(function_call, dict):
            function_call = FunctionCallResponse(**function_call)


            """
            In this demo, we did not register any tools.
            You can use the tools that have been implemented in our `tool_using` and implement your own streaming tool implementation here.
            Similar to the following method:
                function_args = json.loads(function_call.arguments)
                tool_response = dispatch_tool(tool_name: str, tool_params: dict)
            """
            tool_response = ""

            if not gen_params.get("messages"):
                gen_params["messages"] = []

            gen_params["messages"].append(ChatMessage(
                role="assistant",
                content=output,
            ))
            gen_params["messages"].append(ChatMessage(
                role="function",
                name=function_call.name,
                content=tool_response,
            ))

            # Streaming output of results after function calls
            generate = predict(request.model, gen_params)
            return EventSourceResponse(generate, media_type="text/event-stream")

        else:
            # Handled to avoid exceptions in the above parsing function process.
            generate = parse_output_text(request.model, output)
            return EventSourceResponse(generate, media_type="text/event-stream")

    # Here is the handling of stream = False
    response = generate_chatglm3(model, tokenizer, gen_params)

    # Remove the first newline character
    if response["text"].startswith("\n"):
        response["text"] = response["text"][1:]
    response["text"] = response["text"].strip()
    usage = UsageInfo()
    function_call, finish_reason = None, "stop"
    if request.functions:
        try:
            function_call = process_response(response["text"], use_tool=True)
        except:
            logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.")

    if isinstance(function_call, dict):
        finish_reason = "function_call"
        function_call = FunctionCallResponse(**function_call)

    message = ChatMessage(
        role="assistant",
        content=response["text"],
        function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
    )

    logger.debug(f"==== message ====\n{message}")

    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=message,
        finish_reason=finish_reason,
    )
    task_usage = UsageInfo.model_validate(response["usage"])
    for usage_key, usage_value in task_usage.model_dump().items():
        setattr(usage, usage_key, getattr(usage, usage_key) + usage_value)
    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage)


async def predict(model_id: str, params: dict):
    global model, tokenizer

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    previous_text = ""
    for new_response in generate_stream_chatglm3(model, tokenizer, params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(previous_text):]
        previous_text = decoded_unicode

        finish_reason = new_response["finish_reason"]
        if len(delta_text) == 0 and finish_reason != "function_call":
            continue

        function_call = None
        if finish_reason == "function_call":
            try:
                function_call = process_response(decoded_unicode, use_tool=True)
            except:
                logger.warning(
                    "Failed to parse tool call, maybe the response is not a tool call or have been answered.")

        if isinstance(function_call, dict):
            function_call = FunctionCallResponse(**function_call)

        delta = DeltaMessage(
            content=delta_text,
            role="assistant",
            function_call=function_call if isinstance(function_call, FunctionCallResponse) else None,
        )

        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=delta,
            finish_reason=finish_reason
        )
        chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
        yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def predict_stream(model_id, gen_params):
    """
    The function call is compatible with stream mode output.

    The first seven characters are determined.
    If not a function call, the stream output is directly generated.
    Otherwise, the complete character content of the function call is returned.

    :param model_id:
    :param gen_params:
    :return:
    """
    output = ""
    is_function_call = False
    has_send_first_chunk = False
    for new_response in generate_stream_chatglm3(model, tokenizer, gen_params):
        decoded_unicode = new_response["text"]
        delta_text = decoded_unicode[len(output):]
        output = decoded_unicode

        # When it is not a function call and the character length is > 7,
        # try to judge whether it is a function call according to the special function prefix
        if not is_function_call and len(output) > 7:

            # Determine whether a function is called
            is_function_call = contains_custom_function(output)
            if is_function_call:
                continue

            # Non-function call, direct stream output
            finish_reason = new_response["finish_reason"]

            # Send an empty string first to avoid truncation by subsequent next() operations.
            if not has_send_first_chunk:
                message = DeltaMessage(
                    content="",
                    role="assistant",
                    function_call=None,
                )
                choice_data = ChatCompletionResponseStreamChoice(
                    index=0,
                    delta=message,
                    finish_reason=finish_reason
                )
                chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
                yield "{}".format(chunk.model_dump_json(exclude_unset=True))

            send_msg = delta_text if has_send_first_chunk else output
            has_send_first_chunk = True
            message = DeltaMessage(
                content=send_msg,
                role="assistant",
                function_call=None,
            )
            choice_data = ChatCompletionResponseStreamChoice(
                index=0,
                delta=message,
                finish_reason=finish_reason
            )
            chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
            yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    if is_function_call:
        yield output
    else:
        yield '[DONE]'


async def parse_output_text(model_id: str, value: str):
    """
    Directly output the text content of value

    :param model_id:
    :param value:
    :return:
    """
    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant", content=value),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.model_dump_json(exclude_unset=True))
    yield '[DONE]'


def contains_custom_function(value: str) -> bool:
    """
    Determine whether the output is a 'function_call' according to a special function prefix.

    For example, the functions defined in "tool_using/tool_register.py" are all "get_xxx" and start with "get_"

    [Note] This is not a rigorous judgment method, only for reference.

    :param value:
    :return:
    """
    return value and 'get_' in value


if __name__ == "__main__":

    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
    model = AutoModel.from_pretrained(MODEL_PATH, trust_remote_code=True)  
    if torch.cuda.is_available():
        total_vram_in_gb = get_device_properties(0).total_memory / 1073741824
        print(f'\033[32m显存大小: {total_vram_in_gb:.2f} GB\033[0m')
        with torch.cuda.device(f'cuda:{0}'):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
        if total_vram_in_gb > 13:
            model = model.half().cuda()
            print(f'\033[32m使用显卡fp16精度运行\033[0m')
        elif total_vram_in_gb > 10:
            model = model.half().quantize(8).cuda()
            print(f'\033[32m使用显卡int8量化运行\033[0m')
        elif total_vram_in_gb > 4.5:
            model = model.half().quantize(4).cuda()
            print(f'\033[32m使用显卡int4量化运行\033[0m')
        else:
            model = model.float()
            print('\033[32m使用cpu运行\033[0m')
    else:
        model = model.float()
        print('\033[32m使用cpu运行\033[0m')
    model = model.eval()

# bilibili@十字鱼 https://space.bilibili.com/893892, with thanks to 秋葉aaaki and 大江户战士 for reference

    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)

2. Deploy One-API

One-API serves as the hub for calling the various models. Its documentation recommends Docker deployment; Ubuntu 20.04 works, with virtualization enabled on Windows. Here VirtualBox is used with VT-x/AMD-V enabled, which requires turning virtualization on in the BIOS (on some motherboards it is under the security settings). Add network port-forwarding rules for 3000, 13000, and so on, as needed.

Start Ubuntu 20.04; the software update may take a while. Install Code and Terminator for the steps that follow. First sort out permissions: checking Docker and the Docker daemon involves permission issues, so add your username to the docker group (running with administrator privileges is recommended):

sudo usermod -aG docker 用户名

Open Code, create a new terminal, and pull the one-api image, mapping it to port 13000:

docker run --name one-api -d --restart always -p 13000:3000 -e TZ=Asia/Shanghai -v /home/ubuntu/data/one-api:/data justsong/one-api

Go to localhost:13000 and log in as root with password 123456.

Add a chatglm3 channel with Base URL: http://localhost:8000

Then add an m3e channel with Base URL: http://localhost:6200

Add a new token and submit. Copy the first item under the arrow and paste it into a text file, for example: https://chat.oneapi.pro/#/?settings={"key":"sk-fAAfFClsyVXxvAgp57Ab758260124a958aF00a2d49CcB625","url":"http://localhost:3000"}
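To verify that One-API can reach the chatglm3 channel, the gateway can be called with the new token (a sketch using the openai Python package v1+; sk-xxx is a placeholder for your own key, and the model name must match the one configured on the channel):

from openai import OpenAI

# Talk to One-API on port 13000, not to the model server directly
client = OpenAI(base_url="http://localhost:13000/v1", api_key="sk-xxx")
reply = client.chat.completions.create(
    model="chatglm3-6b",  # placeholder: use the model name set on the One-API channel
    messages=[{"role": "user", "content": "你好"}],
)
print(reply.choices[0].message.content)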

Deploy the m3e model with Docker. By default it runs on CPU:

docker run -d -p 6200:6008 --name=m3e-large-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest

To run on GPU:

docker run -d -p 6200:6008 --gpus all --name=m3e-large-api registry.cn-hangzhou.aliyuncs.com/fastgpt_docker/m3e-large-api:latest

The original image:

docker run -d -p 6200:6008 --name=m3e-large-api stawky/m3e-large-api:latest

After it starts successfully, test it; a set of embedding vectors in the response indicates a successful deployment:

curl --location --request POST 'http://localhost:6200/v1/embeddings' \
--header 'Authorization: Bearer sk-aaabbbcccdddeeefffggghhhiiijjjkkk' \
--header 'Content-Type: application/json' \
--data-raw '{
    "model": "m3e",
    "input": ["laf是什么"]
}'
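The same check can be done from Python instead of curl (a sketch that mirrors the request above):

import requests

resp = requests.post(
    "http://localhost:6200/v1/embeddings",
    headers={"Authorization": "Bearer sk-aaabbbcccdddeeefffggghhhiiijjjkkk"},
    json={"model": "m3e", "input": ["laf是什么"]},
    timeout=30,
)
vector = resp.json()["data"][0]["embedding"]
print(len(vector))  # dimension of the returned embedding vector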

3. Deploy FastGPT

FastGPT is also deployed on Linux; Ubuntu 20.04 is used again here. Open Code and create a new terminal.

Download the docker-compose file:

curl -O https://raw.githubusercontent.com/labring/FastGPT/main/files/deploy/fastgpt/docker-compose.yml

Download the config file:

curl -O https://raw.githubusercontent.com/labring/FastGPT/main/projects/app/data/config.json

Pull the images: docker-compose pull

Start the containers in the background: docker-compose up -d

Since FastGPT 4.6.8, the mongo replica set must be initialized manually:

# Check that the mongo container is running
docker ps
# Enter the container
docker exec -it mongo bash

# Connect to the database
mongo -u myname -p mypassword --authenticationDatabase admin

# Initialize the replica set. For external access, mongo:27017 can be changed to ip:27017, but the FastGPT connection parameter must be changed to match (MONGODB_URI=mongodb://myname:mypassword@mongo:27017/fastgpt?authSource=admin => MONGODB_URI=mongodb://myname:mypassword@ip:27017/fastgpt?authSource=admin)
rs.initiate({
  _id: "rs0",
  members: [
    { _id: 0, host: "mongo:27017" }
  ]
})
# Check the status. If rs0 status is reported, the replica set is running
rs.status()
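The replica set can also be checked from the host with pymongo (an optional sketch; it assumes pymongo is installed, that the mongo container publishes port 27017 to the host, and that the myname/mypassword credentials from docker-compose.yml are in use):

from pymongo import MongoClient

client = MongoClient(
    "mongodb://myname:mypassword@localhost:27017/fastgpt?authSource=admin",
    directConnection=True,            # connect to this single member directly
    serverSelectionTimeoutMS=5000,
)
status = client.admin.command("replSetGetStatus")
print(status["set"], status["myState"])  # expect: rs0 1 (primary)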

In the docker-compose file, set OPENAI_BASE_URL to http://localhost:13000/v1

This connects to One-API's port; change localhost to the machine's actual local address.

In the docker-compose file, set CHAT_API_KEY to the key copied from the One-API token.

Modify the config file; the following can be copied directly:

{
  "systemEnv": {
    "openapiPrefix": "fastgpt",
    "vectorMaxProcess": 15,
    "qaMaxProcess": 15,
    "pgHNSWEfSearch": 100
  },
  "llmModels": [
    {
      "model": "chatglm3",
      "name": "chatglm3",
      "maxContext": 4000,
      "maxResponse": 4000,
      "quoteMaxToken": 2000,
      "maxTemperature": 1,
      "vision": false,
      "defaultSystemChatPrompt": ""
    },
    {
      "model": "gpt-3.5-turbo-1106", 
      "name": "gpt-3.5-turbo", 
      "maxContext": 16000, 
      "maxResponse": 4000, 
      "quoteMaxToken": 13000, 
      "maxTemperature": 1.2, 
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false, 
      "datasetProcess": false, 
      "toolChoice": true, 
      "functionCall": false, 
      "customCQPrompt": "", 
      "customExtractPrompt": "", 
      "defaultSystemChatPrompt": "", 
      "defaultConfig":{}  
    },
    {
      "model": "gpt-3.5-turbo-16k",
      "name": "gpt-3.5-turbo-16k",
      "maxContext": 16000,
      "maxResponse": 16000,
      "quoteMaxToken": 13000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": true,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig":{}
    },
    {
      "model": "gpt-4-0125-preview",
      "name": "gpt-4-turbo",
      "maxContext": 125000,
      "maxResponse": 4000,
      "quoteMaxToken": 100000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": false,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig":{}
    },
    {
      "model": "gpt-4-vision-preview",
      "name": "gpt-4-vision",
      "maxContext": 128000,
      "maxResponse": 4000,
      "quoteMaxToken": 100000,
      "maxTemperature": 1.2,
      "inputPrice": 0,
      "outputPrice": 0,
      "censor": false,
      "vision": false,
      "datasetProcess": false,
      "toolChoice": true,
      "functionCall": false,
      "customCQPrompt": "",
      "customExtractPrompt": "",
      "defaultSystemChatPrompt": "",
      "defaultConfig":{}
    }
  ],
  "vectorModels": [
    {
      "model": "m3e",
      "name": "m3e",
      "price": 0.1,
      "defaultToken": 500,
      "maxToken": 1800
    },
    {
      "model": "text-embedding-ada-002",
      "name": "Embedding-2",
      "inputPrice": 0,
      "outputPrice": 0,
      "defaultToken": 700,
      "maxToken": 3000,
      "weight": 100,
      "defaultConfig":{}  
    }
  ],
  "reRankModels": [],
  "audioSpeechModels": [
    {
      "model": "tts-1",
      "name": "OpenAI TTS1",
      "inputPrice": 0,
      "outputPrice": 0,
      "voices": [
        { "label": "Alloy", "value": "alloy", "bufferId": "openai-Alloy" },
        { "label": "Echo", "value": "echo", "bufferId": "openai-Echo" },
        { "label": "Fable", "value": "fable", "bufferId": "openai-Fable" },
        { "label": "Onyx", "value": "onyx", "bufferId": "openai-Onyx" },
        { "label": "Nova", "value": "nova", "bufferId": "openai-Nova" },
        { "label": "Shimmer", "value": "shimmer", "bufferId": "openai-Shimmer" }
      ]
    }
  ],
  "whisperModel": {
    "model": "whisper-1",
    "name": "Whisper1",
    "inputPrice": 0,
    "outputPrice": 0
  }
}

If all steps succeed, visit port 3000 (replace localhost with the machine's actual address) to reach the FastGPT page. The local chatglm3 model's API must be running.

