Virtual Pet Simulator (VPet): Using ChatGLM in Place of ChatGPT

Open-source projects

Virtual Pet Simulator (VPet)
ChatGLM

Special thanks

To zkz098 for finding the bug and providing the solution.

1. Clone the code

cd Project
git clone https://github.com/THUDM/ChatGLM2-6B.git

2. Download the model implementation

GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/THUDM/chatglm2-6b
# This only downloads Git LFS placeholder files, not the actual model weights

3. Download the model files

Model files: https://cloud.tsinghua.edu.cn/d/674208019e314311ab5c/?p=%2Fchatglm2-6b&mode=list

Download all of these files and, when finished, replace the placeholder files downloaded in step 2 with them.
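It can be worth checking that the replacement actually took effect: Git LFS placeholder files are tiny text files beginning with a "version https://git-lfs" header, while the real weight shards are gigabytes in size. Below is a minimal check script, assuming the default chatglm2-6b directory layout; adjust MODEL_DIR to wherever you cloned the repository in step 2.

import os

MODEL_DIR = "chatglm2-6b"  # path of the repository cloned in step 2

# Git LFS pointer files start with this text instead of binary weight data
LFS_HEADER = b"version https://git-lfs"

for name in sorted(os.listdir(MODEL_DIR)):
    if not name.endswith(".bin"):  # weight shards, e.g. pytorch_model-00001-of-00007.bin
        continue
    path = os.path.join(MODEL_DIR, name)
    size_mb = os.path.getsize(path) / 1024 / 1024
    with open(path, "rb") as f:
        is_pointer = f.read(len(LFS_HEADER)) == LFS_HEADER
    if is_pointer:
        print(f"{name}: still an LFS placeholder ({size_mb:.2f} MB) - replace this file")
    else:
        print(f"{name}: real weight file ({size_mb:.0f} MB)")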

4. Modify openai_api.py

Update 2023/8/20

The stock openai_api.py does not return the usage field in its response, which causes an error when the API is called.

The following changes are needed (the complete modified file is shown below):

import time
import tiktoken
import torch
import uvicorn
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse


@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()


app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None


class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []


class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str


class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None


class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_length: Optional[int] = None
    stream: Optional[bool] = False


class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length"]


class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]]


class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[dict] = None  # streaming chunks are created without usage, so this field must be optional


@app.get("/v1/models", response_model=ModelList)
async def list_models():
    global model_args
    model_card = ModelCard(id="gpt-3.5-turbo")
    return ModelList(data=[model_card])


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    prev_messages = request.messages[:-1]
    if len(prev_messages) > 0 and prev_messages[0].role == "system":
        query = prev_messages.pop(0).content + query

    history = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
                history.append([prev_messages[i].content, prev_messages[i + 1].content])

    if request.stream:
        generate = predict(query, history, request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    response, _ = model.chat(tokenizer, query, history=history)
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    pt = len(encoding.encode(query))
    rt = len(encoding.encode(response))
    usage_data = {
        "prompt_tokens": pt,
        "completion_tokens": rt,
        "total_tokens": pt + rt
    }
    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion",
                                  usage=usage_data)


async def predict(query: str, history: List[List[str]], model_id: str):
    global model, tokenizer

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))

    current_length = 0

    for new_response, _ in model.stream_chat(tokenizer, query, history):
        if len(new_response) == current_length:
            continue

        new_text = new_response[current_length:]
        current_length = len(new_response)

        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=new_text),
            finish_reason=None
        )
        chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
        yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    yield '[DONE]'


if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).quantize(8).half().cuda()
    # model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).quantize(4).half().cuda()
    # model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).half().cuda()
    # "models/chatglm2-6b"改为你存放模型的路径
    # 多显卡支持,使用下面两行代替上面一行,将num_gpus改为你实际的显卡数量
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
    model.eval()

    uvicorn.run(app, host='0.0.0.0', port=18203, workers=1, root_path="/ChatGLM/OpenAPI")
    # The port must match the proxy_pass port in the nginx config below (18203 in this example); change both if you use a different one

5. Add the nginx configuration

  1. Install nginx (skip if already installed)

    apt install nginx -y

  2. Add the configuration (adjust to your own setup)

# /etc/nginx/nginx.conf -> add the following line inside the http{} block
	include /etc/nginx/myHost/*.conf;
# Create the directory /etc/nginx/myHost (adjust to your own setup)
cd /etc/nginx
mkdir myHost
cd myHost

# Create the file /etc/nginx/myHost/ChatGLM.conf
touch ChatGLM.conf

# Add the following content to ChatGLM.conf
server {
    listen 8080;
    server_name i-2.gpushare.com;  # Change this to your domain name

    location /ChatGLM/OpenAPI/ {  # Change this if you'd like to serve the API on a different path
        proxy_pass http://127.0.0.1:18203/; # Change this if the API service runs on a different port
        proxy_redirect off;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
    }
}

After saving the file, check the syntax and reload nginx so the configuration takes effect: nginx -t && systemctl reload nginx

6. Set up the Python environment

conda create -n glm python=3.8 -y
conda activate glm
pip install -r requirements.txt  # the requirements.txt from the ChatGLM2-6B repository
pip install tiktoken             # used for the token counting added in openai_api.py

7. Start the service

nohup python openai_api.py &
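Once the process is running, a quick sanity check against the uvicorn port (bypassing nginx) confirms the API is up. A minimal sketch, assuming the backend listens on port 18203 as in the example above:

import requests

# Query the backend directly to confirm uvicorn is listening
resp = requests.get("http://127.0.0.1:18203/v1/models", timeout=10)
resp.raise_for_status()
print(resp.json())  # should list the "gpt-3.5-turbo" model card defined in openai_api.py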

8. Connect the desktop pet to the API

In VPet, edit the ChatAPI settings: change the api-url to your own URL; the key can be left blank.
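It can also help to reproduce the request the desktop pet will send, going through nginx, and confirm the response now includes the usage field. A sketch, assuming the host, port, and path from the nginx example above; replace them with your own:

import requests

# Public URL exposed by nginx; adjust host, port, and path to your setup
API_URL = "http://i-2.gpushare.com:8080/ChatGLM/OpenAPI/v1/chat/completions"

payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "你好"}],
    "stream": False,
}
resp = requests.post(API_URL, json=payload, timeout=60)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])
print(data["usage"])  # the field added by the modified openai_api.py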
