FastAPI 部署 LLaMA
服务端代码
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, LlamaForCausalLM
import torch
app = FastAPI()
class Query(BaseModel):
text: str
device = torch.device("cuda:0")
model_path = 'llama-2-7b-chat-hf'
model = LlamaForCausalLM.from_pretrained(model_path, device_map=