Deploy Qwen2-7B-Instruct locally with vLLM.
1. Install vLLM
Create a virtual environment:
conda create -n myvllm python=3.11 -y
conda activate myvllm
pip install -U pip
Install Ray and vLLM:
pip install ray
pip install vllm  # or build from source:
git clone https://github.com/vllm-project/vllm.git; cd vllm
pip install ./ --no-build-isolation
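A quick sanity check that both packages import cleanly (a minimal sketch; it only verifies the installation, not GPU visibility):

```python
# Confirm Ray and vLLM are importable and print their versions
import ray
import vllm

print("ray:", ray.__version__, "| vllm:", vllm.__version__)
```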
Install flash-attention:
# git clone https://github.com/Dao-AILab/flash-attention;cd flash-attention
# pip install ./ --no-build-isolation
pip install flash-attn --no-build-isolation
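Likewise, to confirm the flash-attn build succeeded (assumes a CUDA build of PyTorch is already installed):

```python
# Confirm the compiled flash-attn extension loads
import flash_attn

print("flash-attn:", flash_attn.__version__)
```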
- Deploy Qwen2-7B-Instruct locally
eval "$(conda shell.bash hook)"
conda activate myvllm
export CUDA_VISIBLE_DEVICES=3,2,1,0
python -m vllm.entrypoints.openai.api_server \
    --trust-remote-code \
    --served-model-name gpt-4 \
    --model Qwen/Qwen2-7B-Instruct \
    --gpu-memory-utilization 0.98 \
    --tensor-parallel-size 4 \
    --port 8000
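Once the server is up it exposes an OpenAI-compatible API, so any OpenAI client can talk to it. A minimal sketch (assumes the server above is running on localhost:8000; the model name must match --served-model-name, and the API key is a placeholder since vLLM does not validate it by default):

```python
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="gpt-4",  # must match --served-model-name above
    messages=[{"role": "user", "content": "Introduce Qwen2 in one sentence."}],
)
print(resp.choices[0].message.content)
```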
- Use Qwen2-7B-Instruct
Call the model through llama.cpp via LangChain's LlamaCpp wrapper (requires the llama-cpp-python package). Save the script below, e.g. as `llama.py` (the file name is arbitrary), and run it with the GGUF model path as the first argument: `python llama.py /path/to/model.gguf`.
import sys

from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Step-by-step reasoning prompt with a single {question} input variable
template = """Question: {question}
Answer: Let's work this out in a step by step way to be sure we have the right answer."""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Path to the GGUF model file, passed as the first command-line argument
model_name = sys.argv[1]
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# Make sure the model path is correct for your system!
llm = LlamaCpp(
model_path=model_name,
temperature=0.75,
max_tokens=2000,
top_p=1,
callback_manager=callback_manager,
verbose=True, # Verbose is required to pass to the callback manager
)
# Simple interactive loop: read a question, stream the answer token by token
llm_chain = LLMChain(prompt=prompt, llm=llm)
while True:
    inputs = input("Input: ")
    llm_chain.run(inputs)
    print()
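Note that the LlamaCpp script above loads a GGUF file directly and does not talk to the vLLM server started earlier. To query that server from LangChain instead, one option is the OpenAI-compatible chat wrapper (a sketch assuming `pip install langchain-openai` and the server from the previous step):

```python
from langchain_openai import ChatOpenAI

# Reach the local vLLM server through its OpenAI-compatible endpoint
llm = ChatOpenAI(
    base_url="http://localhost:8000/v1",
    api_key="EMPTY",  # placeholder; vLLM does not validate keys by default
    model="gpt-4",    # must match --served-model-name
)
print(llm.invoke("Hello, who are you?").content)
```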