vLLM for LLM Inference Serving and Client Access Methods, Part 1: openai
flyfish
# (Recommended) Create a new conda environment.
conda create -n myenv python=3.10 -y
conda activate myenv
pip install vllm
To make fetching the model more convenient, tell vLLM to download it from ModelScope:
export VLLM_USE_MODELSCOPE=True
Start the server
vLLM provides an HTTP server that implements OpenAI's Completions and Chat APIs.
vllm serve LLM-Research/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
The resulting server output:
(myenv) sisyphus@sisyphus-Super-Server:~/tool$ vllm serve LLM-Research/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
INFO 05-06 12:10:04 api_server.py:528] vLLM API server version 0.6.3.post1
INFO 05-06 12:10:04 api_server.py:529] args: Namespace(subparser='serve', model_tag='LLM-Research/Meta-Llama-3-8B-Instruct', config='', host=None, port=8000, uvicorn_log_level='info', allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key='token-abc123', lora_modules=None, prompt_adapters=None, chat_template=None, response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='LLM-Research/Meta-Llama-3-8B-Instruct', tokenizer=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', quantization_param_path=None, max_model_len=None, guided_decoding_backend='outlines', distributed_executor_backend=None, worker_use_ray=False, pipeline_parallel_size=1, tensor_parallel_size=1, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=16, enable_prefix_caching=False, disable_sliding_window=False, use_v2_block_manager=False, num_lookahead_slots=0, seed=0, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.9, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_seqs=256, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, enforce_eager=False, max_context_len_to_capture=None, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, enable_lora=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_model=None, speculative_model_quantization=None, num_speculative_tokens=None, speculative_disable_mqa_scorer=False, speculative_draft_tensor_parallel_size=None, speculative_max_model_len=None, speculative_disable_by_batch_size=None, ngram_prompt_lookup_max=None, ngram_prompt_lookup_min=None, spec_decoding_acceptance_method='rejection_sampler', typical_acceptance_sampler_posterior_threshold=None, typical_acceptance_sampler_posterior_alpha=None, disable_logprobs_during_spec_decoding=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, override_neuron_config=None, scheduling_policy='fcfs', disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, dispatch_function=<function serve at 0x730f67179090>)
INFO 05-06 12:10:04 api_server.py:166] Multiprocessing frontend to use ipc:///tmp/0079ed20-507b-4cca-ad3b-0b004dd0a3ad for IPC Path.
INFO 05-06 12:10:04 api_server.py:179] Started engine process with PID 526791
WARNING 05-06 12:10:08 arg_utils.py:1019] [DEPRECATED] Block manager v1 has been removed, and setting --use-v2-block-manager to True or False has no effect on vLLM behavior. Please remove --use-v2-block-manager in your engine argument. If your use case is not supported by SelfAttnBlockSpaceManager (i.e. block manager v2), please file an issue with detailed information.
WARNING 05-06 12:10:12 arg_utils.py:1019] [DEPRECATED] Block manager v1 has been removed, and setting --use-v2-block-manager to True or False has no effect on vLLM behavior. Please remove --use-v2-block-manager in your engine argument. If your use case is not supported by SelfAttnBlockSpaceManager (i.e. block manager v2), please file an issue with detailed information.
INFO 05-06 12:10:12 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='LLM-Research/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='LLM-Research/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=LLM-Research/Meta-Llama-3-8B-Instruct, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=True, mm_processor_kwargs=None)
INFO 05-06 12:10:25 model_runner.py:1056] Starting to load model LLM-Research/Meta-Llama-3-8B-Instruct...
Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 25% Completed | 1/4 [00:21<01:04, 21.35s/it]
Loading safetensors checkpoint shards: 50% Completed | 2/4 [00:26<00:24, 12.05s/it]
Loading safetensors checkpoint shards: 75% Completed | 3/4 [00:49<00:17, 17.08s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:12<00:00, 19.24s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [01:12<00:00, 18.13s/it]
INFO 05-06 12:11:39 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 05-06 12:11:40 gpu_executor.py:122] # GPU blocks: 13376, # CPU blocks: 2048
INFO 05-06 12:11:40 gpu_executor.py:126] Maximum concurrency for 8192 tokens per request: 26.12x
INFO 05-06 12:11:42 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-06 12:11:42 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-06 12:11:53 model_runner.py:1523] Graph capturing finished in 11 secs.
INFO 05-06 12:11:53 api_server.py:232] vLLM to use /tmp/tmp5taum8_b as PROMETHEUS_MULTIPROC_DIR
WARNING 05-06 12:11:53 serving_embedding.py:199] embedding_mode is False. Embedding API will not work.
INFO 05-06 12:11:53 launcher.py:19] Available routes are:
INFO 05-06 12:11:53 launcher.py:27] Route: /openapi.json, Methods: GET, HEAD
INFO 05-06 12:11:53 launcher.py:27] Route: /docs, Methods: GET, HEAD
INFO 05-06 12:11:53 launcher.py:27] Route: /docs/oauth2-redirect, Methods: GET, HEAD
INFO 05-06 12:11:53 launcher.py:27] Route: /redoc, Methods: GET, HEAD
INFO 05-06 12:11:53 launcher.py:27] Route: /health, Methods: GET
INFO 05-06 12:11:53 launcher.py:27] Route: /tokenize, Methods: POST
INFO 05-06 12:11:53 launcher.py:27] Route: /detokenize, Methods: POST
INFO 05-06 12:11:53 launcher.py:27] Route: /v1/models, Methods: GET
INFO 05-06 12:11:53 launcher.py:27] Route: /version, Methods: GET
INFO 05-06 12:11:53 launcher.py:27] Route: /v1/chat/completions, Methods: POST
INFO 05-06 12:11:53 launcher.py:27] Route: /v1/completions, Methods: POST
INFO 05-06 12:11:53 launcher.py:27] Route: /v1/embeddings, Methods: POST
INFO: Started server process [526727]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on socket ('0.0.0.0', 8000) (Press CTRL+C to quit)
INFO 05-06 12:12:03 metrics.py:349] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 0.0%, CPU KV cache usage: 0.0%.
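Once the log shows Uvicorn listening on port 8000, a quick sanity check is to list the served models through the OpenAI-compatible /v1/models route shown above. A minimal sketch, assuming the server is running locally with the same API key:

from openai import OpenAI

# Assumes the server started above is reachable on localhost:8000
# and was launched with --api-key token-abc123.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

# /v1/models is one of the routes listed in the startup log.
for model in client.models.list().data:
    print(model.id)  # expected: LLM-Research/Meta-Llama-3-8B-Instruct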
Request method 1: synchronous client
from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

completion = client.chat.completions.create(
    model="LLM-Research/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "user", "content": "Who are you?"}
    ],
)

print(completion.choices[0].message)
Result
I'm LLaMA, an AI assistant developed by Meta AI that can understand and respond to human input in a conversational manner. I'm not a human, but a computer program designed to simulate conversation, answer questions, and even generate text.
I'm trained on a massive dataset of text from the internet and can generate human-like responses to a wide range of topics and questions. My primary function is to assist and provide helpful information to users like you, whether it's answering a question, providing definitions, or simply chatting about a topic.
I don't have personal opinions or emotions, but I'm designed to be friendly and helpful. I can understand and respond to natural language inputs, and I'm constantly learning and improving my responses based on the interactions I have with users like you.
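The same call also accepts the usual OpenAI sampling parameters, which vLLM maps onto its own sampling settings. A hedged sketch of the previous request with a system prompt and explicit sampling controls (the values here are illustrative, not tuned):

completion = client.chat.completions.create(
    model="LLM-Research/Meta-Llama-3-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Who are you?"},
    ],
    temperature=0.7,   # illustrative values, not tuned
    max_tokens=128,
)
print(completion.choices[0].message.content)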
Request method 2: streaming with the async client
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)

async def main():
    stream = await client.chat.completions.create(
        model="LLM-Research/Meta-Llama-3-8B-Instruct",
        messages=[{"role": "user", "content": "Who are you?"}],
        stream=True,
    )
    async for chunk in stream:
        # To print only the incremental text, use:
        # print(chunk.choices[0].delta.content or "", end="")
        print(chunk)

asyncio.run(main())
Result
ChatCompletionChunk(id='chat-id', choices=[Choice(delta=ChoiceDelta(content='', function_call=None, refusal=None, role='assistant', tool_calls=None), finish_reason=None, index=0, logprobs=None)], created=1729835342, model='LLM-Research/Meta-Llama-3-8B-Instruct', object='chat.completion.chunk', service_tier=None, system_fingerprint=None, usage=None)
......
ChatCompletionChunk(id='chat-id', choices=[Choice(delta=ChoiceDelta(content=' to', function_call=None, refusal=None, role=None, tool_calls=None), finish_reason=None, index=0, logprobs=None)], created=1729835342, model='LLM-Research/Meta-Llama-3-8B-Instruct', object='chat.completion.chunk', service_tier=None, system_fingerprint=None, usage=None)
ChatCompletionChunk(id='chat-id', choices=[Choice(delta=ChoiceDelta(content=' your', function_call=None, refusal=None, role=None, tool_calls=None), finish_reason=None, index=0, logprobs=None)], created=1729835342, model='LLM-Research/Meta-Llama-3-8B-Instruct', object='chat.completion.chunk', service_tier=None, system_fingerprint=None, usage=None)
ChatCompletionChunk(id='chat-id', choices=[Choice(delta=ChoiceDelta(content=' needs', function_call=None, refusal=None, role=None, tool_calls=None), finish_reason=None, index=0, logprobs=None)], created=1729835342, model='LLM-Research/Meta-Llama-3-8B-Instruct', object='chat.completion.chunk', service_tier=None, system_fingerprint=None, usage=None)
ChatCompletionChunk(id='chat-id', choices=[Choice(delta=ChoiceDelta(content='.', function_call=None, refusal=None, role=None, tool_calls=None), finish_reason=None, index=0, logprobs=None)], created=1729835342, model='LLM-Research/Meta-Llama-3-8B-Instruct', object='chat.completion.chunk', service_tier=None, system_fingerprint=None, usage=None)
ChatCompletionChunk(id='chat-id', choices=[Choice(delta=ChoiceDelta(content='', function_call=None, refusal=None, role=None, tool_calls=None), finish_reason='stop', index=0, logprobs=None, stop_reason=None)], created=1729835342, model='LLM-Research/Meta-Llama-3-8B-Instruct', object='chat.completion.chunk', service_tier=None, system_fingerprint=None, usage=None)
For readability, one of these chunks rendered as JSON:
{
"id": "chat-id",
"choices": [
{
"delta": {
"content": " to"
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1729835342,
"model": "LLM-Research/Meta-Llama-3-8B-Instruct",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": null,
"usage": null
}
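In practice you usually want the concatenated text rather than the raw chunks. A small variation of the async example above that collects the deltas into one string (same server address, model name, and API key assumed):

import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

async def main() -> str:
    stream = await client.chat.completions.create(
        model="LLM-Research/Meta-Llama-3-8B-Instruct",
        messages=[{"role": "user", "content": "Who are you?"}],
        stream=True,
    )
    parts = []
    async for chunk in stream:
        # Each chunk carries a small piece of text in choices[0].delta.content;
        # the final chunk has finish_reason="stop" and empty content.
        parts.append(chunk.choices[0].delta.content or "")
    return "".join(parts)

print(asyncio.run(main()))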