方式一：手动执行脚本（在不同终端中依次运行以下命令）
# Start the FastChat controller first; the model workers register with it.
python3 -m fastchat.serve.controller
# Launch four ChatGLM2-6B model workers in 8-bit quantized mode:
# two pinned to GPU 0 (ports 31001/31002) and two to GPU 1 (ports 31003/31004).
# NOTE(review): "--worker" appears to rely on argparse prefix matching for
# "--worker-address" — confirm against the installed FastChat version.
CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory "44GiB" --port 31001 --worker http://localhost:31001 --load-8bit
CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory "44GiB" --port 31002 --worker http://localhost:31002 --load-8bit
CUDA_VISIBLE_DEVICES=1 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory "44GiB" --port 31003 --worker http://localhost:31003 --load-8bit
CUDA_VISIBLE_DEVICES=1 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory "44GiB" --port 31004 --worker http://localhost:31004 --load-8bit
# Finally, start the Gradio web front end.
python3 -m fastchat.serve.gradio_web_server --concurrency-count=150
方式二：使用以下 Python 脚本一键启动上述全部服务
import subprocess
import multiprocessing
import time
import fastchat.serve.gradio_web_server
def execute_command(command):
    """Run *command* through the shell, reporting failures without raising.

    A non-zero exit status is caught and printed so that one failing
    service command does not bring down the whole launcher process.
    """
    try:
        subprocess.run(command, shell=True, check=True)
    except subprocess.CalledProcessError as err:
        print(f"Error executing command: {err}")
if __name__ == "__main__":
    # Commands for the full FastChat stack: one controller, four
    # ChatGLM2-6B model workers in 8-bit mode (two per GPU, ports
    # 31001-31004), and the Gradio web front end. List order matters:
    # the controller is launched first so the workers can register with it.
    scripts = [
        "python3 -m fastchat.serve.controller",
        "CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory 44GiB --port 31001 --worker http://localhost:31001 --load-8bit",
        "CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory 44GiB --port 31002 --worker http://localhost:31002 --load-8bit",
        "CUDA_VISIBLE_DEVICES=1 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory 44GiB --port 31003 --worker http://localhost:31003 --load-8bit",
        "CUDA_VISIBLE_DEVICES=1 python3 -m fastchat.serve.model_worker --model-path /home/NLP/LLM/pretrained_model/LanguageModels/ChatGLM2_6B --limit-worker-concurrency 100 --max-gpu-memory 44GiB --port 31004 --worker http://localhost:31004 --load-8bit",
        "python3 -m fastchat.serve.gradio_web_server --concurrency-count=150"
    ]
    processes = []
    # Seconds to wait after launching each service; grows by `add_time`
    # per launch, so each successive service gets a longer settle time
    # (10 s, 15 s, 20 s, ...) before the next one starts.
    start_time = 10
    add_time = 5
    # One subprocess-running worker process per command, so all services
    # run concurrently under this single launcher.
    for script in scripts:
        p = multiprocessing.Process(target=execute_command, args=(script,))
        processes.append(p)
    # Staggered startup: sleep after each launch to give the controller
    # (and then each worker) time to initialize before dependents start.
    # NOTE(review): this also sleeps after the final launch — presumably
    # harmless, but confirm the extra delay before join() is intended.
    for p in processes:
        p.start()
        time.sleep(start_time)
        start_time += add_time
    # Block until every service process exits; in normal operation the
    # services run indefinitely, so this keeps the launcher alive.
    for p in processes:
        p.join()