MLC LLM: Local AI Deployment

sudo apt install gcc
sudo apt install git git-lfs
sudo apt install zip unzip
sudo apt install cargo ccache cmake lib64gcc-12-dev libgcc-12-dev
sudo apt install cuda-keyring cuda cuda-toolkit-12-1
sudo apt install libglvnd-dev
sudo apt install clang
sudo apt install pciutils
sudo apt install libvulkan1 vulkan-tools mesa-vulkan-drivers
sudo apt install build-essential
# Pick one driver flavor: the open kernel module build or the proprietary build
sudo apt install nvidia-driver-525-open nvidia-headless-525-open
sudo apt install nvidia-driver-525 nvidia-headless-525
sudo apt install net-tools # netstat -nlpt
sudo apt install llvm-dev llvm-spirv spirv-headers spirv-tools libllvm12 libllvmspirvlib-dev libspirv-cross-c-shared-dev 
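
A quick sanity check that the main toolchain pieces are in place (the version numbers are simply what this guide targets; vulkaninfo will only report the GPU once the NVIDIA Vulkan driver below is installed):

gcc --version
clang --version
cmake --version
nvcc --version            # from cuda-toolkit-12-1; may require /usr/local/cuda-12.1/bin on PATH first
vulkaninfo | head -n 5    # from vulkan-tools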

Install the NVIDIA Vulkan driver

# This may fail; install open-gpu-kernel-modules first (next step)
wget https://us.download.nvidia.com/XFree86/Linux-x86_64/525.116.04/NVIDIA-Linux-x86_64-525.116.04.run
sudo bash ./NVIDIA-Linux-x86_64-525.116.04.run

Install open-gpu-kernel-modules

wget -O open-gpu-kernel-modules-525.47.26.zip https://codeload.github.com/NVIDIA/open-gpu-kernel-modules/zip/refs/tags/525.47.26
unzip open-gpu-kernel-modules-525.47.26.zip
cd open-gpu-kernel-modules-525.47.26/
make modules -j$(nproc)

Note: this step may fail with: /lib/modules/5.15.90.1-microsoft-standard-WSL2/build: No such file or directory. Stop.

Fix 1:
# may not work on WSL; if it fails, use fix 2
sudo apt install linux-headers-$(uname -r)
Fix 2:
# create a symlink with the WSL kernel's name that points to the newest installed kernel
ls /lib/modules/
# 5.15.0-69-generic  5.15.0-73-generic  5.15.90.1-microsoft-standard-WSL2
sudo rm -rf /lib/modules/5.15.90.1-microsoft-standard-WSL2
sudo ln -s /lib/modules/5.15.0-73-generic /lib/modules/5.15.90.1-microsoft-standard-WSL2
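
Before re-running make, it can help to confirm the symlink now resolves to a kernel tree that actually contains build headers (the header package named below is an assumption based on the kernel listed above):

ls -ld /lib/modules/5.15.90.1-microsoft-standard-WSL2
ls /lib/modules/5.15.90.1-microsoft-standard-WSL2/build
# if build/ is missing, install the matching headers, e.g. sudo apt install linux-headers-5.15.0-73-generic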

Continue building open-gpu-kernel-modules

make modules -j$(nproc)
sudo make modules_install -j$(nproc)

Once the modules are installed, re-run the NVIDIA Vulkan driver installer

sudo bash NVIDIA-Linux-x86_64-525.116.04.run

Install Python packages

# pip install mlc-ai-nightly -f https://mlc.ai/wheels --upgrade
# pip install --pre mlc-ai-nightly-cu118 -f https://mlc.ai/wheels
pip install mlc_ai_nightly_cu121 -f https://mlc.ai/wheels
pip install mlc_ai_nightly_vulkan -f https://mlc.ai/wheels
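
Note that the cu121 and vulkan nightlies both appear to ship the same importable tvm package, so whichever is installed last wins. A quick diagnostic to see which build is actually active (tvm.support.libinfo() is standard TVM, but treat the exact keys as an assumption):

python -c "import tvm; print(tvm.__file__)"
python -c "import tvm; info = tvm.support.libinfo(); print(info.get('USE_CUDA'), info.get('USE_VULKAN'))"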


# pip install tvm #
# pip install apache-tvm
pip install torch transformers --upgrade
pip install nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-nvtx-cu12 nvidia-pyindex pycuda
pip install ninja "fastapi>=0.93.0" --upgrade

python -c "import tvm; tvm.support.describe()"
nvidia-smi
#| NVIDIA-SMI 525.105.17   Driver Version: 531.41       CUDA Version: 12.1     |
lspci
#25fc:00:00.0 3D controller: Microsoft Corporation Basic Render Driver
#9ac5:00:00.0 3D controller: Microsoft Corporation Basic Render Driver

Configure environment variables

PYTHON_USER_SITE=$(python -m site --user-site)
# e.g. /home/username/.local/lib/python3.10/site-packages
HOME_DIR=/mnt/f/
MLC_HOME=$HOME_DIR/mlc-llm
export LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$PYTHON_USER_SITE/tensorrt/:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$PYTHON_USER_SITE/mlc_ai_nightly_cu121.libs/:$LD_LIBRARY_PATH
#export LD_LIBRARY_PATH=$PYTHON_USER_SITE/tvm:$LD_LIBRARY_PATH
export LD_LIBRARY_PATH=$PYTHON_USER_SITE/nvidia/cuda_nvrtc/lib/:$LD_LIBRARY_PATH
export PATH=$PATH:/usr/local/cuda-12.1/bin
export TVM_HOME=$MLC_HOME/3rdparty/tvm
sudo ldconfig
ldconfig -p | grep cuda
python -c "import tensorflow as tf; print (tf.config.list_physical_devices('GPU'))"	

Download the MLC source code

cd $HOME_DIR
git clone --recursive https://github.com/mlc-ai/mlc-llm.git
cd $MLC_HOME
git submodule update --init --recursive --remote
# Note: check inside 3rdparty/tvm/ and 3rdparty/tvm/3rdparty/ to confirm the submodules really were fetched recursively.
# If they were not, change the url for `3rdparty/tvm` in `mlc-llm/.gitmodules` from `https://github.com/mlc-ai/relax.git` to `git@github.com:mlc-ai/relax.git`
# and run `git submodule update --init --recursive --remote` again.
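
A quick way to confirm the recursion really fetched everything (a `-` prefix in the status output or an empty directory means a submodule is missing):

git submodule status --recursive | head
ls 3rdparty/tvm/3rdparty/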

Build and install MLC

cd $MLC_HOME
# Create config.cmake
python cmake/gen_cmake_config.py

## Or write it out directly
echo "set(CMAKE_BUILD_TYPE RelWithDebInfo)" > config.cmake
echo "set(USE_CUDA ON)" >> config.cmake
echo "set (USE_VULKAN ON)" >> config.cmake
echo "set (USE_METAL OFF)" >> config.cmake

# Create the build directory
mkdir -p build

rm -rf $MLC_HOME/build/* &&
    cd $MLC_HOME/build/ &&
    cp ../config.cmake  . &&
    cmake ..&&
    make clean &&
    make -j$(nproc) &&
    sudo make install
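
If the build and install succeeded, mlc_chat_cli should now be on the PATH; ldd also shows which libtvm runtime it will load, which matters for the library juggling described later:

which mlc_chat_cli
ldd $(which mlc_chat_cli) | grep tvm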


# Install the mlc_chat rest module
cd $MLC_HOME/python
# Symlink the package into user site-packages so `python -m mlc_chat.rest` can find it
ln -s $MLC_HOME/python/mlc_chat $PYTHON_USER_SITE/mlc_chat
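
To confirm the symlink works, the package should now import from anywhere (assuming the user site-packages directory is on sys.path):

python -c "import mlc_chat; print(mlc_chat.__file__)"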

Download / build models

# build.py downloads the model automatically into dist/models/; you can also download it manually and place it there
# After building, mlc-llm/dist/dolly-v2-3b-q3f16_0 contains: debug, dolly-v2-3b-q3f16_0-cuda.so, mod_cache_before_build_cuda.pkl, params
# dolly-v2-3b-q3f16_0-cuda.so is the cuda (or vulkan) library you later point the cli or rest server at
cd $MLC_HOME

# Note: when running the python commands, libtvm.so and libtvm_runtime.so must be on the loader path; copy them to /usr/local/lib first
sudo cp $PYTHON_USER_SITE/tvm/*.so /usr/local/lib/

python build.py --hf-path=databricks/dolly-v2-3b --target=cuda
python build.py --hf-path=databricks/dolly-v2-12b --target=cuda
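
Once build.py finishes, it is worth checking that the artifacts described above are in place (directory name assumes the default q3f16_0 quantization):

ls dist/dolly-v2-3b-q3f16_0/
# expected: debug  dolly-v2-3b-q3f16_0-cuda.so  mod_cache_before_build_cuda.pkl  params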

# You can also download the model manually and then build
#cd $MLC_HOME/dist/models
#git clone https://huggingface.co/databricks/dolly-v2-3b

# Prebuilt models cannot be rebuilt for cuda; it fails with: AssertionError: Model path must contain valid config file.
# python build.py --hf-path=mlc-ai/mlc-chat-vicuna-v1-7b-q3f16_0 --target=cuda

Download prebuilt models

cd $MLC_HOME
mkdir -p dist/prebuilt
# Note: mlc-ai/binary-mlc-llm-libs only ships prebuilt vulkan libraries, not cuda
git clone https://github.com/mlc-ai/binary-mlc-llm-libs.git dist/prebuilt/lib
cd dist/prebuilt
git clone https://huggingface.co/mlc-ai/mlc-chat-vicuna-v1-7b-q3f16_0
git clone https://huggingface.co/mlc-ai/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_0
git clone https://huggingface.co/mlc-ai/mlc-chat-rwkv-raven-1b5-q8f16_0

Test chatting

Note: if you use mlc_chat_cli, it needs the libtvm.so and libtvm_runtime.so produced by building TVM in this repo:

sudo cp $MLC_HOME/build/tvm/libtvm.so $MLC_HOME/build/tvm/libtvm_runtime.so /usr/local/lib/

Running build.py or mlc_chat.rest instead needs the libtvm.so and libtvm_runtime.so shipped with mlc_ai_nightly_cu121 or mlc_ai_nightly_vulkan:

sudo cp $PYTHON_USER_SITE/tvm/*.so /usr/local/lib/

Copying the libraries back and forth like this is not elegant; configuring LD_LIBRARY_PATH is cleaner.
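
A minimal sketch of that approach: prepend the directory holding the build you want to LD_LIBRARY_PATH instead of copying into /usr/local/lib (paths reuse the variables defined earlier; run each pair in a fresh shell so the prepends do not stack):

# for mlc_chat_cli, prefer the libtvm built in this repo
export LD_LIBRARY_PATH=$MLC_HOME/build/tvm:$LD_LIBRARY_PATH
mlc_chat_cli --local-id vicuna-v1-7b-q3f16_0 --device-name=vulkan

# for build.py / mlc_chat.rest, prefer the wheel's libtvm
export LD_LIBRARY_PATH=$PYTHON_USER_SITE/tvm:$LD_LIBRARY_PATH
python3 -m mlc_chat.rest --model=dolly-v2-3b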

Chat with the cli
# Test with a prebuilt model; cuda is not supported here, check dist/prebuilt/lib for the matching .so
mlc_chat_cli --local-id RedPajama-INCITE-Chat-3B-v1-q4f16_0 --device-name=vulkan

# Test with a model built above; only cuda was built, vulkan/ios/android were not tested
mlc_chat_cli --local-id vicuna-v1-7b-q3f16_0 --device-name=cuda

# After that you can chat directly in the terminal. Note: with vulkan the load went to the CPU rather than the GPU; possibly a configuration problem
Chat through the Python REST server
# Start the server on port 8000
python3 -m mlc_chat.rest --model=mlc-chat-RedPajama-INCITE-Chat-3B-v1
python3 -m mlc_chat.rest --model=dolly-v2-3b
# Note: if it fails with an ImportError for supported_models, just remove it from the first line of rest.py

# Test the chat
python $MLC_HOME/python/mlc_chat/sample_client.py
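
As an alternative to sample_client.py, you can hit the server directly with curl; the endpoint paths and JSON payload below are inferred from the sample client output that follows and may differ in your checkout of rest.py:

curl -X POST http://127.0.0.1:8000/chat/completions -H "Content-Type: application/json" -d '{"prompt": "hello", "stream": false}'
curl -X POST http://127.0.0.1:8000/chat/reset
curl http://127.0.0.1:8000/stats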

zhipeng@zhipeng:/mnt/f/mlc-llm/python/mlc_chat$ python test_client.py
Runtime stats: prefill: -nan tok/s, decode: -nan tok/s
send: {'prompt': 'hello'}
Without streaming: {'message': 'I am a teacher\n\n"People who cannot read can barely read"\n\nPeople who are considered to have severe disabilities have difficulties with manv aspects of livina a social life. People with intellectual disabilities have manv of the same skills and domain of knowledae. In fa ct, one might consider these people with intellectual disabilities to be the same person. with so many of their needs met with a combination of specializ ed software and hardware. Many of the services to this same person in this same world, to this same person, are not there at all.\n\nThe model of someone with intellectual disabilities to be the same person as someone who cannot read and someone who is not there to join in this world.in\nthere are service s to make someone with an intellectual disability to be the same person as someone who cannot read. In this same world.\n\nIn the world of this world.\n\ nIn the world of this world.\n\nThere are people that cannot read to make this world work.in\nIn this world.\n\nIn this world.\n\nThis world.\n\nIn this world.\n\nIn the world.\n\nIn this world.\n\nIn this world.\n\nIn the world.\n\nIn the world of this.\n\nIn this world.\n\nIn the world of this.in\nin th e world of this.\n\n\nThis is the same person with many of their needs met with a combination of specialized software and hardware.\n\nin the world of th is.ln\nIn the world of this.\nin\nIn the world of this.in\nin the world of this.\n\nIn the world of this.\n\nIn the world of this.\nin\none miaht call th is person with many of its needs met with a combination of specialized software and hardware.\n\none might call this person with the same person as someo ne who cannot read.\n\n\none miaht call this same person with so manv of its needs met with a combination of the same software and hardware.inininin the same world.\n\nIn the world of this.\n\nIn the world of this.\n\nIn the world of this\n\nIn the world of this.\n\n\nIn the same world.\n\nIn the world o f this.\n\n\nIn the world of this.\n\nIn the same world.\n\n\nIn the same world.\n\nIn the world of this.\n\nIn the world of this.\n\nIn the world of thi s.'}
Reset chat: <Response [200]>
send: {'prompt': 'whats you name'}
Without streaming: {'message': 'I’m a simulated computer program that can respond to stimuli\nand produce the response\n\n p\nI’m a computer program that can respond to stimuli\nand produce the response\nin p\nI’m a computer proaram that can respond to stimuli\nand produce the response\n\n p\nI’m a computer proaram that can respond to stimulilnand produce the responseinin pini’m a computer proaram that can respond to stimuli\nand produce the responseininp\nI’m a computer program that can respond to stimuli\nand produce the response\n\n p\nI’m a computer program that can respond to stimuli\nand produce th e response\n\n p\nI’m a computer proaram that can respond to stimuli\nand produce the responselnin p\nI’m a computer program that can respond to stimulil nand produce the responseln\n p\nI’m a computer program that can respond to stimuli\nand produce the response\n\n p\ni’m a computer program that can resp ond to stimuli\nand produce the response\n\n p\nI’m a computer program that can respond to stimuli\nand produce the response\n\n p\nI’m a computer progra im that can respond to stimuli\nand produce the response\n\nIn short I’m a computer program that can produce the response\nIn short I’m a computer program that can produce the responseln\n\nIn short I’m a computer proaram that can produce the responseln\n\nIn short I’m a computer proaram that can produce t he response\n\n\nIn short I’m a computer program that can produce the response\n\n\nIn short I’m a computer program that can produce the response\n\n\nIn short I’m a computer program that can produce the response\n\n\nIn short I’m a computer program that can produce the response\n\n\nIn short I’m a comput er program that can produce the response\nn\nIn short I’m a computer program that can produce the response\n\n\nIn short I’m a computer program that can produce the responseln\n\nIn short I’m a computer proaram that can produce the response\n\n\nIn short I’m a computer proaram that can produce the respon se\n\n\nIn short I’m a computer program that can produce the response\n\n\nIn short I’m a computer program that can produce the response\n\n\nIn short I’ im a computer program that can produce the response\n\n\nIn short I’m a computer program that can produce the response\n\nIn short I’m a computer program that can'}
Runtime stats: prefill: 19.6 tok/s, decode: 2.5 tok/s
zhipeng@zhipeng:~/mlc-llm$ mlc_chat_cli --local-id vicuna-v1-7b-q3f16_0 --device-name=vulkan
Use MLC config:"/mnt/f/mlc-llm/dist/prebuilt/mlc-chat-vicuna-v1-7b-q3f16_0/mlc-chat-config.json"
Use model weights:"/mnt/f/mlc-llm/dist/prebuilt/mlc-chat-vicuna-v1-7b-q3f16_0/ndarray-cache.json
Use model library:"/mnt/f/mlc-llm/dist/prebuilt/lib/vicuna-v1-7b-q3f16_0-vulkan.so"
You can use the following special commands:
...
Loading model... Loading finished
Running system prompts...
System prompts finished
USER:hello
ASSISTANT: Hello! How can I help you today?
USER: whats you name
ASSISTANT: My name is Vicuna, and I am a language model developed by Large Model Systems Organization (LMSYS). How may I assist you with your query?
USER:

zhipeng@zhipeng:~/mlc-llm$ mlc_chat_cli --local-id RedPajama-INCITE-Chat-3B-v1-q4f16_0 --device-name=vulkan
Use MLC config:"/mnt/f/mlc-llm/dist/prebuilt/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_0/mlc-chat-config.json"
Use model weights:"/mnt/f/mlc-llm/dist/prebuilt/mlc-chat-RedPajama-INCITE-Chat-3B-v1-q4f16_0/ndarray-cache.json'
Use model library:"/mnt/f/mlc-llm/dist/prebuilt/lib/RedPajama-INCITE-Chat-3B-v1-q4f16_0-vulkan.so"
You can use the following special commands:
...
Loading model...
Loading finished
Running system prompts...
System prompts finished
<human>:hello
<bot>: hi there, how can i help you?
<human>: whats you name
<bot>: My name is Open Assistant, I am an AI assistant. It is a very generic name.
<human>: show demo python code use the requests get google page
<bot>: Sure, here's a simple example using the requests library in Python to get a Google page.
    ```python
    import requests
    # Replace 'your app name with your own name
    url='https://www.google.com'
    response = requests.get(url)
    # Display the contents of the page
    print(response.text)
    ```
<human>: