Environment Setup
gcc & make
apt-get install -y gcc
apt-get install -y make
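gcc and make are required because the NVIDIA driver installer compiles a kernel module on the spot. A quick sanity check:
gcc --version
make --version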
NVIDIA Driver
wget https://us.download.nvidia.cn/XFree86/Linux-x86_64/535.54.03/NVIDIA-Linux-x86_64-535.54.03.run
chmod +x NVIDIA-Linux-x86_64-535.54.03.run
./NVIDIA-Linux-x86_64-535.54.03.run --silent --no-questions
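Verify the driver before moving on; nvidia-smi should list the GPUs and report driver version 535.54.03:
nvidia-smi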
CUDA Installation
wget https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run
sudo sh cuda_12.2.0_535.54.03_linux.run
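The runfile is interactive by default. Since the matching 535.54.03 driver is already installed above, a non-interactive, toolkit-only install is an option (a sketch using the runfile's --silent and --toolkit flags; if you run it interactively instead, deselect the driver):
sudo sh cuda_12.2.0_535.54.03_linux.run --silent --toolkit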
Verify the CUDA installation
ls /usr/local/cuda
If it prints something like
bin EULA.txt gds-12.2 libnvvp nsight-systems-2023.2.3 README targets
compute-sanitizer extras include nsight-compute-2023.2.0 nvml share tools
DOCS gds lib64 nsightee_plugins nvvm src version.json
then CUDA is installed. Next, add its paths to the environment variables. The variables set right after installation are only temporary, so after a reboot it is common for nvcc -V or nvcc --version to no longer find CUDA. Make the setting persistent:
vim ~/.bashrc
# Add CUDA to PATH and the library search path
export PATH=/usr/local/cuda/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
Apply the changes
source ~/.bashrc
Check nvcc again
nvcc -V
or
nvcc --version
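If the PATH is set correctly, the output should report release 12.2, roughly along these lines (a sketch; copyright and build lines omitted):
nvcc: NVIDIA (R) Cuda compiler driver
...
Cuda compilation tools, release 12.2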
nvidia-fabricmanager (GPUs earlier than the A800 do not support NVSwitch, so skip this step for those)
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/nvidia-fabricmanager-535_535.54.03-1_amd64.deb
sudo dpkg -i ./nvidia-fabricmanager-535_535.54.03-1_amd64.deb
# Verify the fabricmanager installation (nvidia-smi -pm 1 first enables persistence mode)
nvidia-smi -pm 1
systemctl enable nvidia-fabricmanager
systemctl start nvidia-fabricmanager
systemctl status nvidia-fabricmanager
nvidia-smi topo -m
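A scripted equivalent of the status check (assumes systemd):
systemctl is-active --quiet nvidia-fabricmanager && echo "fabricmanager running" || echo "fabricmanager NOT running"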
Disable automatic updates
# Set every value in these two files to 0, then reboot
vi /etc/apt/apt.conf.d/10periodic
vi /etc/apt/apt.conf.d/20auto-upgrades
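After the edit, /etc/apt/apt.conf.d/20auto-upgrades should contain (10periodic uses the same APT::Periodic::* syntax):
APT::Periodic::Update-Package-Lists "0";
APT::Periodic::Unattended-Upgrade "0";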
PyTorch Installation
# Pick the install command that matches your setup from https://pytorch.org/
# Optional: install via the Tsinghua PyPI mirror (some-package is a placeholder)
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple some-package
pip3 install torch torchvision torchaudio
# Verify
python3
Python 3.8.10 (default, Nov 22 2023, 10:22:35)
[GCC 9.4.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> import torch
>>> torch.cuda.is_available()
True
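Optionally also check how many GPUs torch sees and their names (standard torch.cuda calls; output depends on the hardware):
>>> torch.cuda.device_count()
>>> torch.cuda.get_device_name(0)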
Install Miniconda3
mkdir -p /useropt182/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /useropt182/miniconda3/miniconda.sh
bash /useropt182/miniconda3/miniconda.sh -b -u -p /useropt182/miniconda3
rm -rf /useropt182/miniconda3/miniconda.sh
/useropt182/miniconda3/bin/conda init bash
source ~/.bashrc
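Quick check that conda is now on the PATH:
conda --version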
Download the model source code & install model dependencies
Create a virtual environment, one environment per model
conda create --prefix=/useropt182/ChatGLM3-6B/env python=3.11
conda activate /useropt182/ChatGLM3-6B/env
Download the source
sudo git clone https://github.com/THUDM/ChatGLM3.git
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install -r /useropt182/ChatGLM3/requirements.txt
pip3 install -r /useropt182/ChatGLM3/composite_demo/requirements.txt
Download the model
pip3 install 'huggingface_hub<0.22.0'
# Use a Hugging Face mirror endpoint
export HF_ENDPOINT=https://hf-mirror.com
# Single-threaded download (disables hf_transfer)
export HF_HUB_ENABLE_HF_TRANSFER=0
# Streamlit builds the web UI for the demo
pip install streamlit
# Run this several times, until no download progress bar appears (everything already fetched)
huggingface-cli download THUDM/chatglm3-6b --local-dir /useropt182/ChatGLM3-6B_hugginface
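Instead of re-running by hand, a retry loop can repeat the command until it exits cleanly (a sketch; huggingface-cli resumes partial files, so repeated runs are safe):
until huggingface-cli download THUDM/chatglm3-6b --local-dir /useropt182/ChatGLM3-6B_hugginface; do
    echo "download interrupted, retrying..."
    sleep 5
done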
Move the model cache to /useropt182/
mv /root/.cache /useropt182/
ln -s /useropt182/.cache /root/
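Verify the link (ls -ld should show /root/.cache -> /useropt182/.cache):
ls -ld /root/.cache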
Create the one-click model launch script /useropt182/main.sh
vim /useropt182/main.sh
#!/bin/bash
# Locate the data-disk mount point (a /user* directory) and symlink it to the fixed path
Model_Folder="useropt182"
S_Dir=$(ls / | grep user | head -n 1 | xargs -I {} echo "/{}")
echo "$S_Dir"
T_Dir="/$Model_Folder"
# Skip the link if the target already exists or the disk is mounted there directly
[ "$S_Dir" != "$T_Dir" ] && [ ! -e "$T_Dir" ] && ln -s "$S_Dir" "$T_Dir"
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/$Model_Folder/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/$Model_Folder/miniconda3/etc/profile.d/conda.sh" ]; then
        . "/$Model_Folder/miniconda3/etc/profile.d/conda.sh"
    else
        export PATH="/$Model_Folder/miniconda3/bin:$PATH"
    fi
fi
unset __conda_setup
# <<< conda initialize <<<
# >>> Create a symlink >>>
# Ensure /root/.cache is a symlink to the cache on the data disk
target_folder="/root/.cache"
file_to_move="/root/.cachebak"
new_location="/$Model_Folder/.cache"
symlink_name="/root/"
if [ ! -L "$target_folder" ]; then
    if [ -d "$target_folder" ]; then
        mv "$target_folder" "$file_to_move"
        echo "Directory $target_folder moved to $file_to_move"
    else
        echo "Directory $target_folder does not exist."
        #exit 1
    fi
    ln -s "$new_location" "$symlink_name"
    echo "Symlink $symlink_name created pointing to $new_location"
else
    echo "$target_folder is already a symlink."
fi
# <<< Create a symlink <<<
# Activate the model's environment and launch the Streamlit demo
conda activate /$Model_Folder/ChatGLM3-6B/env
export HF_ENDPOINT=https://hf-mirror.com
export MODEL_PATH=/$Model_Folder/ChatGLM3-6B_hugginface
streamlit run /$Model_Folder/ChatGLM3/composite_demo/main.py
Run the model
chmod +x /useropt182/main.sh
/useropt182/main.sh
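Optionally, run it in the background so the demo survives the SSH session ending (a sketch; the log path is arbitrary):
nohup /useropt182/main.sh > /useropt182/main.log 2>&1 &
tail -f /useropt182/main.log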
Verification
- In one availability zone, create an instance from the base image ubuntu22.04_v3, without attaching a data disk
- Detach /useropt182 from the instance used to build the image
- Mount the data disk /useropt182 on the newly purchased instance
- Run /useropt182/main.sh directly; the ChatGLM3-6B model runs normally, and the model disk is complete (a quick smoke test is sketched below)
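A quick smoke test from the new instance (a sketch assuming Streamlit's default port 8501; 200 means the demo page is being served):
curl -s -o /dev/null -w "%{http_code}\n" http://localhost:8501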