问题
在conda安装cuda环境时
mamba install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia -y
mamba install nvidia/label/cuda-12.1.0::cuda
安装完成后,envs/xxx/lib/libcudart.so 这个符号链接会指向错误的目标文件。
手动修复比较繁琐,所以我写了一个脚本来自动修复。
脚本
#!/bin/bash
#
# Repoint <env>/lib/libcudart.so at the newest fully versioned
# libcudart.so.X.Y.Z installed in the same conda environment.
#
# Usage: <script> <environment_name>
#
# Environment:
#   FIXCUDA_CONDA_ROOT  conda installation root; defaults to
#                       /mnt/data/wangziyi/miniconda3 when unset or empty.
#
# Exit status: 0 when the link was (re)created, 1 on any error.

# Exactly one argument (the environment name) is required.
if [ "$#" -ne 1 ]; then
  echo "Usage: $0 <environment_name>" >&2
  exit 1
fi

env_name=$1
conda_root=${FIXCUDA_CONDA_ROOT:-/mnt/data/wangziyi/miniconda3}
env_path="$conda_root/envs/$env_name/lib"

# Bail out early when the environment has no lib directory.
if [ ! -d "$env_path" ]; then
  echo "Error: The environment path $env_path does not exist." >&2
  exit 1
fi

# Pick the highest X.Y.Z version among the regular files directly inside
# lib/. The directory path is deliberately kept out of the regular
# expression so that regex metacharacters in it (e.g. '+') cannot break
# the match; -type f also skips the libcudart.so.N symlinks.
cudart_so_file=$(
  find "$env_path" -maxdepth 1 -type f -name 'libcudart.so.*' -printf '%f\n' \
    | grep -E '^libcudart\.so\.[0-9]+\.[0-9]+\.[0-9]+$' \
    | sort -rV \
    | head -n 1
)

if [ -z "$cudart_so_file" ]; then
  echo "Error: No file matching libcudart.so.<version> was found in the environment." >&2
  exit 1
fi

# Create or replace the unversioned symlink; only report success when ln
# actually succeeded.
if ! ln -sf "$env_path/$cudart_so_file" "$env_path/libcudart.so"; then
  echo "Error: Failed to update $env_path/libcudart.so." >&2
  exit 1
fi
echo "Symbol link for libcudart.so has been updated to $cudart_so_file."
新版(封装为 shell 函数 fixcuda;不传参数时使用当前激活的 conda 环境)
function fixcuda() {
# 检查是否提供了环境名称作为参数
if [ "$#" -ne 1 ]; then
# 如果没有提供参数,使用环境变量CONDA_DEFAULT_ENV的值
env_name=${CONDA_DEFAULT_ENV:-"base"}
else
env_name=$1
fi
env_path="$HOME/miniconda3/envs/$env_name/lib"
echo "Environment path: $env_path"
ls -l $env_path | grep cuda
# 检查环境路径是否存在
if [ ! -d "$env_path" ]; then
echo "Error: The environment path $env_path does not exist."
exit 1
fi
# 使用find命令和正则表达式来查找最长的libcudart.so文件名
cudart_so_file=$(find "$env_path" -type f -regextype posix-extended -regex "$env_path/libcudart\.so\.[0-9]+\.[0-9]+\.[0-9]+" -printf '%f\n' | sort -rV | head -n 1)
# 检查是否找到了文件
if [ -z "$cudart_so_file" ]; then
echo "Error: No file matching libcudart.so.<version> was found in the environment."
exit 1
fi
# 创建或更新libcudart.so的符号链接
ln -sf "$env_path/$cudart_so_file" "$env_path/libcudart.so"
echo "Symbol link for libcudart.so has been updated to $cudart_so_file."
}