ktransformers DeepSeek-R1 671B HEAD 7a19f3b

mixboot

已于 2025-03-06 10:04:20 修改

阅读量434

点赞数 6

分类专栏： AI 文章标签： ktransformers DeepSeek-R1 671B

于 2025-03-06 10:02:41 首次发布

本文链接：https://blog.csdn.net/u010953692/article/details/146036450

版权

AI 专栏收录该内容

11 篇文章

订阅专栏

ktransformers DeepSeek-R1 671B HEAD 7a19f3b

HEAD 7a19f3b
大型语言模型
安装依赖
下载源码
创建虚拟环境
实验性使用flashinfer替代triton
查询GPU计算架构
设置TORCH_CUDA_ARCH_LIST环境变量
编译ktransformers
安装完成
运行ktransformers
- 本地对话
参考

HEAD 7a19f3b

大型语言模型

https://www.modelscope.cn/models/unsloth/DeepSeek-R1-GGUF/files

unsloth/DeepSeek-R1-GGUF
DeepSeek-R1-Q4_K_M

安装依赖

sudo apt-get install git
curl -LsSf https://astral.sh/uv/install.sh | sh

下载源码

git clone https://github.com/kvcache-ai/ktransformers.git
cd ktransformers
git submodule init # 初始化本地配置文件，注册子模块
git submodule update # 克隆尚未克隆的子模块仓库，检出到主仓库中指定的提交点
git checkout 7a19f3b
git submodule update --init --recursive # 切换到 7a19f3b 后重新同步子模块，使其与该提交记录的版本一致
git rev-parse --short HEAD

创建虚拟环境

cd ..
uv venv ./venv --python 3.11 --python-preference=only-managed
source  venv/bin/activate

实验性使用flashinfer替代triton

uv pip install flashinfer-python

# uv pip install flashinfer-python
Using Python 3.11.11 environment at: venv
Resolved 26 packages in 2.16s
      Built flashinfer-python==0.2.2.post1
Prepared 1 package in 3.25s
░░░░░░░░░░░░░░░░░░░░ [0/26] Installing wheels...
warning: Failed to hardlink files; falling back to full copy. This may lead to degraded performance.
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
         If this is intentional, set `export UV_LINK_MODE=copy` or use `--link-mode=copy` to suppress this warning.
Installed 26 packages in 14.92s
 + filelock==3.17.0
 + flashinfer-python==0.2.2.post1
 + fsspec==2025.2.0
 + jinja2==3.1.5
 + markupsafe==3.0.2
 + mpmath==1.3.0
 + networkx==3.4.2
 + ninja==1.11.1.3
 + numpy==2.2.3
 + nvidia-cublas-cu12==12.4.5.8
 + nvidia-cuda-cupti-cu12==12.4.127
 + nvidia-cuda-nvrtc-cu12==12.4.127
 + nvidia-cuda-runtime-cu12==12.4.127
 + nvidia-cudnn-cu12==9.1.0.70
 + nvidia-cufft-cu12==11.2.1.3
 + nvidia-curand-cu12==10.3.5.147
 + nvidia-cusolver-cu12==11.6.1.9
 + nvidia-cusparse-cu12==12.3.1.170
 + nvidia-cusparselt-cu12==0.6.2
 + nvidia-nccl-cu12==2.21.5
 + nvidia-nvjitlink-cu12==12.4.127
 + nvidia-nvtx-cu12==12.4.127
 + sympy==1.13.1
 + torch==2.6.0
 + triton==3.2.0
 + typing-extensions==4.12.2

查询GPU计算架构

nvidia-smi --query-gpu=compute_cap --format=csv

# nvidia-smi --query-gpu=compute_cap --format=csv
compute_cap
8.9
8.9

设置TORCH_CUDA_ARCH_LIST环境变量

export TORCH_CUDA_ARCH_LIST="8.9"

编译ktransformers

cd ktransformers/

sudo apt-get install build-essential cmake

uv pip install -r requirements-local_chat.txt
uv pip install setuptools wheel packaging

# 如果拥有充足的CPU核心和内存资源，可通过下面两个变量设置并行编译任务数（按实际核心数调整），以显著提升构建速度
export MAX_JOBS=8
export CMAKE_BUILD_PARALLEL_LEVEL=8

# 安装flash_attn
uv pip install flash_attn --no-build-isolation


# 可选实验性使用flashinfer替代triton
uv pip install flashinfer-python

# 仅适用于以下情况：
# 配备Intel双路CPU且内存>1TB可容纳两份完整模型内存副本(每路CPU一份副本)
# 注（未验证）：AMD EPYC 双路平台在 NPS0 模式下可能无需此配置
export USE_NUMA=1

# 安装ktransformers
KTRANSFORMERS_FORCE_BUILD=TRUE uv pip install . --no-build-isolation

# 如需重新编译，先执行清理操作：
uv pip uninstall ktransformers
rm -rf ktransformers/ktransformers_ext/build
rm -rf ktransformers/ktransformers_ext/cuda/build
rm -rf ktransformers/ktransformers_ext/cuda/dist
rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info

安装完成

# KTRANSFORMERS_FORCE_BUILD=TRUE uv pip install . --no-build-isolation
Installed 34 packages in 289ms
 + accelerate==1.4.0
 + annotated-types==0.7.0
 + anyio==4.8.0
 + blessed==1.20.0
 + build==1.2.2.post1
 + click==8.1.8
 + colorlog==6.9.0
 + fastapi==0.115.11
 + greenlet==3.1.1
 + h11==0.14.0
 + httpcore==1.0.7
 + httpx==0.28.1
 + jsonpatch==1.33
 + jsonpointer==3.0.0
 + ktransformers==0.2.2rc1+cu126torch26fancy 
 + langchain==0.3.20
 + langchain-core==0.3.41
 + langchain-text-splitters==0.3.6
 + langsmith==0.3.11
 + orjson==3.10.15
 + psutil==7.0.0
 + pydantic==2.10.6
 + pydantic-core==2.27.2
 + pyproject-hooks==1.2.0
 + requests-toolbelt==1.0.0
 + sentencepiece==0.2.0
 + six==1.17.0
 + sniffio==1.3.1
 + sqlalchemy==2.0.38
 + starlette==0.46.0
 + tenacity==9.0.0
 + uvicorn==0.34.0
 + wcwidth==0.2.13
 + zstandard==0.23.0

运行ktransformers

本地对话

单路CPU

python ktransformers/ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-R1 --gguf_path /deepseek/DeepSeek-R1-Q4_K_M/ --cpu_infer 36 --cache_lens=32768 --max_new_tokens 8192

双路CPU

python ktransformers/ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-R1 --gguf_path /deepseek/DeepSeek-R1-Q4_K_M/ --cpu_infer 73 --cache_lens=32768 --max_new_tokens 8192

使用双显卡

python ktransformers/ktransformers/local_chat.py --model_path deepseek-ai/DeepSeek-R1 --gguf_path /deepseek/DeepSeek-R1-Q4_K_M/ --cpu_infer 73 --optimize_config_path ktransformers/ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-multi-gpu.yaml --cache_lens=32768 --max_new_tokens 8192

# 启动服务端（注意：以下命令中的脚本与配置路径相对于 ktransformers 源码根目录，与上面本地对话命令的执行目录不同）
# 按需修改参数，使用 `--help` 查看帮助文档
# 支持多GPU配置及通过 `--optimize_config_path` 进行更细粒度的显存卸载设置
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 ktransformers/server/main.py \
    --gguf_path /mnt/ai/models/unsloth/DeepSeek-R1-GGUF/DeepSeek-R1-UD-Q2_K_XL/ \
    --model_path deepseek-ai/DeepSeek-R1 \
    --model_name unsloth/DeepSeek-R1-UD-Q2_K_XL \
    --cpu_infer 16 \
    --max_new_tokens 8192 \
    --cache_lens 32768 \
    --total_context 32768 \
    --cache_q4 true \
    --temperature 0.6 \
    --top_p 0.95 \
    --optimize_config_path ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat.yaml \
    --force_think \
    --use_cuda_graph \
    --host 127.0.0.1 \
    --port 8080