设备规划
4台物理服务器:CPU为鲲鹏920(Kunpeng-920),NPU为昇腾910B,每台8张卡。下文统称910B服务器
[root@ds-3 ~]# cat /etc/os-release
NAME="openEuler"
VERSION="22.03 LTS"
ID="openEuler"
VERSION_ID="22.03"
PRETTY_NAME="openEuler 22.03 LTS"
ANSI_COLOR="0;31"
[root@ds-3 ~]# uname -a
Linux ds-3 5.10.0-60.18.0.50.oe2203.aarch64 #1 SMP Wed Mar 30 02:43:08 UTC 2022 aarch64 aarch64 aarch64 GNU/Linux
[root@ds-3 ~]# lscpu
Architecture: aarch64
CPU op-mode(s): 64-bit
Byte Order: Little Endian
CPU(s): 192
On-line CPU(s) list: 0-191
Vendor ID: HiSilicon
BIOS Vendor ID: HiSilicon
Model name: Kunpeng-920
BIOS Model name: HUAWEI Kunpeng 920 5250
Model: 0
Thread(s) per core: 1
Core(s) per socket: 48
Socket(s): 4
Stepping: 0x1
Frequency boost: disabled
CPU max MHz: 2600.0000
CPU min MHz: 200.0000
ds-3是master节点
主机名称 | ip地址规划 | NPU卡IP地址 |
---|---|---|
ds-3 | 10.82.27.3 | 10.82.29.17~24 |
ds-4 | 10.82.27.4 | 10.82.29.25~32 |
ds-5 | 10.82.27.5 | 10.82.29.33~40 |
ds-6 | 10.82.27.6 | 10.82.29.41~48 |
安装NPU驱动
所有节点操作:驱动和固件安装包需从昇腾社区下载
# Install the build prerequisites for the NPU driver.
# The kernel-devel / kernel-headers packages are pinned to the running kernel
# (uname -r); the previously listed names "linux-header" and "kernel-dev" are
# not valid package names. strict=0 keeps the install best-effort: a package
# that is unavailable in the configured repos (e.g. dkms) is skipped instead
# of aborting the whole transaction.
yum -y --setopt=strict=0 install dkms gcc "kernel-devel-$(uname -r)" "kernel-headers-$(uname -r)"
rpm -ivh Ascend-hdk-910b-npu-driver-24.1.0-1.aarch64.rpm
# Update the firmware: verify the package first, then run the full install.
# The .run file must be executable to be invoked directly.
chmod +x ./Ascend-hdk-910b-npu-firmware_7.5.0.3.220.run
./Ascend-hdk-910b-npu-firmware_7.5.0.3.220.run --check
# The installer may report that a reboot is required
./Ascend-hdk-910b-npu-firmware_7.5.0.3.220.run --full
# Check the installed driver/firmware versions on all devices and components
/usr/local/Ascend/driver/tools/upgrade-tool --device_index -1 --component -1 --version
设置NPU的IP地址和NPUping检测
ds-3: 10.82.29.17~24
ds-4: 10.82.29.25~32
ds-5: 10.82.29.33~40
ds-6: 10.82.29.41~48
每个节点需要操作
# Last octet of this node's first NPU IP. Per the address plan:
#   ds-3 = 17, ds-4 = 25, ds-5 = 33, ds-6 = 41
# Defaults to 33 (ds-5), which reproduces the addresses that were previously
# hard-coded here. Override per node, e.g.: NPU_IP_START=17 on ds-3.
NPU_IP_START="${NPU_IP_START:-33}"

# Assign one IP per NPU card; cards 0-7 get eight consecutive addresses
for i in {0..7}; do
  hccn_tool -i "$i" -ip -s address "10.82.29.$((NPU_IP_START + i))" netmask 255.255.255.0
done
# Configure the gateway and the network-detect target address on every card
for i in {0..7}; do hccn_tool -i "$i" -gateway -s gateway 10.82.29.254; done
for i in {0..7}; do hccn_tool -i "$i" -netdetect -s address 10.82.29.254; done
# Connectivity check: ping the NPU IPs 10.82.29.17-24 (ds-3's cards) from card 0
for i in {17..24}; do hccn_tool -i 0 -ping -g address "10.82.29.$i" pkt 3; done
# Print the IP configured on each card
for i in {0..7}; do hccn_tool -i "$i" -ip -g; done
# Check that the NPU low-level TLS verification switch is consistent across
# cards (all 0 is recommended); if it is not configured, model loading times out
for i in {0..7}; do hccn_tool -i "$i" -tls -g; done | grep switch
# Set the NPU low-level TLS verification switch to 0 on every card
for i in {0..7}; do hccn_tool -i "$i" -tls -s enable 0; done
rank_table_full.json
在宿主机所有节点使用同一份配置文件,注意文件权限(如下所示为 640,即 -rw-r-----)
[root@ds-3 ~]# ll /data/rank_table_full.json
-rw-r----- 1 root root 3.3K Feb 20 21:31 /data/rank_table_full.json
{
"version": "1.0",
"server_count": "4",
"server_list": [
{
"server_id": "10.82.27.3",
"container_ip": "10.82.27.3",
"device": [
{ "device_id": "0", "device_ip": "10.82.29.17", "rank_id": "0" },
{ "device_id": "1", "device_ip": "10.82.29.18", "rank_id": "1" },
{ "device_id": "2", "device_ip": "10.82.29.19", "rank_id": "2" },
{ "device_id": "3", "device_ip": "10.82.29.20", "rank_id": "3" },
{ "device_id": "4", "device_ip": "10.82.29.21", "rank_id": "4" },
{ "device_id": "5", "device_ip": "10.82.29.22", "rank_id": "5" },
{ "device_id": "6", "device_ip": "10.82.29.23", "rank_id": "6" },
{ "device_id": "7", "device_ip": "10.82.29.24", "rank_id": "7" }
]
},
{
"server_id": "10.82.27.4",
"container_ip": "10.82.27.4",
"device": [
{ "device_id": "0", "device_ip": "10.82.29.25", "rank_id": "8" },
{ "device_id": "1", "device_ip": "10.82.29.26", "rank_id": "9" },
{ "device_id": "2", "device_ip": "10.82.29.27", "rank_id": "10" },
{ "device_id": "3", "device_ip": "10.82.29.28", "rank_id": "11" },
{ "device_id": "4", "device_ip": "10.82.29.29", "rank_id": "12" },
{ "device_id": "5", "device_ip": "10.82.29.30", "rank_id": "13" },
{ "device_id": "6", "device_ip": "10.82.29.31", "rank_id": "14" },
{ "device_id": "7", "device_ip": "10.82.29.32", "rank_id": "15" }
]
},
{
"server_id": "10.82.27.5",
"container_ip": "10.82.27.5",
"device": [
{ "device_id": "0", "device_ip": "10.82.29.33", "rank_id": "16" },
{ "device_id": "1", "device_ip": "10.82.29.34", "rank_id": "17" },
{ "device_id": "2", "device_ip": "10.82.29.35", "rank_id": "18" },
{ "device_id": "3", "device_ip": "10.82.29.36", "rank_id": "19" },
{ "device_id": "4", "device_ip": "10.82.29.37", "rank_id": "20" },
{ "device_id": "5", "device_ip": "10.82.29.38", "rank_id": "21" },
{ "device_id": "6", "device_ip": "10.82.29.39", "rank_id": "22" },
{ "device_id": "7", "device_ip": "10.82.29.40", "rank_id": "23" }
]
},
{
"server_id": "10.82.27.6",
"container_ip": "10.82.27.6",
"device": [
{ "device_id": "0", "device_ip": "10.82.29.41", "rank_id": "24" },
{ "device_id": "1", "device_ip": "10.82.29.42", "rank_id": "25" },
{ "device_id": "2", "device_ip": "10.82.29.43", "rank_id": "26" },
{ "device_id": "3", "device_ip": "10.82.29.44", "rank_id": "27" },
{ "device_id": "4", "device_ip": "10.82.29.45", "rank_id": "28" },
{ "device_id": "5", "device_ip": "10.82.29.46", "rank_id": "29" },
{ "device_id": "6", "device_ip": "10.82.29.47", "rank_id": "30" },
{ "device_id": "7", "device_ip": "10.82.29.48", "rank_id": "31" }
]
}
],
"status": "completed"
}
下载权重/模型
国外地址:https://huggingface.co/deepseek-ai/DeepSeek-R1/tree/main
国内地址:https://modelers.cn/spaces/State_Cloud/DeepSeek-R1/tree/main
NPU侧权重转换:目前NPU转换脚本不会自动复制tokenizer等文件,转换完成后需手动将这些文件从原始权重目录复制到输出目录
# Fetch the Ascend ModelZoo repository, which contains the FP8 -> BF16 weight
# conversion script used below.
git clone https://gitee.com/ascend/ModelZoo-PyTorch.git
# Use forward slashes: in bash an unquoted backslash is an escape character,
# so a Windows-style path would collapse into a single nonexistent name.
cd ModelZoo-PyTorch/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference
# Convert the original FP8 weights to BF16; replace both /path/to/... values
# with the real input and output directories.
python fp8_cast_bf16.py --input-fp8-hf-path /path/to/DeepSeek-R1 --output-bf16-hf-path /path/to/deepseek-R1-bf16
所有节点:在宿主机查看转换后的权重路径(后文 config.json 中的 modelWeightPath 为 /data/DeepSeek-R1-bf16),文件大小约为1.3T。注意设置权重目录的权限,否则后续启动模型时会报错
docker容器NPU配置
运行容器
ds-3: 10.82.29.17~24
ds-4: 10.82.29.25~32
ds-5: 10.82.29.33~40
ds-6: 10.82.29.41~48
官方的服务化测试和模型测试可以跳过,镜像在华为的社区,需要权限才能下载,大小为14G
#!/bin/bash
# Recreate the DeepSeek-R1-full MindIE container on this host.
#
# The container runs privileged on the host network, gets all eight NPU
# devices (davinci0-7) plus the management devices, and mounts the host's
# Ascend driver/firmware, npu-smi, /etc/hccn.conf and /data (weights and
# rank table).

# Stop and remove any previous container with the same name so that
# `docker run --name` does not fail with a name conflict. Errors are ignored
# on purpose: on the first run there is no container to stop or remove.
docker stop DeepSeek-R1-full 2>/dev/null || true
docker rm DeepSeek-R1-full 2>/dev/null || true

# RANKTABLEFILE points at the rank table stored on the host as
# /data/rank_table_full.json (visible in the container through -v /data:/data).
# MIES_CONTAINER_IP is set to the first address reported by `hostname -I`.
docker run -itd \
  --privileged \
  --name=DeepSeek-R1-full \
  --net=host \
  --shm-size 500g \
  --device=/dev/davinci0 \
  --device=/dev/davinci1 \
  --device=/dev/davinci2 \
  --device=/dev/davinci3 \
  --device=/dev/davinci4 \
  --device=/dev/davinci5 \
  --device=/dev/davinci6 \
  --device=/dev/davinci7 \
  --device=/dev/davinci_manager \
  --device=/dev/hisi_hdc \
  --device=/dev/devmm_svm \
  -v /etc/localtime:/etc/localtime \
  -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
  -v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware \
  -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi \
  -v /usr/local/sbin:/usr/local/sbin \
  -v /etc/hccn.conf:/etc/hccn.conf \
  -v /data:/data \
  -e ATB_LLM_HCCL_ENABLE=1 \
  -e ATB_LLM_COMM_BACKEND="hccl" \
  -e HCCL_CONNECT_TIMEOUT=7200 \
  -e WORLD_SIZE=32 \
  -e HCCL_EXEC_TIMEOUT=0 \
  -e PYTORCH_NPU_ALLOC_CONF=expandable_segments:True \
  -e RANKTABLEFILE=/data/rank_table_full.json \
  -e MIES_CONTAINER_IP="$(hostname -I | awk '{print $1}')" \
  -e OMP_NUM_THREADS=1 \
  -e NPU_MEMORY_FRACTION=0.95 \
  swr.cn-south-1.myhuaweicloud.com/ascendhub/mindie:2.0.T3-800I-A2-py311-openeuler24.03-lts \
  bash
# Write the MindIE service configuration into the DeepSeek-R1-full container.
#
# The JSON is fed to `docker exec -i` over stdin from a quoted here-document
# and written by `cat` inside the container. The here-document must live
# outside the `bash -c` string: embedding it in a double-quoted string lets
# the JSON's own double quotes terminate that string and corrupt the command.
# The quoted delimiter ('EOF') disables all shell expansion of the JSON text.
docker exec -i DeepSeek-R1-full /bin/bash -c \
  'mkdir -p /usr/local/Ascend/mindie/latest/mindie-service/conf \
    && cat > /usr/local/Ascend/mindie/latest/mindie-service/conf/config.json' <<'EOF'
{
  "Version" : "1.0.0",
  "LogConfig" :
  {
    "logLevel" : "Info",
    "logFileSize" : 20,
    "logFileNum" : 20,
    "logPath" : "logs/mindie-server.log"
  },
  "ServerConfig" :
  {
    "ipAddress" : "10.82.27.3",
    "managementIpAddress" : "10.82.27.3",
    "port" : 1025,
    "managementPort" : 1026,
    "metricsPort" : 1027,
    "allowAllZeroIpListening" : false,
    "maxLinkNum" : 300,
    "httpsEnabled" : false,
    "fullTextEnabled" : false,
    "tlsCaPath" : "security/ca/",
    "tlsCaFile" : ["ca.pem"],
    "tlsCert" : "security/certs/server.pem",
    "tlsPk" : "security/keys/server.key.pem",
    "tlsPkPwd" : "security/pass/key_pwd.txt",
    "tlsCrlPath" : "security/certs/",
    "tlsCrlFiles" : ["server_crl.pem"],
    "managementTlsCaFile" : ["management_ca.pem"],
    "managementTlsCert" : "security/certs/management/server.pem",
    "managementTlsPk" : "security/keys/management/server.key.pem",
    "managementTlsPkPwd" : "security/pass/management/key_pwd.txt",
    "managementTlsCrlPath" : "security/management/certs/",
    "managementTlsCrlFiles" : ["server_crl.pem"],
    "kmcKsfMaster" : "tools/pmt/master/ksfa",
    "kmcKsfStandby" : "tools/pmt/standby/ksfb",
    "inferMode" : "standard",
    "interCommTLSEnabled" : false,
    "interCommPort" : 1121,
    "interCommTlsCaPath" : "security/grpc/ca/",
    "interCommTlsCaFiles" : ["ca.pem"],
    "interCommTlsCert" : "security/grpc/certs/server.pem",
    "interCommPk" : "security/grpc/keys/server.key.pem",
    "interCommPkPwd" : "security/grpc/pass/key_pwd.txt",
    "interCommTlsCrlPath" : "security/grpc/certs/",
    "interCommTlsCrlFiles" : ["server_crl.pem"],
    "openAiSupport" : "vllm"
  },
  "BackendConfig" : {
    "backendName" : "mindieservice_llm_engine",
    "modelInstanceNumber" : 1,
    "npuDeviceIds" : [[0,1,2,3,4,5,6,7]],
    "tokenizerProcessNumber" : 8,
    "multiNodesInferEnabled" : true,
    "multiNodesInferPort" : 1120,
    "interNodeTLSEnabled" : false,
    "interNodeTlsCaPath" : "security/grpc/ca/",
    "interNodeTlsCaFiles" : ["ca.pem"],
    "interNodeTlsCert" : "security/grpc/certs/server.pem",
    "interNodeTlsPk" : "security/grpc/keys/server.key.pem",
    "interNodeTlsPkPwd" : "security/grpc/pass/mindie_server_key_pwd.txt",
    "interNodeTlsCrlPath" : "security/grpc/certs/",
    "interNodeTlsCrlFiles" : ["server_crl.pem"],
    "interNodeKmcKsfMaster" : "tools/pmt/master/ksfa",
    "interNodeKmcKsfStandby" : "tools/pmt/standby/ksfb",
    "ModelDeployConfig" :
    {
      "maxSeqLen" : 10000,
      "maxInputTokenLen" : 2048,
      "truncation" : true,
      "ModelConfig" : [
        {
          "modelInstanceType" : "Standard",
          "modelName" : "deepseekr1",
          "modelWeightPath" : "/data/DeepSeek-R1-bf16",
          "worldSize" : 8,
          "cpuMemSize" : 5,
          "npuMemSize" : -1,
          "backendType" : "atb",
          "trustRemoteCode" : false
        }
      ]
    },
    "ScheduleConfig" :
    {
      "templateType" : "Standard",
      "templateName" : "Standard_LLM",
      "cacheBlockSize" : 128,
      "maxPrefillBatchSize" : 8,
      "maxPrefillTokens" : 2048,
      "prefillTimeMsPerReq" : 150,
      "prefillPolicyType" : 0,
      "decodeTimeMsPerReq" : 50,
      "decodePolicyType" : 0,
      "maxBatchSize" : 8,
      "maxIterTimes" : 1024,
      "maxPreemptCount" : 0,
      "supportSelectBatch" : false,
      "maxQueueDelayMicroseconds" : 5000
    }
  }
}
EOF
启动模型
所有节点操作:docker exec -it DeepSeek-R1-full bash 。注意权重权限和 rank_table_full.json 的文件权限
# Start the MindIE service daemon from the service root directory.
# The two commands are chained with && so the daemon is never launched from
# the wrong working directory if the cd fails (config.json uses relative
# paths such as logs/ and security/ — presumably resolved against this
# directory; confirm against the MindIE documentation).
cd /usr/local/Ascend/mindie/latest/mindie-service/ && ./bin/mindieservice_daemon
其他节点的启动日志
显存占用明显升高
验证测试
[root@ds-3 ~]# curl 10.82.27.3:1025/generate -X POST -d '{"inputs":"题目:糖果的数量,小明、小红和小刚一共有24颗糖果。已知:小明比小红多2颗糖果;小红比小刚少4颗糖果。问题:小明、小红和小刚分别有多少颗糖果?","parameters":{"max_new_tokens":500},"temperature":0.3, "top_p":0.3, "top_k":5, "do_sample":true, "repetition_penalty":1.05, "seed":128}'
{"generated_text":"答案:小明有8颗,小红有6颗,小刚有10颗。解析:设小红有x颗糖果,则小明有x+2颗,小刚有x+4颗。根据总数为24颗,建立方程x + (x+2) + (x+4) = 24,解得x=6。因此,小明8颗,小红6颗,小刚10颗。<|end▁of▁sentence|>"}[root@ds-3 ~]#
ST -d ‘{“inputs”:“题目:糖果的数量,小明、小红和小刚一共有24颗糖果。已知:小明比小红多2颗糖果;小红比小刚少4颗糖果。问题:小明、小红和小刚分别有多少颗糖果?”,“parameters”:{“max_new_tokens”:500},“temperature”:0.3, “top_p”:0.3, “top_k”:5, “do_sample”:true, “repetition_penalty”:1.05, “seed”:128}’
{“generated_text”:“答案:小明有8颗,小红有6颗,小刚有10颗。解析:设小红有x颗糖果,则小明有x+2颗,小刚有x+4颗。根据总数为24颗,建立方程x + (x+2) + (x+4) = 24,解得x=6。因此,小明8颗,小红6颗,小刚10颗。<|end▁of▁sentence|>”}[root@ds-3 ~]#
``