1、所有节点通过以太网(eth)完成OFED安装。其中管理节点(或最先加电的节点)需要安装ALL并开启opensm,其余节点只需开启openibd即可。
2、在管理节点上,依次执行如下命令:
[root@login01 ~]# mst start ##开启mst
Starting MST (Mellanox Software Tools) driver set
Loading MST PCI module - Success
[warn] mst_pciconf is already loaded, skipping
Create devices
-W- Missing "lsusb" command, skipping MTUSB devices detection
Unloading MST PCI module (unused) - Success
[root@login01 ~]# mst ib add ##扫描IB网络,将发现的带内(in-band)设备加入MST设备列表
-I- Discovering the fabric - Running: ibdiagnet -skip all
-I- Added 44 in-band devices ###发现44个带内设备
[root@login01 ~]# mst status ##检查设备状态和清单
MST modules:
------------
MST PCI module is not loaded
MST PCI configuration module loaded
MST devices:
------------
/dev/mst/mt4123_pciconf0 - PCI configuration cycles access.
domain:bus:dev.fn=0000:b1:00.0 addr.reg=88 data.reg=92 cr_bar.gw_offset=-1
Chip revision is: 00
Inband devices:
-------------------
/dev/mst/CA_MT4123_mgr_HCA-1_lid-0x0018
/dev/mst/CA_MT4123_node10_HCA-1_lid-0x0023
/dev/mst/CA_MT4123_node11_HCA-1_lid-0x0014
/dev/mst/CA_MT4123_node12_HCA-1_lid-0x0027
/dev/mst/CA_MT4123_node13_HCA-1_lid-0x000D
/dev/mst/CA_MT4123_node14_HCA-1_lid-0x0022
/dev/mst/CA_MT4123_node15_HCA-1_lid-0x0015
/dev/mst/CA_MT4123_node16_HCA-1_lid-0x002C
/dev/mst/CA_MT4123_node17_HCA-1_lid-0x000A
/dev/mst/CA_MT4123_node18_HCA-1_lid-0x002A
/dev/mst/CA_MT4123_node19_HCA-1_lid-0x000E
/dev/mst/CA_MT4123_node1_HCA-1_lid-0x0010
/dev/mst/CA_MT4123_node20_HCA-1_lid-0x0020
/dev/mst/CA_MT4123_node21_HCA-1_lid-0x0005
/dev/mst/CA_MT4123_node22_HCA-1_lid-0x001E
/dev/mst/CA_MT4123_node23_HCA-1_lid-0x0011
/dev/mst/CA_MT4123_node24_HCA-1_lid-0x0028
/dev/mst/CA_MT4123_node25_HCA-1_lid-0x000F
/dev/mst/CA_MT4123_node26_HCA-1_lid-0x0025
/dev/mst/CA_MT4123_node27_HCA-1_lid-0x0012
/dev/mst/CA_MT4123_node28_HCA-1_lid-0x0021
/dev/mst/CA_MT4123_node29_HCA-1_lid-0x0007
/dev/mst/CA_MT4123_node2_HCA-1_lid-0x001F
/dev/mst/CA_MT4123_node30_HCA-1_lid-0x0026
/dev/mst/CA_MT4123_node31_HCA-1_lid-0x001B
/dev/mst/CA_MT4123_node32_HCA-1_lid-0x0016
/dev/mst/CA_MT4123_node3_HCA-1_lid-0x000C
/dev/mst/CA_MT4123_node4_HCA-1_lid-0x002B
/dev/mst/CA_MT4123_node5_HCA-1_lid-0x0008
/dev/mst/CA_MT4123_node6_HCA-1_lid-0x0029
/dev/mst/CA_MT4123_node7_HCA-1_lid-0x0013
/dev/mst/CA_MT4123_node8_HCA-1_lid-0x0024
/dev/mst/CA_MT4123_node9_HCA-1_lid-0x0006
/dev/mst/CA_MT4123_fat1_HCA-1_lid-0x0002
/dev/mst/CA_MT4123_gpu1_HCA-1_lid-0x0003
/dev/mst/CA_MT4123_gpu2_HCA-1_lid-0x001D
/dev/mst/CA_MT4123_gpu3_HCA-1_lid-0x001C
/dev/mst/CA_MT4123_gpu4_HCA-1_lid-0x0004
/dev/mst/CA_MT4123_localhost_mlx5_0_lid-0x0019 ##主机名为localhost,且驱动未更新
/dev/mst/CA_MT4123_localhost_mlx5_0_lid-0x001A
/dev/mst/CA_MT4123_login01_HCA-1_lid-0x0001
/dev/mst/CA_MT4123_mgs3_HCA-1_lid-0x0009
/dev/mst/CA_MT53001_Mellanox_Technologies_Aggregation_Node_lid-0x0017
/dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B ##交换机
[root@login01 ~]# mlxconfig -d /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B query SPLIT_MODE ##查询SPLIT_MODE状态,默认为0(未开启)
Device #1:
----------
Device type: Quantum
Name: MQM8790-HS2X_Ax
Description: Mellanox Quantum(TM) HDR InfiniBand Switch; 40 QSFP56 ports; 2 Power Supplies (AC); unmanaged; standard depth; P2C airflow; Rail Kit; RoHS6
Device: /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B
Configurations: Next Boot
SPLIT_MODE none
[root@login01 ~]# mlxconfig -d /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B set SPLIT_MODE=1 ##开启SPLIT_MODE
Device #1:
----------
Device type: Quantum
Name: MQM8790-HS2X_Ax
Description: Mellanox Quantum(TM) HDR InfiniBand Switch; 40 QSFP56 ports; 2 Power Supplies (AC); unmanaged; standard depth; P2C airflow; Rail Kit; RoHS6
Device: /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B
Configurations: Next Boot
SPLIT_MODE SPLIT_2X(1)
[root@login01 ~]# mlxconfig -d /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B set SPLIT_PORT[1..38]=1 ##将1~38号端口配置为split状态(SPLIT_2X,即HDR100模式),39、40号端口保持HDR不变。
Device #1:
----------
Device type: Quantum
Name: MQM8790-HS2X_Ax
Description: Mellanox Quantum(TM) HDR InfiniBand Switch; 40 QSFP56 ports; 2 Power Supplies (AC); unmanaged; standard depth; P2C airflow; Rail Kit; RoHS6
Device: /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B
Configurations: Next Boot
SPLIT_PORT[1] SPLIT_2X(1)
SPLIT_PORT[2] SPLIT_2X(1)
SPLIT_PORT[3] SPLIT_2X(1)
SPLIT_PORT[4] SPLIT_2X(1)
SPLIT_PORT[5] SPLIT_2X(1)
SPLIT_PORT[6] SPLIT_2X(1)
SPLIT_PORT[7] SPLIT_2X(1)
SPLIT_PORT[8] SPLIT_2X(1)
SPLIT_PORT[9] SPLIT_2X(1)
SPLIT_PORT[10] SPLIT_2X(1)
SPLIT_PORT[11] SPLIT_2X(1)
SPLIT_PORT[12] SPLIT_2X(1)
SPLIT_PORT[13] SPLIT_2X(1)
SPLIT_PORT[14] SPLIT_2X(1)
SPLIT_PORT[15] SPLIT_2X(1)
SPLIT_PORT[16] SPLIT_2X(1)
SPLIT_PORT[17] SPLIT_2X(1)
SPLIT_PORT[18] SPLIT_2X(1)
SPLIT_PORT[19] SPLIT_2X(1)
SPLIT_PORT[20] SPLIT_2X(1)
SPLIT_PORT[21] SPLIT_2X(1)
SPLIT_PORT[22] SPLIT_2X(1)
SPLIT_PORT[23] SPLIT_2X(1)
SPLIT_PORT[24] SPLIT_2X(1)
SPLIT_PORT[25] SPLIT_2X(1)
SPLIT_PORT[26] SPLIT_2X(1)
SPLIT_PORT[27] SPLIT_2X(1)
SPLIT_PORT[28] SPLIT_2X(1)
SPLIT_PORT[29] SPLIT_2X(1)
SPLIT_PORT[30] SPLIT_2X(1)
SPLIT_PORT[31] SPLIT_2X(1)
SPLIT_PORT[32] SPLIT_2X(1)
SPLIT_PORT[33] SPLIT_2X(1)
SPLIT_PORT[34] SPLIT_2X(1)
SPLIT_PORT[35] SPLIT_2X(1)
SPLIT_PORT[36] SPLIT_2X(1)
SPLIT_PORT[37] SPLIT_2X(1)
SPLIT_PORT[38] SPLIT_2X(1)
SPLIT_PORT[39] NO_SPLIT_SUPPORT(0)
SPLIT_PORT[40] NO_SPLIT_SUPPORT(0)
[root@login01 ~]# flint -d /dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B swreset ##重启IB交换机,使上述“Next Boot”配置生效
交换机重启完成后,继续执行:
[root@login01 ~]# mst -h ##查看mst其它功能
Usage:
/usr/bin/mst {start|stop|status|remote|server|restart|save|load|rm|add|help|version}
Type "/usr/bin/mst help" for detailed help
[root@login01 ~]# mst save ##保存配置
[root@login01 ~]# mst status ##再次查询状态
MST modules:
------------
MST PCI module is not loaded
MST PCI configuration module loaded
MST devices:
------------
/dev/mst/mt4123_pciconf0 - PCI configuration cycles access.
domain:bus:dev.fn=0000:b1:00.0 addr.reg=88 data.reg=92 cr_bar.gw_offset=-1
Chip revision is: 00
Inband devices:
-------------------
/dev/mst/CA_MT4123_admin01_HCA-1_lid-0x0018
/dev/mst/CA_MT4123_node10_HCA-1_lid-0x0023
/dev/mst/CA_MT4123_node11_HCA-1_lid-0x0014
/dev/mst/CA_MT4123_node12_HCA-1_lid-0x0027
/dev/mst/CA_MT4123_node13_HCA-1_lid-0x000D
/dev/mst/CA_MT4123_node14_HCA-1_lid-0x0022
/dev/mst/CA_MT4123_node15_HCA-1_lid-0x0015
/dev/mst/CA_MT4123_node16_HCA-1_lid-0x002C
/dev/mst/CA_MT4123_node17_HCA-1_lid-0x000A
/dev/mst/CA_MT4123_node18_HCA-1_lid-0x002A
/dev/mst/CA_MT4123_node19_HCA-1_lid-0x000E
/dev/mst/CA_MT4123_node1_HCA-1_lid-0x0010
/dev/mst/CA_MT4123_node20_HCA-1_lid-0x0020
/dev/mst/CA_MT4123_node21_HCA-1_lid-0x0005
/dev/mst/CA_MT4123_node22_HCA-1_lid-0x001E
/dev/mst/CA_MT4123_node23_HCA-1_lid-0x0011
/dev/mst/CA_MT4123_node24_HCA-1_lid-0x0028
/dev/mst/CA_MT4123_node25_HCA-1_lid-0x000F
/dev/mst/CA_MT4123_node26_HCA-1_lid-0x0025
/dev/mst/CA_MT4123_node27_HCA-1_lid-0x0012
/dev/mst/CA_MT4123_node28_HCA-1_lid-0x0021
/dev/mst/CA_MT4123_node29_HCA-1_lid-0x0007
/dev/mst/CA_MT4123_node2_HCA-1_lid-0x001F
/dev/mst/CA_MT4123_node30_HCA-1_lid-0x0026
/dev/mst/CA_MT4123_node31_HCA-1_lid-0x001B
/dev/mst/CA_MT4123_node32_HCA-1_lid-0x0016
/dev/mst/CA_MT4123_node3_HCA-1_lid-0x000C
/dev/mst/CA_MT4123_node4_HCA-1_lid-0x002B
/dev/mst/CA_MT4123_node5_HCA-1_lid-0x0008
/dev/mst/CA_MT4123_node6_HCA-1_lid-0x0029
/dev/mst/CA_MT4123_node7_HCA-1_lid-0x0013
/dev/mst/CA_MT4123_node8_HCA-1_lid-0x0024
/dev/mst/CA_MT4123_node9_HCA-1_lid-0x0006
/dev/mst/CA_MT4123_fat1_HCA-1_lid-0x0002
/dev/mst/CA_MT4123_gpu1_HCA-1_lid-0x0003
/dev/mst/CA_MT4123_gpu2_HCA-1_lid-0x001D
/dev/mst/CA_MT4123_gpu3_HCA-1_lid-0x001C
/dev/mst/CA_MT4123_gpu4_HCA-1_lid-0x0004
/dev/mst/CA_MT4123_localhost_mlx5_0_lid-0x0019
/dev/mst/CA_MT4123_localhost_mlx5_0_lid-0x001A
/dev/mst/CA_MT4123_login01_HCA-1_lid-0x0001
/dev/mst/CA_MT4123_mgs3_HCA-1_lid-0x0009
/dev/mst/CA_MT53001_Mellanox_Technologies_Aggregation_Node_lid-0x0017
/dev/mst/SW_MT54000_Quantum_Mellanox_Technologies_lid-0x000B
[root@login01 ~]# mst stop ##停止MST服务(卸载MST驱动模块)
Stopping MST (Mellanox Software Tools) driver set
Unloading MST PCI configuration module - Success
[root@login01 ~]# ibhosts ##列出IB网络中的主机节点,检查各节点是否正常接入