RDMA-VERBS开发示例
1. RDMA开发模式
远程直接内存访问(即RDMA)是一种直接内存访问技术,它将数据直接从一台计算机的内存传输到另一台计算机,无需双方操作系统的介入。RDMA最早在Infiniband传输网络上实现,后来业界厂家把RDMA移植到传统Ethernet以太网上,降低了RDMA的使用成本,推动RDMA技术普及。RDMA主流有三种实现方式:Infiniband、RoCE、iWARP。
VERBS开发接口支持IB/iWARP/RoCE三大RDMA协议,通过统一接口,让同一份RDMA程序可以无视底层的硬件和链路差异,运行在不同的环境中。
RDMA技术的特点这里不展开讲,网上有很多很好的资料,这篇文章主要展示使用VERBS接口RC模式最简收发的代码示例,流程如下:
主要分为以下几个阶段:
- 初始化阶段:发送端与接收端初始化QP阶段,在该阶段创建QP,并设置相应的参数;
- 地址交换阶段:通过手工、socket、cm等方式交换双方的“通信地址”,包括:lid、qpn、psn、gid;
- 连接阶段:获取到远程的地址信息,调用相应的API连接到远程的QP;
- 数据收发阶段:调用相应的接口在QP上进行数据收发;
- 资源回收阶段:当不需要再传输数据时,依次回收QP相关的资源。
2. 发送端代码
int verbs_send(int dev_idx, unsigned int length)
{
// 1. 获取设备列表
struct ibv_device** dev_list = ibv_get_device_list(NULL);
if (!dev_list)
{
perror("Failed to get IB devices list");
return -1;
}
// 2. 选择设备
struct ibv_device* target_dev = dev_list[dev_idx];
if (!target_dev)
{
fprintf(stderr, "No IB devices found\n");
return -2;
}
// 3. 打开设备
struct ibv_context* ctx = ibv_open_device(target_dev);
if (!ctx)
{
fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(target_dev));
return -3;
}
// 4. 分配保护域PD
struct ibv_pd* pd = ibv_alloc_pd(ctx);
if (!pd)
{
fprintf(stderr, "Couldn't allocate PD\n");
return -4;
}
// 5. 注册内存区域
unsigned int mr_size = 16 * 1048576;
void* mr_buffer = memalign(1024, mr_size);
struct ibv_mr* mr = ibv_reg_mr(pd, mr_buffer, mr_size, IBV_ACCESS_LOCAL_WRITE);
if (!mr)
{
fprintf(stderr, "Couldn't register MR\n");
return -5;
}
// 6. 创建CQ
unsigned int rx_depth = 500;
struct ibv_cq* cq = ibv_create_cq(ctx, rx_depth, NULL, NULL, 0);
if (!cq)
{
fprintf(stderr, "Couldn't create CQ\n");
return -6;
}
// 7. 创建QP
struct ibv_qp_init_attr init_attr = {
.send_cq = cq,
.recv_cq = cq,
.cap = {
.max_send_wr = 1,
.max_recv_wr = rx_depth,
.max_send_sge = 2,
.max_recv_sge = 2
},
.qp_type = IBV_QPT_RC
};
struct ibv_qp* qp = ibv_create_qp(pd, &init_attr);
if (!qp)
{
fprintf(stderr, "Couldn't create QP\n");
return -7;
}
// 8. 修改QP到INIT
unsigned char local_port_num = 1;
struct ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.qp_access_flags = 0,
.pkey_index = 0,
.port_num = local_port_num
};
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS);
if (ret != 0)
{
fprintf(stderr, "Failed to modify QP to INIT\n");
return -8;
}
// 查询lid信息
struct ibv_port_attr port_attr;
if (ibv_query_port(ctx, local_port_num, &port_attr))
{
fprintf(stderr, "Couldn't get port info\n");
return 1;
}
// 输出地址
int local_lid = port_attr.lid;
int local_qpn = qp->qp_num;
int local_psn = lrand48() & 0xffffff;
int local_gidx = 0;
union ibv_gid local_gid;
if (ibv_query_gid(ctx, local_port_num, local_gidx, &local_gid))
{
fprintf(stderr, "can't read sgid of index %d\n", local_gidx);
return 1;
}
printf("local address lid qpn psn gid: %04x %06x %06x %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
local_lid, qp->qp_num, local_psn,
local_gid.raw[0], local_gid.raw[1], local_gid.raw[2], local_gid.raw[3],
local_gid.raw[4], local_gid.raw[5], local_gid.raw[6], local_gid.raw[7],
local_gid.raw[8], local_gid.raw[9], local_gid.raw[10], local_gid.raw[11],
local_gid.raw[12], local_gid.raw[13], local_gid.raw[14], local_gid.raw[15]);
// 9. socket获取其他方式交换连接信息,包括:
// - lid, 16 bit
// - qpn, 24 bit
// - psn, 24 bit
unsigned int remote_lid = 0;
unsigned int remote_qpn = 0;
unsigned int remote_psn = 0;
char remote_gidstr[33];
union ibv_gid remote_gid;
printf("input remote address info: ");
scanf("%x %x %x %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x",
&remote_lid, &remote_qpn, &remote_psn,
remote_gid.raw + 0, remote_gid.raw + 1, remote_gid.raw + 2, remote_gid.raw + 3,
remote_gid.raw + 4, remote_gid.raw + 5, remote_gid.raw + 6, remote_gid.raw + 7,
remote_gid.raw + 8, remote_gid.raw + 9, remote_gid.raw + 10, remote_gid.raw + 11,
remote_gid.raw + 12, remote_gid.raw + 13, remote_gid.raw + 14, remote_gid.raw + 15);
// 10. 修改QP状态为RTR
unsigned char local_sl = 3;
struct ibv_qp_attr rtr_attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = IBV_MTU_4096,
.rq_psn = remote_psn,
.dest_qp_num = remote_qpn,
.ah_attr = {
.dlid = (uint16_t)remote_lid,
.sl = local_sl,
.src_path_bits = 0,
.is_global = 0,
.port_num = local_port_num
},
.max_dest_rd_atomic = 1,
.min_rnr_timer = 12
};
if (remote_gid.global.interface_id)
{
rtr_attr.ah_attr.is_global = 1;
rtr_attr.ah_attr.grh.hop_limit = 1;
rtr_attr.ah_attr.grh.dgid = remote_gid;
rtr_attr.ah_attr.grh.sgid_index = local_gidx;
}
ret = ibv_modify_qp(qp,
&rtr_attr,
IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER);
if (ret != 0)
{
fprintf(stderr, "Failed to modify QP to RTR %s\n", strerror(ret));
return -11;
}
// 11. 修改QP状态为RTS状态
struct ibv_qp_attr rts_attr = rtr_attr;
rts_attr.qp_state = IBV_QPS_RTS;
rts_attr.timeout = 14;
rts_attr.retry_cnt = 7;
rts_attr.rnr_retry = 7;
rts_attr.sq_psn = local_psn;
rts_attr.max_rd_atomic = 1;
ret = ibv_modify_qp(qp,
&rts_attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC);
if (ret != 0)
{
fprintf(stderr, "Failed to modify QP to RTS\n");
return -11;
}
// 12. 发送数据
unsigned int total_count = 10000000;
unsigned int flags = IBV_SEND_SIGNALED;
struct ibv_sge list[2];
list[0].length = 16;
list[0].addr = (uint64_t)mr_buffer;
list[0].lkey = mr->lkey;
sprintf((char*)list[0].addr, "%s", "data from rdma ");
memset((char*)list[0].addr, 'a', list[0].length);
list[1].length = length;
list[1].addr = (uint64_t)((char*)mr_buffer + 128);
memset((char*)list[1].addr, '\0', list[1].length);
list[1].lkey = mr->lkey;
for (int i = 0; i < total_count; ++i)
{
sprintf((char*)list[1].addr, "%d", i);
struct ibv_send_wr wr = {
.wr_id = (uint64_t)i,
.sg_list = list,
.num_sge = 2,
.opcode = IBV_WR_SEND,
.send_flags = flags,
};
struct ibv_send_wr *bad_wr;
ret = ibv_post_send(qp, &wr, &bad_wr);
if (ret != 0)
{
fprintf(stderr, "ibv_post_send failed %s.\n", strerror(errno));
}
// 13. pool cq
int ne = 0;
do
{
struct ibv_wc wc[1];
ne = ibv_poll_cq(cq, 1, wc);
if (ne < 0)
{
fprintf(stderr, "poll CQ failed %d\n", ne);
return 1;
}
} while (ne < 1);
}
return 0;
}
3. 接收端代码
int verbs_receive(int dev_idx)
{
// 1. 获取设备列表
struct ibv_device** dev_list = ibv_get_device_list(NULL);
if (!dev_list)
{
perror("Failed to get IB devices list");
return -1;
}
// 2. 选择设备
struct ibv_device* target_dev = dev_list[dev_idx];
if (!target_dev)
{
fprintf(stderr, "No IB devices found\n");
return -2;
}
// 3. 打开设备
struct ibv_context* ctx = ibv_open_device(target_dev);
if (!ctx)
{
fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(target_dev));
return -3;
}
// 4. 分配保护域PD
struct ibv_pd* pd = ibv_alloc_pd(ctx);
if (!pd)
{
fprintf(stderr, "Couldn't allocate PD\n");
return -4;
}
// 5. 注册内存区域
unsigned int mr_size = 16 * 1048576;
void* mr_buffer = memalign(1024, mr_size);
struct ibv_mr* mr = ibv_reg_mr(pd, mr_buffer, mr_size, IBV_ACCESS_LOCAL_WRITE);
if (!mr)
{
fprintf(stderr, "Couldn't register MR\n");
return -5;
}
// 6. 创建CQ
unsigned int rx_depth = 5000;
struct ibv_cq* cq = ibv_create_cq(ctx, rx_depth, NULL, NULL, 0);
if (!cq)
{
fprintf(stderr, "Couldn't create CQ\n");
return -6;
}
// 7. 创建QP
struct ibv_qp_init_attr init_attr = {
.send_cq = cq,
.recv_cq = cq,
.cap = {
.max_send_wr = 1,
.max_recv_wr = rx_depth,
.max_send_sge = 2,
.max_recv_sge = 2
},
.qp_type = IBV_QPT_RC
};
struct ibv_qp* qp = ibv_create_qp(pd, &init_attr);
if (!qp)
{
fprintf(stderr, "Couldn't create QP\n");
return -7;
}
// 8. 修改QP到INIT
unsigned char local_port_num = 1;
struct ibv_qp_attr attr = {
.qp_state = IBV_QPS_INIT,
.qp_access_flags = 0,
.pkey_index = 0,
.port_num = local_port_num
};
int ret = ibv_modify_qp(qp,
&attr,
IBV_QP_STATE |
IBV_QP_PKEY_INDEX |
IBV_QP_PORT |
IBV_QP_ACCESS_FLAGS);
if (ret != 0)
{
fprintf(stderr, "Failed to modify QP to INIT\n");
return -8;
}
// 9. POST RECV
struct ibv_sge list = {
.addr = (uint64_t)mr_buffer,
.length = mr_size,
.lkey = mr->lkey
};
struct ibv_recv_wr wr = {
.wr_id = 1,
.sg_list = &list,
.num_sge = 1,
};
struct ibv_recv_wr *bad_wr;
for (int i = 0; i < rx_depth; ++i)
{
ret = ibv_post_recv(qp, &wr, &bad_wr);
if (ret < 0)
{
fprintf(stderr, "Failed to ibv_post_recv\n");
return -9;
}
}
// 查询lid信息
struct ibv_port_attr port_attr;
if (ibv_query_port(ctx, local_port_num, &port_attr))
{
fprintf(stderr, "Couldn't get port info\n");
return 1;
}
// 输出地址
int local_lid = port_attr.lid;
int local_qpn = qp->qp_num;
int local_psn = lrand48() & 0xffffff;
int local_gidx = 0;
union ibv_gid local_gid;
if (ibv_query_gid(ctx, local_port_num, local_gidx, &local_gid))
{
fprintf(stderr, "can't read sgid of index %d\n", local_gidx);
return 1;
}
printf("local address lid qpn psn gid: %04x %06x %06x %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
local_lid, qp->qp_num, local_psn,
local_gid.raw[0], local_gid.raw[1], local_gid.raw[2], local_gid.raw[3],
local_gid.raw[4], local_gid.raw[5], local_gid.raw[6], local_gid.raw[7],
local_gid.raw[8], local_gid.raw[9], local_gid.raw[10], local_gid.raw[11],
local_gid.raw[12], local_gid.raw[13], local_gid.raw[14], local_gid.raw[15]);
// 连接到发送端
unsigned int remote_lid = 0;
unsigned int remote_qpn = 0;
unsigned int remote_psn = 0;
union ibv_gid remote_gid;
printf("input remote address info: ");
scanf("%x %x %x %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x",
&remote_lid, &remote_qpn, &remote_psn,
remote_gid.raw + 0, remote_gid.raw + 1, remote_gid.raw + 2, remote_gid.raw + 3,
remote_gid.raw + 4, remote_gid.raw + 5, remote_gid.raw + 6, remote_gid.raw + 7,
remote_gid.raw + 8, remote_gid.raw + 9, remote_gid.raw + 10, remote_gid.raw + 11,
remote_gid.raw + 12, remote_gid.raw + 13, remote_gid.raw + 14, remote_gid.raw + 15);
// 10. 修改QP为RTR
unsigned char local_sl = 3;
struct ibv_qp_attr rtr_attr = {
.qp_state = IBV_QPS_RTR,
.path_mtu = IBV_MTU_4096,
.rq_psn = remote_psn,
.dest_qp_num = remote_qpn,
.ah_attr = {
.dlid = (uint16_t)remote_lid,
.sl = local_sl,
.src_path_bits = 0,
.is_global = 0,
.port_num = local_port_num
},
.max_dest_rd_atomic = 1,
.min_rnr_timer = 12
};
if (remote_gid.global.interface_id)
{
rtr_attr.ah_attr.is_global = 1;
rtr_attr.ah_attr.grh.hop_limit = 1;
rtr_attr.ah_attr.grh.dgid = remote_gid;
rtr_attr.ah_attr.grh.sgid_index = local_gidx;
}
ret = ibv_modify_qp(qp,
&rtr_attr,
IBV_QP_STATE |
IBV_QP_AV |
IBV_QP_PATH_MTU |
IBV_QP_DEST_QPN |
IBV_QP_RQ_PSN |
IBV_QP_MAX_DEST_RD_ATOMIC |
IBV_QP_MIN_RNR_TIMER);
if (ret != 0)
{
fprintf(stderr, "Failed to modify QP to RTR %s\n", strerror(ret));
return -11;
}
// 11. 修改QP状态为RTS状态
struct ibv_qp_attr rts_attr = rtr_attr;
rts_attr.qp_state = IBV_QPS_RTS;
rts_attr.timeout = 14;
rts_attr.retry_cnt = 7;
rts_attr.rnr_retry = 7;
rts_attr.sq_psn = local_psn;
rts_attr.max_rd_atomic = 1;
ret = ibv_modify_qp(qp,
&rts_attr,
IBV_QP_STATE |
IBV_QP_TIMEOUT |
IBV_QP_RETRY_CNT |
IBV_QP_RNR_RETRY |
IBV_QP_SQ_PSN |
IBV_QP_MAX_QP_RD_ATOMIC);
if (ret != 0)
{
fprintf(stderr, "Failed to modify QP to RTS\n");
return -11;
}
unsigned int printGap = 1;
unsigned int count = 0;
static uint64_t s_lastTime = 0;
while (true)
{
struct ibv_wc wc[1];
int ne = ibv_poll_cq(cq, 1, wc);
if (ne < 0)
{
fprintf(stderr, "poll CQ failed %d\n", ne);
return 1;
}
if (ne == 0)
{
continue;
}
wc[0].status;
// 依次处理wc
if (count++ % printGap == 0)
{
printf("recv length %d content %s \n", wc[0].byte_len, mr_buffer);
}
// 9. POST RECV
ret = ibv_post_recv(qp, &wr, &bad_wr);
if (ret < 0)
{
fprintf(stderr, "Failed to ibv_post_recv\n");
return -9;
}
}
return 0;
}
4. 总控代码
#include <infiniband/verbs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <unistd.h>
// verbs_receive
// verbs_send
/* Prints the two supported command lines (receiver / sender) to stdout.
 * The program name matches the binary produced by the build instructions. */
void printHelp(void)
{
    printf("./rdma_example r \n");
    printf("./rdma_example s length\n");
}
/*
 * Entry point: "s <length>" runs the sender, any other first argument runs
 * the receiver.
 *
 * Returns -1 after printing the usage text when arguments are missing,
 * EXIT_SUCCESS when the selected role returns 0, EXIT_FAILURE otherwise.
 */
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        printHelp();
        return -1;
    }

    // Adjust to the real environment (index of the RDMA device to use)
    int dev_index = 1;
    int rc = 0;

    if (argv[1][0] == 's')
    {
        if (argc < 3)
        {
            printHelp();
            return -1;
        }
        unsigned int length = 10240;
        if (sscanf(argv[2], "%u", &length) != 1)
        {
            // Keep the historical fallback, but tell the user it happened
            length = 10240;
            fprintf(stderr, "Invalid length '%s', using default %u\n", argv[2], length);
        }
        rc = verbs_send(dev_index, length);
    }
    else
    {
        rc = verbs_receive(dev_index);
    }

    // Propagate failure to the shell instead of always exiting with 0
    return rc == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}
5. 编译运行
- 把代码保存到rdma_example.cpp(头文件放在最前面,其后依次是发送端、接收端和总控代码),根据自身的网卡信息,修改main函数中的dev_index;
- g++ rdma_example.cpp -libverbs -o rdma_example
- 接收端:./rdma_example r
- 发送端:./rdma_example s 16384
- 手动输入对端输出的地址
- 观察结果
6. 参考资料
- https://github.com/linux-rdma/rdma-core/tree/master/libibverbs/examples
- https://docs.nvidia.com/rdma-aware-networks-programming-user-manual-1-7.pdf