RDMA-VERBS开发示例

RDMA-VERBS开发示例

1. RDMA开发模式

远程直接内存访问(即RDMA)是一种直接内存访问技术,它将数据直接从一台计算机的内存传输到另一台计算机,无需双方操作系统的介入。RDMA最早在Infiniband传输网络上实现,后来业界厂家把RDMA移植到传统Ethernet以太网上,降低了RDMA的使用成本,推动RDMA技术普及。RDMA主流有三种实现方式:Infiniband、RoCE、iWARP。

VERBS开发接口支持IB/iWARP/RoCE三大RDMA协议,通过统一接口,让同一份RDMA程序可以无视底层的硬件和链路差异,运行在不同的环境中。

RDMA技术的特点这里不展开讲,网上有很多很好的资料,这篇文章主要展示使用VERBS接口RC模式最简收发的代码示例,流程如下:

RDMA-RC开发流程

主要分为以下几个阶段:

  • 初始化阶段:发送端与接收端初始化QP阶段,在该阶段创建QP,并设置相应的参数;
  • 地址交换阶段:通过手工、socket、cm等其他方式交换双方的“通信地址”,包括:lid、qpn、psn、gid;
  • 连接阶段:获取到远程的地址信息,调用相应的API连接到远程的QP;
  • 数据收发阶段:调用相应的接口在QP上进行数据收发;
  • 资源回收阶段:当不需要再传输数据时,依次回收QP相关的资源。

2. 发送端代码

/* Every verbs/heap resource the sender acquires; NULL members are "not acquired". */
struct verbs_send_res
{
    struct ibv_device** dev_list;
    struct ibv_context* ctx;
    struct ibv_pd* pd;
    void* buf;
    struct ibv_mr* mr;
    struct ibv_cq* cq;
    struct ibv_qp* qp;
};

/* Releases whatever verbs_send_run() managed to acquire, in reverse order of acquisition. */
static void verbs_send_release(struct verbs_send_res* res)
{
    if (res->qp && ibv_destroy_qp(res->qp))
    {
        fprintf(stderr, "Couldn't destroy QP\n");
    }
    if (res->cq && ibv_destroy_cq(res->cq))
    {
        fprintf(stderr, "Couldn't destroy CQ\n");
    }
    if (res->mr && ibv_dereg_mr(res->mr))
    {
        fprintf(stderr, "Couldn't deregister MR\n");
    }
    free(res->buf);
    if (res->pd && ibv_dealloc_pd(res->pd))
    {
        fprintf(stderr, "Couldn't deallocate PD\n");
    }
    if (res->ctx && ibv_close_device(res->ctx))
    {
        fprintf(stderr, "Couldn't release context\n");
    }
    if (res->dev_list)
    {
        ibv_free_device_list(res->dev_list);
    }
}

/* Performs the whole send flow; acquired resources are recorded in *res so the caller can release them. */
static int verbs_send_run(struct verbs_send_res* res, int dev_idx, unsigned int length)
{
    const unsigned int mr_size = 16 * 1048576;
    const unsigned int header_len = 16;
    const unsigned int payload_offset = 128;

    // The second SGE starts payload_offset bytes into the registered buffer;
    // refuse lengths that would run past the end of that buffer.
    if (length > mr_size - payload_offset)
    {
        fprintf(stderr, "length %u exceeds the maximum payload of %u bytes\n",
            length, mr_size - payload_offset);
        return -10;
    }
    // 1. Get the device list (and its size, so dev_idx can be validated)
    int num_devices = 0;
    res->dev_list = ibv_get_device_list(&num_devices);
    if (!res->dev_list)
    {
        perror("Failed to get IB devices list");
        return -1;
    }
    // 2. Select the device
    if (dev_idx < 0 || dev_idx >= num_devices || !res->dev_list[dev_idx])
    {
        fprintf(stderr, "No IB devices found\n");
        return -2;
    }
    struct ibv_device* target_dev = res->dev_list[dev_idx];
    // 3. Open the device
    res->ctx = ibv_open_device(target_dev);
    if (!res->ctx)
    {
        fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(target_dev));
        return -3;
    }
    // 4. Allocate the protection domain (PD)
    res->pd = ibv_alloc_pd(res->ctx);
    if (!res->pd)
    {
        fprintf(stderr, "Couldn't allocate PD\n");
        return -4;
    }
    // 5. Allocate and register the memory region
    res->buf = memalign(1024, mr_size);
    if (!res->buf)
    {
        fprintf(stderr, "Couldn't allocate %u bytes for the MR\n", mr_size);
        return -5;
    }
    res->mr = ibv_reg_mr(res->pd, res->buf, mr_size, IBV_ACCESS_LOCAL_WRITE);
    if (!res->mr)
    {
        fprintf(stderr, "Couldn't register MR\n");
        return -5;
    }
    // 6. Create the CQ
    unsigned int rx_depth = 500;
    res->cq = ibv_create_cq(res->ctx, rx_depth, NULL, NULL, 0);
    if (!res->cq)
    {
        fprintf(stderr, "Couldn't create CQ\n");
        return -6;
    }
    // 7. Create the QP
    struct ibv_qp_init_attr init_attr = {
        .send_cq = res->cq,
        .recv_cq = res->cq,
        .cap = {
            .max_send_wr = 1,
            .max_recv_wr = rx_depth,
            .max_send_sge = 2,
            .max_recv_sge = 2
        },
        .qp_type = IBV_QPT_RC
    };
    res->qp = ibv_create_qp(res->pd, &init_attr);
    if (!res->qp)
    {
        fprintf(stderr, "Couldn't create QP\n");
        return -7;
    }
    // 8. Move the QP to INIT
    unsigned char local_port_num = 1;
    struct ibv_qp_attr attr = {
        .qp_state = IBV_QPS_INIT,
        .qp_access_flags = 0,
        .pkey_index = 0,
        .port_num = local_port_num
    };
    int ret = ibv_modify_qp(res->qp,
        &attr,
        IBV_QP_STATE |
        IBV_QP_PKEY_INDEX |
        IBV_QP_PORT |
        IBV_QP_ACCESS_FLAGS);
    if (ret != 0)
    {
        fprintf(stderr, "Failed to modify QP to INIT\n");
        return -8;
    }
    // Query the port to learn the local LID
    struct ibv_port_attr port_attr;
    if (ibv_query_port(res->ctx, local_port_num, &port_attr))
    {
        fprintf(stderr, "Couldn't get port info\n");
        return 1;
    }
    // Print the local address so it can be typed into the peer
    int local_lid = port_attr.lid;
    int local_psn = lrand48() & 0xffffff;
    int local_gidx = 0;
    union ibv_gid local_gid;
    if (ibv_query_gid(res->ctx, local_port_num, local_gidx, &local_gid))
    {
        fprintf(stderr, "can't read sgid of index %d\n", local_gidx);
        return 1;
    }
    printf("local address lid qpn psn gid: %04x %06x %06x %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
        local_lid, res->qp->qp_num, local_psn,
        local_gid.raw[0], local_gid.raw[1], local_gid.raw[2], local_gid.raw[3],
        local_gid.raw[4], local_gid.raw[5], local_gid.raw[6], local_gid.raw[7],
        local_gid.raw[8], local_gid.raw[9], local_gid.raw[10], local_gid.raw[11],
        local_gid.raw[12], local_gid.raw[13], local_gid.raw[14], local_gid.raw[15]);
    // 9. Exchange connection info (read from stdin here; a socket or another
    //    channel would work as well):
    // - lid, 16 bit
    // - qpn, 24 bit
    // - psn, 24 bit
    // - gid, 16 bytes printed as 32 hex digits
    unsigned int remote_lid = 0;
    unsigned int remote_qpn = 0;
    unsigned int remote_psn = 0;
    union ibv_gid remote_gid;
    memset(&remote_gid, 0, sizeof(remote_gid));
    printf("input remote address info: ");
    fflush(stdout);
    // %2hhx stores exactly one byte per conversion; a plain %x would write an
    // unsigned int through each one-byte raw[] slot.
    int parsed = scanf("%x %x %x %2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx",
        &remote_lid, &remote_qpn, &remote_psn,
        &remote_gid.raw[0], &remote_gid.raw[1], &remote_gid.raw[2], &remote_gid.raw[3],
        &remote_gid.raw[4], &remote_gid.raw[5], &remote_gid.raw[6], &remote_gid.raw[7],
        &remote_gid.raw[8], &remote_gid.raw[9], &remote_gid.raw[10], &remote_gid.raw[11],
        &remote_gid.raw[12], &remote_gid.raw[13], &remote_gid.raw[14], &remote_gid.raw[15]);
    if (parsed != 19)
    {
        fprintf(stderr, "Couldn't parse remote address (expected: lid qpn psn gid)\n");
        return -9;
    }
    // 10. Move the QP to RTR
    // NOTE(review): path_mtu is fixed at 4096; confirm the link actually supports it.
    unsigned char local_sl = 3;
    struct ibv_qp_attr rtr_attr = {
        .qp_state = IBV_QPS_RTR,
        .path_mtu = IBV_MTU_4096,
        .rq_psn = remote_psn,
        .dest_qp_num = remote_qpn,
        .ah_attr = {
            .dlid = (uint16_t)remote_lid,
            .sl = local_sl,
            .src_path_bits = 0,
            .is_global = 0,
            .port_num = local_port_num
        },
        .max_dest_rd_atomic = 1,
        .min_rnr_timer = 12
    };
    // A non-zero interface id in the peer GID switches to global (GRH) addressing.
    if (remote_gid.global.interface_id)
    {
        rtr_attr.ah_attr.is_global = 1;
        rtr_attr.ah_attr.grh.hop_limit = 1;
        rtr_attr.ah_attr.grh.dgid = remote_gid;
        rtr_attr.ah_attr.grh.sgid_index = local_gidx;
    }
    ret = ibv_modify_qp(res->qp,
        &rtr_attr,
        IBV_QP_STATE |
        IBV_QP_AV |
        IBV_QP_PATH_MTU |
        IBV_QP_DEST_QPN |
        IBV_QP_RQ_PSN |
        IBV_QP_MAX_DEST_RD_ATOMIC |
        IBV_QP_MIN_RNR_TIMER);
    if (ret != 0)
    {
        fprintf(stderr, "Failed to modify QP to RTR %s\n", strerror(ret));
        return -11;
    }
    // 11. Move the QP to RTS
    struct ibv_qp_attr rts_attr = rtr_attr;
    rts_attr.qp_state = IBV_QPS_RTS;
    rts_attr.timeout = 14;
    rts_attr.retry_cnt = 7;
    rts_attr.rnr_retry = 7;
    rts_attr.sq_psn = local_psn;
    rts_attr.max_rd_atomic = 1;
    ret = ibv_modify_qp(res->qp,
        &rts_attr,
        IBV_QP_STATE |
        IBV_QP_TIMEOUT |
        IBV_QP_RETRY_CNT |
        IBV_QP_RNR_RETRY |
        IBV_QP_SQ_PSN |
        IBV_QP_MAX_QP_RD_ATOMIC);
    if (ret != 0)
    {
        fprintf(stderr, "Failed to modify QP to RTS\n");
        return -11;
    }
    // 12. Send the data: every message is a 16-byte header of 'a' followed by
    //     `length` payload bytes that begin with the decimal message counter.
    const unsigned int total_count = 10000000;
    char* header = (char*)res->buf;
    char* payload = header + payload_offset;
    memset(header, 'a', header_len);
    memset(payload, '\0', length);
    struct ibv_sge list[2];
    list[0].addr = (uint64_t)(uintptr_t)header;
    list[0].length = header_len;
    list[0].lkey = res->mr->lkey;
    list[1].addr = (uint64_t)(uintptr_t)payload;
    list[1].length = length;
    list[1].lkey = res->mr->lkey;
    for (unsigned int i = 0; i < total_count; ++i)
    {
        // Bounded by the space left in the buffer, not by `length`: only the
        // first `length` bytes of what is written here are transmitted.
        snprintf(payload, mr_size - payload_offset, "%u", i);
        struct ibv_send_wr wr = {
            .wr_id = (uint64_t)i,
            .sg_list = list,
            .num_sge = 2,
            .opcode = IBV_WR_SEND,
            .send_flags = IBV_SEND_SIGNALED,
        };
        struct ibv_send_wr* bad_wr = NULL;
        // ibv_post_send reports the error code through its return value.
        ret = ibv_post_send(res->qp, &wr, &bad_wr);
        if (ret != 0)
        {
            // Nothing was queued, so waiting for a completion would never end.
            fprintf(stderr, "ibv_post_send failed %s.\n", strerror(ret));
            return -12;
        }
        // 13. Poll the CQ until the send completes (max_send_wr is 1, so the
        //     next send can only be posted after this one has completed)
        struct ibv_wc wc;
        int ne = 0;
        do
        {
            ne = ibv_poll_cq(res->cq, 1, &wc);
            if (ne < 0)
            {
                fprintf(stderr, "poll CQ failed %d\n", ne);
                return 1;
            }
        } while (ne < 1);
        if (wc.status != IBV_WC_SUCCESS)
        {
            fprintf(stderr, "send %u completed with error: %s (%d)\n",
                i, ibv_wc_status_str(wc.status), (int)wc.status);
            return -13;
        }
    }
    return 0;
}

/*
 * Minimal RC sender: opens device number dev_idx, creates a QP, prints the
 * local address, reads the peer address from stdin, connects, and sends
 * 10000000 messages, waiting for each completion before posting the next.
 * All resources are released before returning.
 *
 * dev_idx: index into the list returned by ibv_get_device_list().
 * length:  payload bytes per message (second SGE); at most 16 MiB - 128.
 *
 * Returns 0 on success. Failure codes:
 *   -1 device list, -2 bad dev_idx, -3 open device, -4 PD,
 *   -5 buffer/MR, -6 CQ, -7 QP, -8 QP->INIT,
 *   -9 unparsable peer address, -10 length too large,
 *   -11 QP->RTR or QP->RTS, -12 ibv_post_send, -13 failed work completion,
 *    1 port/GID query or CQ poll failure.
 */
int verbs_send(int dev_idx, unsigned int length)
{
    struct verbs_send_res res;
    memset(&res, 0, sizeof(res));
    int rc = verbs_send_run(&res, dev_idx, length);
    verbs_send_release(&res);
    return rc;
}

3. 接收端代码

/* Every verbs/heap resource the receiver acquires; NULL members are "not acquired". */
struct verbs_recv_res
{
    struct ibv_device** dev_list;
    struct ibv_context* ctx;
    struct ibv_pd* pd;
    void* buf;
    struct ibv_mr* mr;
    struct ibv_cq* cq;
    struct ibv_qp* qp;
};

/* Releases whatever verbs_recv_run() managed to acquire, in reverse order of acquisition. */
static void verbs_recv_release(struct verbs_recv_res* res)
{
    if (res->qp && ibv_destroy_qp(res->qp))
    {
        fprintf(stderr, "Couldn't destroy QP\n");
    }
    if (res->cq && ibv_destroy_cq(res->cq))
    {
        fprintf(stderr, "Couldn't destroy CQ\n");
    }
    if (res->mr && ibv_dereg_mr(res->mr))
    {
        fprintf(stderr, "Couldn't deregister MR\n");
    }
    free(res->buf);
    if (res->pd && ibv_dealloc_pd(res->pd))
    {
        fprintf(stderr, "Couldn't deallocate PD\n");
    }
    if (res->ctx && ibv_close_device(res->ctx))
    {
        fprintf(stderr, "Couldn't release context\n");
    }
    if (res->dev_list)
    {
        ibv_free_device_list(res->dev_list);
    }
}

/* Performs the whole receive flow; acquired resources are recorded in *res so the caller can release them. */
static int verbs_recv_run(struct verbs_recv_res* res, int dev_idx)
{
    const unsigned int mr_size = 16 * 1048576;

    // 1. Get the device list (and its size, so dev_idx can be validated)
    int num_devices = 0;
    res->dev_list = ibv_get_device_list(&num_devices);
    if (!res->dev_list)
    {
        perror("Failed to get IB devices list");
        return -1;
    }
    // 2. Select the device
    if (dev_idx < 0 || dev_idx >= num_devices || !res->dev_list[dev_idx])
    {
        fprintf(stderr, "No IB devices found\n");
        return -2;
    }
    struct ibv_device* target_dev = res->dev_list[dev_idx];
    // 3. Open the device
    res->ctx = ibv_open_device(target_dev);
    if (!res->ctx)
    {
        fprintf(stderr, "Couldn't get context for %s\n", ibv_get_device_name(target_dev));
        return -3;
    }
    // 4. Allocate the protection domain (PD)
    res->pd = ibv_alloc_pd(res->ctx);
    if (!res->pd)
    {
        fprintf(stderr, "Couldn't allocate PD\n");
        return -4;
    }
    // 5. Allocate and register the memory region
    res->buf = memalign(1024, mr_size);
    if (!res->buf)
    {
        fprintf(stderr, "Couldn't allocate %u bytes for the MR\n", mr_size);
        return -5;
    }
    res->mr = ibv_reg_mr(res->pd, res->buf, mr_size, IBV_ACCESS_LOCAL_WRITE);
    if (!res->mr)
    {
        fprintf(stderr, "Couldn't register MR\n");
        return -5;
    }
    // 6. Create the CQ
    unsigned int rx_depth = 5000;
    res->cq = ibv_create_cq(res->ctx, rx_depth, NULL, NULL, 0);
    if (!res->cq)
    {
        fprintf(stderr, "Couldn't create CQ\n");
        return -6;
    }
    // 7. Create the QP
    struct ibv_qp_init_attr init_attr = {
        .send_cq = res->cq,
        .recv_cq = res->cq,
        .cap = {
            .max_send_wr = 1,
            .max_recv_wr = rx_depth,
            .max_send_sge = 2,
            .max_recv_sge = 2
        },
        .qp_type = IBV_QPT_RC
    };
    res->qp = ibv_create_qp(res->pd, &init_attr);
    if (!res->qp)
    {
        fprintf(stderr, "Couldn't create QP\n");
        return -7;
    }
    // 8. Move the QP to INIT
    unsigned char local_port_num = 1;
    struct ibv_qp_attr attr = {
        .qp_state = IBV_QPS_INIT,
        .qp_access_flags = 0,
        .pkey_index = 0,
        .port_num = local_port_num
    };
    int ret = ibv_modify_qp(res->qp,
        &attr,
        IBV_QP_STATE |
        IBV_QP_PKEY_INDEX |
        IBV_QP_PORT |
        IBV_QP_ACCESS_FLAGS);
    if (ret != 0)
    {
        fprintf(stderr, "Failed to modify QP to INIT\n");
        return -8;
    }
    // 9. Pre-post rx_depth receive requests.
    // Every request points at the same buffer (the whole MR), so each incoming
    // message overwrites the previous one.
    struct ibv_sge list = {
        .addr = (uint64_t)(uintptr_t)res->buf,
        .length = mr_size,
        .lkey = res->mr->lkey
    };
    struct ibv_recv_wr wr = {
        .wr_id = 1,
        .sg_list = &list,
        .num_sge = 1,
    };
    struct ibv_recv_wr* bad_wr = NULL;
    for (unsigned int i = 0; i < rx_depth; ++i)
    {
        // ibv_post_recv returns 0 on success and an errno value on failure,
        // so the failure test must be "!= 0", not "< 0".
        ret = ibv_post_recv(res->qp, &wr, &bad_wr);
        if (ret != 0)
        {
            fprintf(stderr, "Failed to ibv_post_recv\n");
            return -9;
        }
    }
    // Query the port to learn the local LID
    struct ibv_port_attr port_attr;
    if (ibv_query_port(res->ctx, local_port_num, &port_attr))
    {
        fprintf(stderr, "Couldn't get port info\n");
        return 1;
    }
    // Print the local address so it can be typed into the peer
    int local_lid = port_attr.lid;
    int local_psn = lrand48() & 0xffffff;
    int local_gidx = 0;
    union ibv_gid local_gid;
    if (ibv_query_gid(res->ctx, local_port_num, local_gidx, &local_gid))
    {
        fprintf(stderr, "can't read sgid of index %d\n", local_gidx);
        return 1;
    }
    printf("local address lid qpn psn gid: %04x %06x %06x %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
        local_lid, res->qp->qp_num, local_psn,
        local_gid.raw[0], local_gid.raw[1], local_gid.raw[2], local_gid.raw[3],
        local_gid.raw[4], local_gid.raw[5], local_gid.raw[6], local_gid.raw[7],
        local_gid.raw[8], local_gid.raw[9], local_gid.raw[10], local_gid.raw[11],
        local_gid.raw[12], local_gid.raw[13], local_gid.raw[14], local_gid.raw[15]);
    // Read the sender's address (lid qpn psn gid) from stdin
    unsigned int remote_lid = 0;
    unsigned int remote_qpn = 0;
    unsigned int remote_psn = 0;
    union ibv_gid remote_gid;
    memset(&remote_gid, 0, sizeof(remote_gid));
    printf("input remote address info: ");
    fflush(stdout);
    // %2hhx stores exactly one byte per conversion; a plain %x would write an
    // unsigned int through each one-byte raw[] slot.
    int parsed = scanf("%x %x %x %2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx%2hhx",
        &remote_lid, &remote_qpn, &remote_psn,
        &remote_gid.raw[0], &remote_gid.raw[1], &remote_gid.raw[2], &remote_gid.raw[3],
        &remote_gid.raw[4], &remote_gid.raw[5], &remote_gid.raw[6], &remote_gid.raw[7],
        &remote_gid.raw[8], &remote_gid.raw[9], &remote_gid.raw[10], &remote_gid.raw[11],
        &remote_gid.raw[12], &remote_gid.raw[13], &remote_gid.raw[14], &remote_gid.raw[15]);
    if (parsed != 19)
    {
        fprintf(stderr, "Couldn't parse remote address (expected: lid qpn psn gid)\n");
        return -10;
    }
    // 10. Move the QP to RTR
    // NOTE(review): path_mtu is fixed at 4096; confirm the link actually supports it.
    unsigned char local_sl = 3;
    struct ibv_qp_attr rtr_attr = {
        .qp_state = IBV_QPS_RTR,
        .path_mtu = IBV_MTU_4096,
        .rq_psn = remote_psn,
        .dest_qp_num = remote_qpn,
        .ah_attr = {
            .dlid = (uint16_t)remote_lid,
            .sl = local_sl,
            .src_path_bits = 0,
            .is_global = 0,
            .port_num = local_port_num
        },
        .max_dest_rd_atomic = 1,
        .min_rnr_timer = 12
    };
    // A non-zero interface id in the peer GID switches to global (GRH) addressing.
    if (remote_gid.global.interface_id)
    {
        rtr_attr.ah_attr.is_global = 1;
        rtr_attr.ah_attr.grh.hop_limit = 1;
        rtr_attr.ah_attr.grh.dgid = remote_gid;
        rtr_attr.ah_attr.grh.sgid_index = local_gidx;
    }
    ret = ibv_modify_qp(res->qp,
        &rtr_attr,
        IBV_QP_STATE |
        IBV_QP_AV |
        IBV_QP_PATH_MTU |
        IBV_QP_DEST_QPN |
        IBV_QP_RQ_PSN |
        IBV_QP_MAX_DEST_RD_ATOMIC |
        IBV_QP_MIN_RNR_TIMER);
    if (ret != 0)
    {
        fprintf(stderr, "Failed to modify QP to RTR %s\n", strerror(ret));
        return -11;
    }
    // 11. Move the QP to RTS
    struct ibv_qp_attr rts_attr = rtr_attr;
    rts_attr.qp_state = IBV_QPS_RTS;
    rts_attr.timeout = 14;
    rts_attr.retry_cnt = 7;
    rts_attr.rnr_retry = 7;
    rts_attr.sq_psn = local_psn;
    rts_attr.max_rd_atomic = 1;
    ret = ibv_modify_qp(res->qp,
        &rts_attr,
        IBV_QP_STATE |
        IBV_QP_TIMEOUT |
        IBV_QP_RETRY_CNT |
        IBV_QP_RNR_RETRY |
        IBV_QP_SQ_PSN |
        IBV_QP_MAX_QP_RD_ATOMIC);
    if (ret != 0)
    {
        fprintf(stderr, "Failed to modify QP to RTS\n");
        return -11;
    }
    // 12. Receive loop: runs until an error occurs.
    unsigned int printGap = 1;   // print every printGap-th message
    unsigned int count = 0;
    for (;;)
    {
        struct ibv_wc wc;
        int ne = ibv_poll_cq(res->cq, 1, &wc);
        if (ne < 0)
        {
            fprintf(stderr, "poll CQ failed %d\n", ne);
            return 1;
        }
        if (ne == 0)
        {
            continue;
        }
        if (wc.status != IBV_WC_SUCCESS)
        {
            fprintf(stderr, "receive completed with error: %s (%d)\n",
                ibv_wc_status_str(wc.status), (int)wc.status);
            return -12;
        }
        // Handle the completion
        if (count++ % printGap == 0)
        {
            // The precision bounds the print to the bytes actually received,
            // in case the message carries no NUL terminator.
            printf("recv length %u content %.*s \n",
                wc.byte_len, (int)wc.byte_len, (const char*)res->buf);
        }
        // Re-post the consumed receive request
        ret = ibv_post_recv(res->qp, &wr, &bad_wr);
        if (ret != 0)
        {
            fprintf(stderr, "Failed to ibv_post_recv\n");
            return -9;
        }
    }
}

/*
 * Minimal RC receiver: opens device number dev_idx, creates a QP, pre-posts
 * receive requests, prints the local address, reads the peer address from
 * stdin, connects, then prints every received message in an endless loop.
 * It only returns when something fails; all resources are released first.
 *
 * dev_idx: index into the list returned by ibv_get_device_list().
 *
 * Failure codes:
 *   -1 device list, -2 bad dev_idx, -3 open device, -4 PD,
 *   -5 buffer/MR, -6 CQ, -7 QP, -8 QP->INIT, -9 ibv_post_recv,
 *   -10 unparsable peer address, -11 QP->RTR or QP->RTS,
 *   -12 failed work completion, 1 port/GID query or CQ poll failure.
 */
int verbs_receive(int dev_idx)
{
    struct verbs_recv_res res;
    memset(&res, 0, sizeof(res));
    int rc = verbs_recv_run(&res, dev_idx);
    verbs_recv_release(&res);
    return rc;
}

4. 总控代码

#include <infiniband/verbs.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <unistd.h>

// verbs_receive

// verbs_send

// Prints the two supported command lines (receiver mode, sender mode) to stdout.
void printHelp()
{
    static const char* const usage_lines[] = {
        "./rdma_verbs_example r \n",
        "./rdma_verbs_example s length\n",
    };
    for (size_t k = 0; k < sizeof(usage_lines) / sizeof(usage_lines[0]); ++k)
    {
        fputs(usage_lines[k], stdout);
    }
}

/*
 * Entry point.
 *   <prog> r          run as receiver
 *   <prog> s length   run as sender with `length` payload bytes per message
 * Any first argument that does not start with 's' selects receiver mode.
 *
 * Returns 0 on success, -1 on a usage error, and 1 when the selected
 * sender/receiver routine reports a failure.
 */
int main(int argc, char* argv[])
{
    if (argc < 2)
    {
        printHelp();
        return -1;
    }
    // Adjust to the actual environment: this index is handed to
    // verbs_send/verbs_receive to pick the device.
    int dev_index = 1;
    int is_send = argv[1][0] == 's';
    int rc = 0;
    if (is_send)
    {
        if (argc < 3)
        {
            printHelp();
            return -1;
        }
        unsigned int length = 10240;
        // On a parse failure `length` keeps its default; say so instead of
        // silently ignoring the argument.
        if (sscanf(argv[2], "%u", &length) != 1)
        {
            fprintf(stderr, "invalid length '%s', using default %u\n", argv[2], length);
        }
        rc = verbs_send(dev_index, length);
    }
    else
    {
        rc = verbs_receive(dev_index);
    }
    // Surface failures through the exit status instead of always exiting 0.
    if (rc != 0)
    {
        fprintf(stderr, "%s failed with code %d\n", is_send ? "verbs_send" : "verbs_receive", rc);
        return 1;
    }
    return 0;
}

5. 编译运行

  • 把代码保存到rdma_example.cpp,根据自身的网卡信息,修改main函数中的dev_index;
  • g++ rdma_example.cpp -libverbs -o rdma_example
  • 接收端:./rdma_example r
  • 发送端:./rdma_example s 16384
  • 手动输入对端输出的地址
  • 观察结果

6. 参考资料

  • https://github.com/linux-rdma/rdma-core/tree/master/libibverbs/examples
  • https://docs.nvidia.com/rdma-aware-networks-programming-user-manual-1-7.pdf
  • 3
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值