Learning the User-Space Driver ixy

Introduction to ixy

ixy is a simple user-space packet-processing framework. It takes exclusive control of a network adapter and implements the entire driver in user space. Its architecture resembles DPDK and Snabb, and is completely different from seemingly similar frameworks such as netmap, pfq, pf_ring, and XDP, all of which rely on a kernel component. In fact, reading the DPDK and Snabb drivers proved essential for understanding some parts of the Intel 82599 datasheet.

The crucial difference from netmap, pfq, and pf_ring is exactly that: those frameworks keep part of the data path in the kernel, while ixy needs no kernel component at all.

ixy resources

The ixy Git repository: https://gitee.com/RSPwFPGAs/ixy.git

The paper introducing ixy: User Space Network Drivers (tum.de)

The ixy-fwd example

The call flow of the ixy-fwd example program under the project's app directory is as follows.
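
In outline, reconstructed from the code listed in the rest of this article:

main() → ixy_init() → virtio_init() or ixgbe_init(), selected by PCI vendor/device ID
main() → forward(), called in an endless loop for both directions
forward() → ixy_rx_batch() → dev->rx_batch → ixgbe_rx_batch()
forward() → ixy_tx_batch() → dev->tx_batch → ixgbe_tx_batch()
forward() → pkt_buf_free() for packets the tx queue could not accept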

A walkthrough of the user-space driver initialization code for this PCIe device follows.

ixy code walkthrough

ixy-fwd.c, the file under the app directory that contains main():

const int BATCH_SIZE = 32;

static void forward(struct ixy_device* rx_dev, uint16_t rx_queue, struct ixy_device* tx_dev, uint16_t tx_queue) {
        struct pkt_buf* bufs[BATCH_SIZE];
        uint32_t num_rx = ixy_rx_batch(rx_dev, rx_queue, bufs, BATCH_SIZE);
        if (num_rx > 0) {
                // touch all packets, otherwise it's a completely unrealistic workload if the packet just stays in L3
                for (uint32_t i = 0; i < num_rx; i++) {
                        bufs[i]->data[1]++;
                }
                uint32_t num_tx = ixy_tx_batch(tx_dev, tx_queue, bufs, num_rx);
                // there are two ways to handle the case that packets are not being sent out:
                // either wait on tx or drop them; in this case it's better to drop them, otherwise we accumulate latency
                for (uint32_t i = num_tx; i < num_rx; i++) {
                        pkt_buf_free(bufs[i]);
                }
        }
}

int main(int argc, char* argv[]) {
        if (argc != 3) {
                printf("%s forwards packets between two ports.\n", argv[0]);
                printf("Usage: %s <pci bus id2> <pci bus id1>\n", argv[0]);
                return 1;
        }

        struct ixy_device* dev1 = ixy_init(argv[1], 1, 1, -1);
        struct ixy_device* dev2 = ixy_init(argv[2], 1, 1, 0);

        while (true) {
                forward(dev1, 0, dev2, 0);
                forward(dev2, 0, dev1, 0);
        }                                
}

/* ixy-master/src/app/ixy-fwd.c */
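
forward() increments bufs[i]->data[1], and the drivers below read buf->size and buf->buf_addr_phy, so it helps to know the packet-buffer layout. The following is a minimal sketch of struct pkt_buf based on ixy's memory.h; the exact head-room size and mempool bookkeeping fields are assumptions, but the fields used in the listings are the ones the code above dereferences:

#include <stdint.h>

struct mempool; // managed pool the buffer was allocated from

// assumed: pads the fixed fields so that data[] starts at a 64-byte boundary
#define SIZE_PKT_BUF_HEADROOM 40

// sketch of ixy's packet buffer (cf. memory.h in the repository)
struct pkt_buf {
        // physical address of this buffer, handed to the NIC in descriptors
        uintptr_t buf_addr_phy;
        // owning mempool; pkt_buf_free() returns the buffer here
        struct mempool* mempool;
        uint32_t mempool_idx;
        // length of the packet stored in data[]
        uint32_t size;
        uint8_t head_room[SIZE_PKT_BUF_HEADROOM];
        // packet payload; the NIC DMAs to/from buf_addr_phy + offsetof(struct pkt_buf, data)
        uint8_t data[] __attribute__((aligned(64)));
};

Because the metadata and the payload live in one DMA-able allocation, both ixgbe_rx_batch and ixgbe_tx_batch can compute the payload's DMA address as buf->buf_addr_phy + offsetof(struct pkt_buf, data), as seen later in this article.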

The API functions the app calls are implemented as follows:

struct ixy_device* ixy_init(const char* pci_addr, uint16_t rx_queues, uint16_t tx_queues, int interrupt_timeout) {
        // Read PCI configuration space
        // For VFIO, we could access the config space another way
        // (VFIO_PCI_CONFIG_REGION_INDEX). This is not needed, though, because
        // every config file should be world-readable, and here we
        // only read the vendor and device id.
        int config = pci_open_resource(pci_addr, "config", O_RDONLY);
        uint16_t vendor_id = read_io16(config, 0);
        uint16_t device_id = read_io16(config, 2);
        uint32_t class_id = read_io32(config, 8) >> 24;
        close(config);
        if (class_id != 2) {
                error("Device %s is not a NIC", pci_addr);
        }
        if (vendor_id == 0x1af4 && device_id >= 0x1000) {
                return virtio_init(pci_addr, rx_queues, tx_queues);
        } else {
                // Our best guess is to try ixgbe
                return ixgbe_init(pci_addr, rx_queues, tx_queues, interrupt_timeout);
        }
}

/* ixy-master/src/driver/device.c */
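
pci_open_resource and read_io16/read_io32 are thin wrappers around the device's sysfs config file. A minimal sketch of how they might be implemented, assuming open() and pread() on /sys/bus/pci/devices/<addr>/config with error handling reduced to asserts (the real helpers live in ixy's pci.c):

#include <assert.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

// sketch: open one of a PCI device's sysfs resource files, e.g. "config"
static int pci_open_resource(const char* pci_addr, const char* resource, int flags) {
        char path[PATH_MAX];
        snprintf(path, PATH_MAX, "/sys/bus/pci/devices/%s/%s", pci_addr, resource);
        int fd = open(path, flags);
        assert(fd != -1);
        return fd;
}

// sketch: read a little-endian 16-bit value at a given config-space offset
static uint16_t read_io16(int fd, size_t offset) {
        uint16_t value;
        assert(pread(fd, &value, sizeof(value), offset) == (ssize_t) sizeof(value));
        return value;
}

Offsets 0 and 2 hold the vendor and device IDs defined by the PCI specification, and the 32-bit register at offset 8 carries the class code in its top byte, which is why ixy_init shifts it right by 24 and compares against 2 (network controller).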

// Public stubs that forward the calls to the driver-specific implementations
static inline uint32_t ixy_rx_batch(struct ixy_device* dev, uint16_t queue_id, struct pkt_buf* bufs[], uint32_t num_bufs) {
        return dev->rx_batch(dev, queue_id, bufs, num_bufs);
}

/* ixy-master/src/driver/device.h */
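
device.h contains one such stub per driver operation; the transmit stub mirrors the receive stub and dispatches through the tx_batch pointer that ixgbe_init fills in below (a sketch, reconstructed from that pattern):

// matching tx stub: dispatch through the per-driver function pointer
static inline uint32_t ixy_tx_batch(struct ixy_device* dev, uint16_t queue_id, struct pkt_buf* bufs[], uint32_t num_bufs) {
        return dev->tx_batch(dev, queue_id, bufs, num_bufs);
}

This function-pointer table is what lets one API front both the ixgbe and virtio drivers at the cost of a single indirect call.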

The user-space driver code these API functions dispatch to is shown below:

struct ixy_device* ixgbe_init(const char* pci_addr, uint16_t rx_queues, uint16_t tx_queues, int interrupt_timeout) {
        if (getuid()) {
                warn("Not running as root, this will probably fail");
        }
        if (rx_queues > MAX_QUEUES) {
                error("cannot configure %d rx queues: limit is %d", rx_queues, MAX_QUEUES);
        }
        if (tx_queues > MAX_QUEUES) {
                error("cannot configure %d tx queues: limit is %d", tx_queues, MAX_QUEUES);
        }

        // Allocate memory for the ixgbe device that will be returned
        struct ixgbe_device* dev = (struct ixgbe_device*) malloc(sizeof(struct ixgbe_device));
        dev->ixy.pci_addr = strdup(pci_addr);

        // Check if we want the VFIO stuff
        // This is done by checking if the device is in an IOMMU group.
        char path[PATH_MAX];
        snprintf(path, PATH_MAX, "/sys/bus/pci/devices/%s/iommu_group", pci_addr);
        struct stat buffer;
        dev->ixy.vfio = stat(path, &buffer) == 0;
        if (dev->ixy.vfio) {
                // initialize the IOMMU for this device
                dev->ixy.vfio_fd = vfio_init(pci_addr);
                if (dev->ixy.vfio_fd < 0) {
                        error("could not initialize the IOMMU for device %s", pci_addr);
                }
        }
        dev->ixy.driver_name = driver_name;
        dev->ixy.num_rx_queues = rx_queues;
        dev->ixy.num_tx_queues = tx_queues;
        dev->ixy.rx_batch = ixgbe_rx_batch;
        dev->ixy.tx_batch = ixgbe_tx_batch;
        dev->ixy.read_stats = ixgbe_read_stats;
        dev->ixy.set_promisc = ixgbe_set_promisc;
        dev->ixy.get_link_speed = ixgbe_get_link_speed;
        dev->ixy.get_mac_addr = ixgbe_get_mac_addr;
        dev->ixy.set_mac_addr = ixgbe_set_mac_addr;
        dev->ixy.interrupts.interrupts_enabled = interrupt_timeout != 0;
        // 0x028 (10µs) => 97600 INT/s
        dev->ixy.interrupts.itr_rate = 0x028;
        dev->ixy.interrupts.timeout_ms = interrupt_timeout;

        if (!dev->ixy.vfio && interrupt_timeout != 0) {
                warn("Interrupts requested but VFIO not available: Disabling Interrupts!");
                dev->ixy.interrupts.interrupts_enabled = false;
        }

        // Map BAR0 region
        if (dev->ixy.vfio) {
                debug("mapping BAR0 region via VFIO...");
                dev->addr = vfio_map_region(dev->ixy.vfio_fd, VFIO_PCI_BAR0_REGION_INDEX);
                // initialize interrupts for this device
                setup_interrupts(dev);
        } else {
                debug("mapping BAR0 region via pci file...");
                dev->addr = pci_map_resource(pci_addr);
        }
        dev->rx_queues = calloc(rx_queues, sizeof(struct ixgbe_rx_queue) + sizeof(void*) * MAX_RX_QUEUE_ENTRIES);
        dev->tx_queues = calloc(tx_queues, sizeof(struct ixgbe_tx_queue) + sizeof(void*) * MAX_TX_QUEUE_ENTRIES);
        reset_and_init(dev);
        return &dev->ixy;
}
/* ixy-master/src/driver/ixgbe.c */
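
In the non-VFIO path, pci_map_resource maps BAR0 by mmap-ing the device's sysfs resource0 file. A minimal sketch of that approach, assumed from the standard sysfs interface (ixy's real implementation in pci.c also unbinds any kernel driver and enables DMA in the PCI command register first):

#include <assert.h>
#include <fcntl.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

// sketch: map BAR0 of a PCI device into our address space via sysfs
static uint8_t* pci_map_resource(const char* pci_addr) {
        char path[PATH_MAX];
        snprintf(path, PATH_MAX, "/sys/bus/pci/devices/%s/resource0", pci_addr);
        int fd = open(path, O_RDWR);
        assert(fd != -1);
        struct stat st;
        assert(fstat(fd, &st) == 0);
        // MAP_SHARED: stores must reach the device registers, not a private copy
        uint8_t* bar0 = (uint8_t*) mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        assert(bar0 != MAP_FAILED);
        return bar0;
}

Every register access in the driver is then just a volatile load or store at an offset from this pointer, as sketched at the end of this article.
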
uint32_t ixgbe_rx_batch(struct ixy_device* ixy, uint16_t queue_id, struct pkt_buf* bufs[], uint32_t num_bufs) {
        struct ixgbe_device* dev = IXY_TO_IXGBE(ixy);

        struct interrupt_queues* interrupt = NULL;
        bool interrupts_enabled = ixy->interrupts.interrupts_enabled;

        if (interrupts_enabled) {
                interrupt = &ixy->interrupts.queues[queue_id];
        }

        if (interrupts_enabled && interrupt->interrupt_enabled) {
                vfio_epoll_wait(interrupt->vfio_epoll_fd, 10, dev->ixy.interrupts.timeout_ms);
        }

        struct ixgbe_rx_queue* queue = ((struct ixgbe_rx_queue*) (dev->rx_queues)) + queue_id;
        uint16_t rx_index = queue->rx_index; // rx index we checked in the last run of this function
        uint16_t last_rx_index = rx_index; // index of the descriptor we checked in the last iteration of the loop
        uint32_t buf_index;
        for (buf_index = 0; buf_index < num_bufs; buf_index++) {
                // rx descriptors are explained in 7.1.5
                volatile union ixgbe_adv_rx_desc* desc_ptr = queue->descriptors + rx_index;
                uint32_t status = desc_ptr->wb.upper.status_error;
                if (status & IXGBE_RXDADV_STAT_DD) {
                        if (!(status & IXGBE_RXDADV_STAT_EOP)) {
                                error("multi-segment packets are not supported - increase buffer size or decrease MTU");
                        }
                        // got a packet, read and copy the whole descriptor
                        union ixgbe_adv_rx_desc desc = *desc_ptr;
                        struct pkt_buf* buf = (struct pkt_buf*) queue->virtual_addresses[rx_index];
                        buf->size = desc.wb.upper.length;
                        // this would be the place to implement RX offloading by translating the device-specific flags
                        // to an independent representation in the buf (similar to how DPDK works)
                        // need a new mbuf for the descriptor
                        struct pkt_buf* new_buf = pkt_buf_alloc(queue->mempool);
                        if (!new_buf) {
                                // we could handle empty mempools more gracefully here, but it would be quite messy...
                                // make your mempools large enough
                                error("failed to allocate new mbuf for rx, you are either leaking memory or your mempool is too small");
                        }
                        // reset the descriptor
                        desc_ptr->read.pkt_addr = new_buf->buf_addr_phy + offsetof(struct pkt_buf, data);
                        desc_ptr->read.hdr_addr = 0; // this resets the flags
                        queue->virtual_addresses[rx_index] = new_buf;
                        bufs[buf_index] = buf;
                        // want to read the next one in the next iteration, but we still need the last/current to update RDT later
                        last_rx_index = rx_index;
                        rx_index = wrap_ring(rx_index, queue->num_entries);
                } else {
                        break;
                }
        }
        if (rx_index != last_rx_index) {
                // tell hardware that we are done
                // this is intentionally off by one, otherwise we'd set RDT=RDH if we are receiving faster than packets are coming in
                // RDT=RDH means queue is full
                set_reg32(dev->addr, IXGBE_RDT(queue_id), last_rx_index);
                queue->rx_index = rx_index;
        }

        if (interrupts_enabled) {
                interrupt->rx_pkts += buf_index;

                if ((interrupt->instr_counter++ & 0xFFF) == 0) {
                        bool int_en = interrupt->interrupt_enabled;
                        uint64_t diff = monotonic_time() - interrupt->last_time_checked;
                        if (diff > interrupt->interval) {
                                // every second
                                check_interrupt(interrupt, diff, buf_index, num_bufs);
                        }

                        if (int_en != interrupt->interrupt_enabled) {
                                if (interrupt->interrupt_enabled) {
                                        enable_interrupt(dev, queue_id);
                                } else {
                                        disable_interrupt(dev, queue_id);
                                }
                        }
                }
        }

        return buf_index; // number of packets stored in bufs; buf_index points to the next index
}

/* ixy-master/src/driver/ixgbe.c */
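
Both the rx and tx paths advance ring indices with wrap_ring. ixy defines it as a macro in device.h; a sketch as an inline function, assuming the queue size is a power of two, which is what lets a bit mask replace a modulo:

// sketch: advance a ring index by one with cheap wrap-around;
// requires ring_size to be a power of two so that (x & (n - 1)) == (x % n)
static inline uint16_t wrap_ring(uint16_t index, uint16_t ring_size) {
        return (uint16_t) ((index + 1) & (ring_size - 1));
}

For example, with ring_size = 512, index 511 wraps to 0 while every other index simply increments.
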
// section 1.8.1 and 7.2
// we control the tail, hardware the head
// huge performance gains possible here by sending packets in batches - writing to TDT for every packet is not efficient
// returns the number of packets transmitted, will not block when the queue is full
uint32_t ixgbe_tx_batch(struct ixy_device* ixy, uint16_t queue_id, struct pkt_buf* bufs[], uint32_t num_bufs) {
        struct ixgbe_device* dev = IXY_TO_IXGBE(ixy);
        struct ixgbe_tx_queue* queue = ((struct ixgbe_tx_queue*)(dev->tx_queues)) + queue_id;
        // the descriptor is explained in section 7.2.3.2.4
        // we just use a struct copy & pasted from intel, but it basically has two formats (hence a union):
        // 1. the write-back format which is written by the NIC once sending it is finished this is used in step 1
        // 2. the read format which is read by the NIC and written by us, this is used in step 2

        uint16_t clean_index = queue->clean_index; // next descriptor to clean up

        // step 1: clean up descriptors that were sent out by the hardware and return them to the mempool
        // start by reading step 2 which is done first for each packet
        // cleaning up must be done in batches for performance reasons, so this is unfortunately somewhat complicated
        while (true) {
                // figure out how many descriptors can be cleaned up
                int32_t cleanable = queue->tx_index - clean_index; // tx_index is always ahead of clean (invariant of our queue)
                if (cleanable < 0) { // handle wrap-around
                        cleanable = queue->num_entries + cleanable;
                }
                if (cleanable < TX_CLEAN_BATCH) {
                        break;
                }
                // calculate the index of the last descriptor in the clean batch
                // we can't check all descriptors for performance reasons
                int32_t cleanup_to = clean_index + TX_CLEAN_BATCH - 1;
                if (cleanup_to >= queue->num_entries) {
                        cleanup_to -= queue->num_entries;
                }
                volatile union ixgbe_adv_tx_desc* txd = queue->descriptors + cleanup_to;
                uint32_t status = txd->wb.status;
                // hardware sets this flag as soon as it's sent out, so we can give all bufs in the batch back to the mempool
                if (status & IXGBE_ADVTXD_STAT_DD) {
                        int32_t i = clean_index;
                        while (true) {
                                struct pkt_buf* buf = queue->virtual_addresses[i];
                                pkt_buf_free(buf);
                                if (i == cleanup_to) {
                                        break;
                                }
                                i = wrap_ring(i, queue->num_entries);
                        }
                        // next descriptor to be cleaned up is one after the one we just cleaned
                        clean_index = wrap_ring(cleanup_to, queue->num_entries);
                } else {
                        // clean the whole batch or nothing; yes, this leaves some packets in
                        // the queue forever if you stop transmitting, but that's not a real concern
                        break;
                }
        }
        queue->clean_index = clean_index;

        // step 2: send out as many of our packets as possible
        uint32_t sent;
        for (sent = 0; sent < num_bufs; sent++) {
                uint32_t next_index = wrap_ring(queue->tx_index, queue->num_entries);
                // we are full if the next index is the one we are trying to reclaim
                if (clean_index == next_index) {
                        break;
                }
                struct pkt_buf* buf = bufs[sent];
                // remember virtual address to clean it up later
                queue->virtual_addresses[queue->tx_index] = (void*) buf;
                volatile union ixgbe_adv_tx_desc* txd = queue->descriptors + queue->tx_index;
                queue->tx_index = next_index;
                // NIC reads from here
                txd->read.buffer_addr = buf->buf_addr_phy + offsetof(struct pkt_buf, data);
                // always the same flags: one buffer (EOP), advanced data descriptor, CRC offload, data length
                txd->read.cmd_type_len =
                        IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS | IXGBE_ADVTXD_DCMD_IFCS | IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_DATA | buf->size;
                // no fancy offloading stuff - only the total payload length
                // implement offloading flags here:
                //      * ip checksum offloading is trivial: just set the offset
                //      * tcp/udp checksum offloading is more annoying, you have to precalculate the pseudo-header checksum
                txd->read.olinfo_status = buf->size << IXGBE_ADVTXD_PAYLEN_SHIFT;
        }
        // send out by advancing tail, i.e., pass control of the bufs to the nic
        // this seems like a textbook case for a release memory order, but Intel's driver doesn't even use a compiler barrier here
        set_reg32(dev->addr, IXGBE_TDT(queue_id), queue->tx_index);
        return sent;
}

/* ixy-master/src/driver/ixgbe.c */
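
set_reg32 and its counterpart get_reg32 are the only points where the driver touches hardware registers; they are presumably plain volatile MMIO accesses into the mapped BAR0 region. A sketch modeled on ixy's device.h (the compiler barrier mirrors the memory-ordering comment in ixgbe_tx_batch above):

// sketch: MMIO register access into the BAR0 mapping obtained earlier
static inline void set_reg32(uint8_t* addr, int reg, uint32_t value) {
        __asm__ volatile ("" : : : "memory"); // compiler barrier: don't reorder around register writes
        *((volatile uint32_t*) (addr + reg)) = value;
}

static inline uint32_t get_reg32(const uint8_t* addr, int reg) {
        __asm__ volatile ("" : : : "memory");
        return *((volatile uint32_t*) (addr + reg));
}

The volatile access keeps the compiler from caching or eliding the store; on x86 the strongly ordered memory model makes an explicit fence unnecessary for these register writes, which matches the observation above that Intel's own driver does not use one.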