netmap 收发报文采用 poll 机制,主要函数是 netmap_poll。我们这里只分析收包过程,发包过程类似。
一、netmap_kring
先来看一个重要的结构 netmap_kring:
/*
 * Kernel-side bookkeeping for one netmap ring.
 *
 * NOTE: this is an abridged listing. Code later in this article also reads
 * kring fields that are not shown here (nr_kflags, ring_id, nkr_slot_flags).
 */
struct netmap_kring {
struct netmap_ring *ring; /* the netmap_ring this kring describes */
/*
 * Kernel-side positions. In ixgbe_netmap_rxsync below, nr_hwtail is advanced
 * past the slots just filled from the NIC, and nr_hwcur is set to head once
 * the released buffers have been handed back to the NIC.
 */
uint32_t nr_hwcur;
uint32_t nr_hwtail;
// kernel-side counterparts of the netmap_ring's head, cur and tail
uint32_t rhead;
uint32_t rcur;
uint32_t rtail;
uint32_t nkr_num_slots; // number of slots in the ring
int32_t nkr_hwofs; // offset between the NIC slot index and the netmap slot index
NM_SELINFO_T si; /* wait queue */
NM_LOCK_T q_lock; /* lock protecting the queue */
NM_ATOMIC_T nr_busy; /* guards against concurrent access */
struct netmap_adapter *na; /* adapter that owns this ring */
// callbacks: nm_sync and nm_notify are both invoked from netmap_poll
int (*nm_sync)(struct netmap_kring *kring, int flags);
int (*nm_notify)(struct netmap_kring *kring, int flags);
};
二、Ring 内部布局
TX/RX 内部布局图
            RxRING                               TxRING
      +-----------------+                  +-----------------+
      |                 |                  |                 |
      |XXX free slot XXX|                  |XXX free slot XXX|
      +-----------------+                  +-----------------+
head->| owned by user   |<-hwcur           | not sent to nic |<-hwcur
      |                 |                  | yet             |
      +-----------------+                  |                 |
 cur->| available to    |                  |                 |
      | user, not read  |                  +-----------------+
      | yet             |             cur->| (being          |
      |                 |                  |  prepared)      |
      |                 |                  |                 |
      +-----------------+                  +     ------      +
tail->|                 |<-hwtail          |                 |<-hwlease
      | (being          | ...              |                 | ...
      |  prepared)      | ...              |                 | ...
      +-----------------+ ...              |                 | ...
      |                 |<-hwlease         +-----------------+
      |                 |            tail->|                 |<-hwtail
      |                 |                  |                 |
      |                 |                  |                 |
      |                 |                  |                 |
      +-----------------+                  +-----------------+
三、netmap_poll 代码分析
/*
 * poll() handler for a netmap file descriptor; only the receive path is shown.
 *
 * NOTE: abridged excerpt. The declarations of i, revents, want[], retry_rx,
 * send_down and check_all_rx, the assignment of na, the flush_tx label and
 * the final return statement are not part of this listing.
 *
 * priv   - per-descriptor state; np_qfirst[]/np_qlast[] bound the rings scanned
 * events - poll events requested by the caller
 * sr     - poll/select record handed to nm_os_selrecord() when we must wait
 */
int netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
{
struct netmap_adapter *na;
struct netmap_kring *kring;
struct netmap_ring *ring;
/* shorthand for the RX entry of the want[] array */
#define want_rx want[NR_RX]
struct mbq q;
enum txrx t;
mbq_init(&q);
mb(); /* memory barrier */
if (want_rx) {
want_rx = 0;
t = NR_RX;
// First pass: scan every RX ring looking for a reason to run rxsync.
for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
kring = &NMR(na, t)[i]; /* pick one RX ring */
// A sync is needed if the ring has nothing left to read (cur == tail), or if
// the ring's head differs from the kernel's copy (rhead), i.e. head has moved
// since the kernel last looked at it.
if (kring->ring->cur == kring->ring->tail || kring->rhead != kring->ring->head) {
want_rx = 1;
}
}
if (!want_rx) // no ring needs a sync: unread data is already there, report readable now
revents |= events & (POLLIN | POLLRDNORM);
}
if (want_rx) {
do_retry_rx:
for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
int found = 0;
kring = &na->rx_rings[i];
ring = kring->ring;
if (unlikely(nm_kr_tryget(kring, 1, &revents))) // skip this ring if it cannot be acquired right now
continue;
kring->nr_kflags &= ~NR_FORWARD;
// nm_sync points to na->nm_rxsync; for ixgbe that is ixgbe_netmap_rxsync
if (kring->nm_sync(kring, 0))
revents |= POLLERR;
else
nm_sync_finalize(kring);
send_down |= (kring->nr_kflags & NR_FORWARD);
// rcur != rtail means the ring now holds data
found = kring->rcur != kring->rtail;
nm_kr_put(kring);
if (found) {
revents |= want_rx;
retry_rx = 0; // data was found, so no retry is needed
kring->nm_notify(kring, 0); // nm_notify (covered in the previous article) wakes up the wait queue
}
}
if (retry_rx && sr) {
// nm_os_selrecord ends up calling poll_wait()
nm_os_selrecord(sr, check_all_rx ?
&na->si[NR_RX] : &na->rx_rings[priv->np_qfirst[NR_RX]].si);
}
if (send_down > 0 || retry_rx) {
retry_rx = 0;
if (send_down)
goto flush_tx;
else
goto do_retry_rx; // no ring had data in the pass above: scan the rings again (retry_rx was cleared just above, before the jump)
}
}
}
四、ixgbe_netmap_rxsync 分析
我们来看看收包函数 ixgbe_netmap_rxsync:
/*
 * RX sync callback for ixgbe. Works in two passes:
 *   1. harvest descriptors the NIC has completed, recording length/flags in
 *      the netmap slots and advancing nr_hwtail;
 *   2. give the slots between nr_hwcur and head back to the NIC by
 *      reprogramming the matching RX descriptors, then update the RX tail
 *      register.
 * Always returns 0 in this excerpt.
 *
 * NOTE: abridged excerpt. force_update is not declared here, and neither the
 * flags parameter nor the addr local is used in the lines shown.
 */
static int ixgbe_netmap_rxsync(struct netmap_kring *kring, int flags)
{
struct netmap_adapter *na = kring->na;
struct ifnet *ifp = na->ifp;
struct netmap_ring *ring = kring->ring;
u_int ring_nr = kring->ring_id;
u_int nm_i; // slot index in the netmap ring
u_int nic_i; // slot index in the NIC ring
u_int n;
u_int const lim = kring->nkr_num_slots - 1;
u_int const head = kring->rhead; // kernel's copy of the ring head
struct SOFTC_T *adapter = netdev_priv(ifp); // NIC adapter private data
struct ixgbe_ring *rxr = NM_IXGBE_RX_RING(adapter, ring_nr); // NIC RX ring
rmb(); // read memory barrier
/* Pass 1: import newly received packets. */
if (netmap_no_pendintr || force_update) {
uint16_t slot_flags = kring->nkr_slot_flags;
nic_i = rxr->next_to_clean;
// translate the NIC slot index into the netmap slot index
nm_i = netmap_idx_n2k(kring, nic_i);
for (n = 0; ; n++) {
union ixgbe_adv_rx_desc *curr = NM_IXGBE_RX_DESC(rxr, nic_i); // ixgbe RX descriptor for this slot
uint32_t staterr = le32toh(curr->wb.upper.status_error);
if ((staterr & IXGBE_RXD_STAT_DD) == 0) // DD bit clear: descriptor not completed yet, stop harvesting
break;
ring->slot[nm_i].len = le16toh(curr->wb.upper.length);
// EOP bit clear: mark the slot with NS_MOREFRAG in addition to the default flags
ring->slot[nm_i].flags = (!(staterr & IXGBE_RXD_STAT_EOP) ? NS_MOREFRAG |
slot_flags:slot_flags);
nm_i = nm_next(nm_i, lim); // advance to the next netmap slot
nic_i = nm_next(nic_i, lim); // advance to the next NIC slot
}
if (n) { // at least one packet was harvested: publish the new positions
rxr->next_to_clean = nic_i;
kring->nr_hwtail = nm_i;
}
}
/* Pass 2: hand the slots in [nr_hwcur, head) back to the NIC. */
nm_i = kring->nr_hwcur;
if (nm_i != head) { // there are slots to give back
nic_i = netmap_idx_k2n(kring, nm_i);
for (n = 0; nm_i != head; n++) {
struct netmap_slot *slot = &ring->slot[nm_i];
uint64_t paddr;
void *addr = PNMB(na, slot, &paddr); // look up the slot's buffer; paddr receives its physical address
union ixgbe_adv_rx_desc *curr = NM_IXGBE_RX_DESC(rxr, nic_i);
curr->read.pkt_addr = htole64(paddr); // point the NIC descriptor at the netmap buffer
nm_i = nm_next(nm_i, lim);
nic_i = nm_next(nic_i, lim);
}
kring->nr_hwcur = head;
rxr->next_to_use = nic_i;
wmb(); // write barrier before the tail register write below
// step back by one so that one slot is always left unused, then write the RX tail register
nic_i = nm_prev(nic_i, lim);
IXGBE_WRITE_REG(&adapter->hw, IXGBE_RDT(rxr->reg_idx), nic_i);
}
return 0;
}
现在队列里已经有数据了,接下来看看 netmap 是如何把这些内存映射到用户态的。
五、linux_netmap_mmap
/*
 * mmap() handler for the netmap device on Linux: validates the requested
 * range against the size of the netmap memory region and, when the region is
 * flagged NETMAP_MEM_IO, maps it into the caller's address space.
 *
 * NOTE: abridged excerpt. error is not declared here, some validity checks
 * were removed, and when NETMAP_MEM_IO is not set the lines shown simply
 * return 0 without mapping anything.
 *
 * Returns 0 on success or a negative error code.
 */
static int linux_netmap_mmap(struct file *f, struct vm_area_struct *vma)
{
unsigned long off;
u_int memsize, memflags;
struct netmap_priv_d *priv = f->private_data;
struct netmap_adapter *na = priv->np_na;
mb(); /* memory barrier */
// na->nm_mem points to the global nm_mem allocator
error = netmap_mem_get_info(na->nm_mem, &memsize, &memflags, NULL);
if (error)
return -error; /* negated before returning (presumably a positive errno value - confirm) */
// some validity checks were removed from this excerpt
off = vma->vm_pgoff << PAGE_SHIFT; /* page offset of the mapping converted to bytes */
/* reject a request that would extend past the end of the region */
if (off + (vma->vm_end - vma->vm_start) > memsize)
return -EINVAL;
if (memflags & NETMAP_MEM_IO) {
vm_ooffset_t pa;
pa = netmap_mem_ofstophys(na->nm_mem, 0); /* physical address of offset 0 in the region */
if (pa == 0)
return -EINVAL;
// remap_pfn_range maps kernel memory into user space;
// vma->vm_start is the start of the user-space mapping and vma->vm_end its end
return remap_pfn_range(vma, vma->vm_start,
pa >> PAGE_SHIFT, // physical address converted to a page frame number
vma->vm_end - vma->vm_start, // length of the mapping in bytes
vma->vm_page_prot);
}
return 0;
}
全局的内存分配器 nm_mem:
/*
 * The global netmap memory allocator. It holds three object pools, each
 * configured with a name, minimum/maximum object size and minimum/maximum
 * object count.
 */
struct netmap_mem_d nm_mem = {
.pools = {
[NETMAP_IF_POOL] = { // pool of netmap_if objects
.name = "netmap_if",
.objminsize = sizeof(struct netmap_if),
.objmaxsize = 4096,
.nummin = 10,
.nummax = 10000,
},
[NETMAP_RING_POOL] = { // pool of netmap_ring objects
.name = "netmap_ring",
.objminsize = sizeof(struct netmap_ring),
.objmaxsize = 32*PAGE_SIZE,
.nummin = 2,
.nummax = 1024,
},
[NETMAP_BUF_POOL] = { // pool of packet buffers
.name = "netmap_buf",
.objminsize = 64,
.objmaxsize = 65536,
.nummin = 4,
.nummax = 1000000,
},
},
.nm_id = 1,
.nm_grp = -1,
/* prev and next both point back at nm_mem itself */
.prev = &nm_mem,
.next = &nm_mem,
.ops = &netmap_mem_global_ops
};
用户态如何访问这些内存对象(netmap_if、netmap_ring、netmap_buf)呢?
六、用户态映射过程
简单来看是这样的
/* Simplified user-space setup sequence; error checking is omitted for brevity. */
/* Open read-write: the shared PROT_WRITE mapping below requires a descriptor
 * opened with O_RDWR, otherwise mmap() fails with EACCES. */
fd = open("/dev/netmap", O_RDWR);
strcpy(req.nr_name, "ethX"); /* name of the interface to attach to */
ioctl(fd, NIOCREGIF, &req); /* register the interface; req.nr_memsize and req.nr_offset are read afterwards */
mem = mmap(NULL, req.nr_memsize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
nifp = NETMAP_IF(mem, req.nr_offset); // user space locates struct netmap_if through the offset