af_xdp 创建过程分析
- 目前 af_xdp 的库代码已经从内核中移动到了 libxdp
- 以 BPF examples 中的 AF_XDP-example 为例
- 本文使用的内核代码版本为 6.6.35(下载地址)
libxdp
数据结构
umem
// libxdp's user-space bookkeeping for a umem (the user memory region shared with the kernel).
struct xsk_umem {
// xsk_umem__create_with_fd() stores the fill ring and completion ring it creates
// into fill_save/comp_save; xsk_socket__create() later hands them to
// xsk_socket__create_shared(), which resets both to NULL on success.
struct xsk_ring_prod *fill_save;
struct xsk_ring_cons *comp_save;
// Start of the memory area registered as the umem.
char *umem_area;
// Configuration the umem was created with.
struct xsk_umem_config config;
// Socket file descriptor: either newly created, or the one passed in by the caller.
int fd;
// Reference count; xsk_umem__create_with_fd() leaves it at 0.
int refcount;
// List of contexts (ctx). Each (netns_cookie, ifindex, queue_id) combination
// identifies one context; within a context the fill and completion rings are shared.
struct list_head ctx_list;
// Whether the socket behind fd already has an rx ring / tx ring configured.
bool rx_ring_setup_done;
bool tx_ring_setup_done;
};
xsk_ring_prod(生产者队列) 和 xsk_ring_cons (消费者队列)
/* Generates the user-space ring descriptor type used for both producer rings
 * (xsk_ring_prod) and consumer rings (xsk_ring_cons); the pointer fields are
 * later pointed into the mmap'ed kernel ring.
 */
#define DEFINE_XSK_RING(name) \
struct name { \
__u32 cached_prod; /* locally cached producer index */ \
__u32 cached_cons; /* locally cached consumer index (see xsk_prod_nb_free) */ \
__u32 mask; /* size - 1 */ \
__u32 size; \
__u32 *producer; /* -> producer index in the kernel ring */ \
__u32 *consumer; /* -> consumer index in the kernel ring */ \
void *ring; /* -> descriptor array in the kernel ring */ \
__u32 *flags; /* -> flags field in the kernel ring */ \
}
DEFINE_XSK_RING(xsk_ring_prod);
DEFINE_XSK_RING(xsk_ring_cons);
xsk_umem_config
// Tunables used when registering a umem with the kernel.
struct xsk_umem_config {
// Number of entries in the fill ring.
__u32 fill_size;
// Number of entries in the completion ring.
__u32 comp_size;
// Size of each frame (chunk) in the umem.
__u32 frame_size;
// Headroom reserved at the start of each frame; if non-zero, packet data
// is not stored from the very beginning of each frame.
__u32 frame_headroom;
__u32 flags;
};
xsk_ctx
// One context per (netns_cookie, ifindex, queue_id) combination; sockets that
// share a context share its fill and completion rings.
struct xsk_ctx {
struct xsk_ring_prod *fill;
struct xsk_ring_cons *comp;
struct xsk_umem *umem;
__u32 queue_id;
int refcount;
int ifindex;
__u64 netns_cookie;
int xsks_map_fd;
struct list_head list; // linked into umem->ctx_list
struct xdp_program *xdp_prog;
int refcnt_map_fd;
char ifname[IFNAMSIZ];
};
函数
xsk_umem__create
/**
 * xsk_umem__create - create a umem
 * @umem_ptr: on success, set to point at the newly created umem
 * @umem_area: user-space address of the memory backing the umem
 * (allocated by the caller, e.g. with mmap)
 * @size: size in bytes of that memory area
 * @fill: fill ring (creating a umem also creates a fill/completion ring
 * pair; this out-parameter receives the fill ring)
 * @comp: completion ring (out-parameter receiving the completion ring)
 * @usr_config: user configuration (NULL loads the default configuration)
 */
int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
__u64 size, struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *usr_config)
{
// Pass fd = -1 so that xsk_umem__create_with_fd creates a new socket;
// all other arguments are forwarded unchanged.
return xsk_umem__create_with_fd(umem_ptr, -1, umem_area, size,
fill, comp, usr_config);
}
xsk_umem__create_with_fd
/**
 * xsk_umem__create_with_fd - create a umem on a given socket fd
 * @umem_ptr: on success, set to point at the newly created umem
 * @fd: socket fd (if < 0 a new socket is created; otherwise this fd is
 * used as-is and stored in umem->fd)
 * @umem_area: user-space address of the memory backing the umem
 * (allocated by the caller, e.g. with mmap)
 * @size: size in bytes of that memory area
 * @fill: fill ring (out-parameter; a fill/completion ring pair is created
 * together with the umem)
 * @comp: completion ring (out-parameter)
 * @usr_config: user configuration (NULL loads the default configuration)
 */
int xsk_umem__create_with_fd(struct xsk_umem **umem_ptr, int fd,
void *umem_area, __u64 size,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_umem_config *usr_config)
{
// Structure used to register the umem with the kernel.
struct xdp_umem_reg mr;
struct xsk_umem *umem;
int err;
// Argument validation.
if (!umem_area || !umem_ptr || !fill || !comp)
return -EFAULT;
if (!size && !xsk_page_aligned(umem_area))
return -EINVAL;
umem = calloc(1, sizeof(*umem));
if (!umem)
return -ENOMEM;
umem->fd = fd < 0 ? socket(AF_XDP, SOCK_RAW, 0) : fd;
if (umem->fd < 0) {
err = -errno;
goto out_umem_alloc;
}
umem->umem_area = umem_area;
INIT_LIST_HEAD(&umem->ctx_list);
// Copy usr_config into umem->config (a NULL usr_config loads the defaults).
xsk_set_umem_config(&umem->config, usr_config);
memset(&mr, 0, sizeof(mr));
mr.addr = (uintptr_t)umem_area;
mr.len = size;
mr.chunk_size = umem->config.frame_size;
mr.headroom = umem->config.frame_headroom;
mr.flags = umem->config.flags;
// Register the umem with the kernel socket umem->fd
// (the kernel-side behavior is analyzed later in this article).
err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
if (err) {
err = -errno;
goto out_socket;
}
// Create the fill and completion rings on the socket umem->fd,
// using the sizes from umem->config.
err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
if (err)
goto out_socket;
// Stash the newly created fill and completion rings.
umem->fill_save = fill;
umem->comp_save = comp;
// Hand the created umem back to the caller.
*umem_ptr = umem;
return 0;
out_socket:
close(umem->fd);
out_umem_alloc:
free(umem);
return err;
}
xsk_create_umem_rings
/**
 * xsk_create_umem_rings() - create the fill ring and completion ring
 * @umem: only umem->config is consulted
 * @fd: socket fd passed to setsockopt(), i.e. the socket the fill and
 * completion rings are created on
 * @fill: fill ring (out-parameter)
 * @comp: completion ring (out-parameter)
 */
static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp)
{
// Ring field offsets, all relative to the start of the kernel ring structure.
struct xdp_mmap_offsets off;
void *map;
int err;
// Set the fill ring size; the kernel then allocates the fill ring
// structure for the socket behind fd.
err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
&umem->config.fill_size,
sizeof(umem->config.fill_size));
if (err)
return -errno;
// Set the completion ring size; the kernel then allocates the
// completion ring structure for the socket behind fd.
err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
&umem->config.comp_size,
sizeof(umem->config.comp_size));
if (err)
return -errno;
// Query the field offsets of each ring.
err = xsk_get_mmap_offsets(fd, &off);
if (err)
return -errno;
// Map the kernel's fill ring into user space; map now effectively points
// at the start of fd's fill ring structure.
map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
XDP_UMEM_PGOFF_FILL_RING);
if (map == MAP_FAILED)
return -errno;
// Fill in the user-space fill descriptor; the pointers into the kernel
// fill ring are computed from the queried offsets.
fill->mask = umem->config.fill_size - 1;
fill->size = umem->config.fill_size;
fill->producer = map + off.fr.producer;
fill->consumer = map + off.fr.consumer;
fill->flags = map + off.fr.flags;
fill->ring = map + off.fr.desc;
fill->cached_cons = umem->config.fill_size;
// Map the kernel's completion ring into user space; map now effectively
// points at the start of fd's completion ring structure.
map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
XDP_UMEM_PGOFF_COMPLETION_RING);
if (map == MAP_FAILED) {
err = -errno;
goto out_mmap;
}
// Fill in the user-space completion descriptor likewise.
comp->mask = umem->config.comp_size - 1;
comp->size = umem->config.comp_size;
comp->producer = map + off.cr.producer;
comp->consumer = map + off.cr.consumer;
comp->flags = map + off.cr.flags;
comp->ring = map + off.cr.desc;
return 0;
out_mmap:
munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
return err;
}
xsk_socket__create
/**
 * xsk_socket__create - create an AF_XDP socket
 * @xsk_ptr: receives the created socket
 * @ifname: network interface name
 * @queue_id: NIC queue ID
 * @umem: a umem created with xsk_umem__create
 * @rx: rx ring (if non-NULL, an rx ring is created and rx is configured)
 * @tx: tx ring (if non-NULL, a tx ring is created and tx is configured)
 * @usr_config: configuration
 */
int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
__u32 queue_id, struct xsk_umem *umem,
struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
const struct xsk_socket_config *usr_config)
{
if (!umem)
return -EFAULT;
// xsk_socket__create_shared additionally takes a fill ring and a
// completion ring; pass in the ones stashed in the umem.
return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
rx, tx, umem->fill_save,
umem->comp_save, usr_config);
}
xsk_socket__create_shared
/**
 * xsk_socket__create_shared - create an AF_XDP socket (possibly sharing a umem)
 * @xsk_ptr: receives the created socket
 * @ifname: network interface name
 * @queue_id: NIC queue ID
 * @umem: a umem created with xsk_umem__create
 * @rx: rx ring (if non-NULL, an rx ring is created and rx is configured)
 * @tx: tx ring (if non-NULL, a tx ring is created and tx is configured)
 * @fill: must not be NULL when no matching ctx exists yet (needed to create one)
 * @comp: must not be NULL when no matching ctx exists yet (needed to create one)
 * @usr_config: configuration
 */
int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
const char *ifname,
__u32 queue_id, struct xsk_umem *umem,
struct xsk_ring_cons *rx,
struct xsk_ring_prod *tx,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp,
const struct xsk_socket_config *usr_config)
{
bool rx_setup_done = false, tx_setup_done = false;
void *rx_map = NULL, *tx_map = NULL;
struct sockaddr_xdp sxdp = {};
struct xdp_mmap_offsets off;
struct xsk_socket *xsk;
struct xsk_ctx *ctx;
int err, ifindex;
__u64 netns_cookie;
socklen_t optlen;
bool unmap;
if (!umem || !xsk_ptr || !(rx || tx))
return -EFAULT;
xsk = calloc(1, sizeof(*xsk));
if (!xsk)
return -ENOMEM;
// Copy usr_config into xsk->config (NULL loads the defaults);
// this also performs parameter validation.
err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
if (err)
goto out_xsk_alloc;
ifindex = if_nametoindex(ifname);
if (!ifindex) {
err = -errno;
goto out_xsk_alloc;
}
// A freshly created umem has umem->refcount == 0.
if (umem->refcount++ > 0) {
xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
if (xsk->fd < 0) {
err = -errno;
goto out_xsk_alloc;
}
} else {
// refcount was 0: reuse the socket fd already created for the umem.
xsk->fd = umem->fd;
rx_setup_done = umem->rx_ring_setup_done;
tx_setup_done = umem->tx_ring_setup_done;
}
// Fetch the netns cookie identifying the network namespace.
optlen = sizeof(netns_cookie);
err = getsockopt(xsk->fd, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen);
if (err) {
if (errno != ENOPROTOOPT) {
err = -errno;
goto out_socket;
}
netns_cookie = INIT_NS;
}
// Look up the ctx for this (netns_cookie, ifindex, queue_id) combination;
// each such combination needs its own fill/comp ring pair.
// On success the ctx refcount is incremented.
ctx = xsk_get_ctx(umem, netns_cookie, ifindex, queue_id);
if (!ctx) {
// No ctx found: fill and comp must both be non-NULL, since they
// are needed to create (and receive) the new ctx's rings.
if (!fill || !comp) {
err = -EFAULT;
goto out_socket;
}
// Create the ctx; see the later analysis of xsk_create_ctx.
ctx = xsk_create_ctx(xsk, umem, netns_cookie, ifindex, ifname, queue_id,
fill, comp);
if (!ctx) {
err = -ENOMEM;
goto out_socket;
}
}
xsk->ctx = ctx;
// rx non-NULL and rx not set up yet (always false for a newly created
// socket; for the umem's own socket, taken from umem->rx_ring_setup_done).
if (rx && !rx_setup_done) {
// Set the rx ring size.
err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
&xsk->config.rx_size,
sizeof(xsk->config.rx_size));
if (err) {
err = -errno;
goto out_put_ctx;
}
// When using the umem's own socket, record the setup in the umem.
if (xsk->fd == umem->fd)
umem->rx_ring_setup_done = true;
}
// tx non-NULL and tx not set up yet (same logic as for rx above).
if (tx && !tx_setup_done) {
// Set the tx ring size.
err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
&xsk->config.tx_size,
sizeof(xsk->config.tx_size));
if (err) {
err = -errno;
goto out_put_ctx;
}
// When using the umem's own socket, record the setup in the umem.
if (xsk->fd == umem->fd)
umem->tx_ring_setup_done = true;
}
// Query the offsets used below to map the user-space rx and tx rings.
err = xsk_get_mmap_offsets(xsk->fd, &off);
if (err) {
err = -errno;
goto out_put_ctx;
}
// If rx is non-NULL, map the kernel rx ring and set up rx.
if (rx) {
rx_map = mmap(NULL, off.rx.desc +
xsk->config.rx_size * sizeof(struct xdp_desc),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
xsk->fd, XDP_PGOFF_RX_RING);
if (rx_map == MAP_FAILED) {
err = -errno;
goto out_put_ctx;
}
rx->mask = xsk->config.rx_size - 1;
rx->size = xsk->config.rx_size;
rx->producer = rx_map + off.rx.producer;
rx->consumer = rx_map + off.rx.consumer;
rx->flags = rx_map + off.rx.flags;
rx->ring = rx_map + off.rx.desc;
rx->cached_prod = *rx->producer;
rx->cached_cons = *rx->consumer;
}
xsk->rx = rx;
// If tx is non-NULL, map the kernel tx ring and set up tx.
if (tx) {
tx_map = mmap(NULL, off.tx.desc +
xsk->config.tx_size * sizeof(struct xdp_desc),
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
xsk->fd, XDP_PGOFF_TX_RING);
if (tx_map == MAP_FAILED) {
err = -errno;
goto out_mmap_rx;
}
tx->mask = xsk->config.tx_size - 1;
tx->size = xsk->config.tx_size;
tx->producer = tx_map + off.tx.producer;
tx->consumer = tx_map + off.tx.consumer;
tx->flags = tx_map + off.tx.flags;
tx->ring = tx_map + off.tx.desc;
tx->cached_prod = *tx->producer;
/* cached_cons is r->size bigger than the real consumer pointer
 * See xsk_prod_nb_free
 */
tx->cached_cons = *tx->consumer + xsk->config.tx_size;
}
xsk->tx = tx;
sxdp.sxdp_family = PF_XDP;
sxdp.sxdp_ifindex = ctx->ifindex;
sxdp.sxdp_queue_id = ctx->queue_id;
if (umem->refcount > 1) {
sxdp.sxdp_flags |= XDP_SHARED_UMEM;
sxdp.sxdp_shared_umem_fd = umem->fd;
} else {
sxdp.sxdp_flags = xsk->config.bind_flags;
}
// This bind() attaches the previously configured umem (or the shared
// one) plus fill and comp to the socket; until now fill/comp were only
// staged, and it is at this point that the kernel creates the pool.
err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
if (err) {
err = -errno;
goto out_mmap_tx;
}
// Unless program loading is inhibited, load the default XDP program
// automatically (not important here; out of scope for this article).
if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
err = __xsk_setup_xdp_prog(xsk, NULL);
if (err)
goto out_mmap_tx;
}
// Hand the created socket back to the caller.
*xsk_ptr = xsk;
// The stashed fill and comp have been consumed; a later socket that
// still needs rings must create a new ctx.
umem->fill_save = NULL;
umem->comp_save = NULL;
return 0;
out_mmap_tx:
if (tx)
munmap(tx_map, off.tx.desc +
xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
if (rx)
munmap(rx_map, off.rx.desc +
xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
unmap = umem->fill_save != fill;
xsk_put_ctx(ctx, unmap);
out_socket:
if (--umem->refcount)
close(xsk->fd);
out_xsk_alloc:
free(xsk);
return err;
}
xsk_create_ctx
/**
 * xsk_create_ctx - create a new context
 * @xsk: xsk_socket
 * @umem: xsk_umem
 * @netns_cookie: identifies the network namespace
 * @ifindex: NIC index
 * @ifname: NIC name
 * @queue_id: queue ID
 * @fill: fill ring (written to; out-parameter)
 * @comp: comp ring (written to; out-parameter)
 */
static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
struct xsk_umem *umem, __u64 netns_cookie, int ifindex,
const char *ifname, __u32 queue_id,
struct xsk_ring_prod *fill,
struct xsk_ring_cons *comp)
{
struct xsk_ctx *ctx;
int err;
ctx = calloc(1, sizeof(*ctx));
if (!ctx)
return NULL;
// Check whether the stashed fill and comp rings were already consumed.
if (!umem->fill_save) {
// Already consumed: create a fresh fill/comp pair.
err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
if (err) {
free(ctx);
return NULL;
}
} else if (umem->fill_save != fill || umem->comp_save != comp) {
// Not consumed yet: copy the previously stashed rings into the
// caller-provided structures.
memcpy(fill, umem->fill_save, sizeof(*fill));
memcpy(comp, umem->comp_save, sizeof(*comp));
}
ctx->netns_cookie = netns_cookie;
ctx->ifindex = ifindex;
ctx->refcount = 1;
ctx->umem = umem;
ctx->queue_id = queue_id;
memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
ctx->ifname[IFNAMSIZ - 1] = '\0';
ctx->fill = fill;
ctx->comp = comp;
list_add(&ctx->list, &umem->ctx_list);
return ctx;
}
xsk_ring_prod__reserve
/* Reserve nb slots in the producer ring: on success returns nb and stores the
 * index of the first reserved slot in *idx; returns 0 if fewer than nb free
 * slots are available.
 */
XDP_ALWAYS_INLINE __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx)
{
if (xsk_prod_nb_free(prod, nb) < nb)
return 0;
*idx = prod->cached_prod;
prod->cached_prod += nb;
return nb;
}
/* Number of free entries the producer may write. The shared consumer index is
 * only re-read (an acquire load) when the cached view has fewer than nb free
 * entries.
 */
XDP_ALWAYS_INLINE __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
{
__u32 free_entries = r->cached_cons - r->cached_prod;
if (free_entries >= nb)
return free_entries;
/* Refresh the local tail pointer.
 * cached_cons is r->size bigger than the real consumer pointer so
 * that this addition can be avoided in the more frequently
 * executed code that computes free_entries in the beginning of
 * this function. Without this optimization it would have been
 * free_entries = r->cached_prod - r->cached_cons + r->size.
 */
r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE);
r->cached_cons += r->size;
return r->cached_cons - r->cached_prod;
}
Q&A
-
关于 fill 和 comp 的偏移和映射问题?
- 参看后面对内核结构体 xdp_umem_ring 的介绍
-
关于 rx 和 tx 的偏移和映射问题?
- 参看后面对内核结构体 xdp_rxtx_ring 的介绍
-
关于 setsockopt
- 所有的 set 项都只能 set 一次,且 set 只能在 bind 之前执行
- 当 set XDP_RX_RING、XDP_TX_RING、XDP_UMEM_FILL_RING、XDP_UMEM_COMPLETION_RING 时,内核会根据 ring 大小分配内存,创建一个环形队列(后面用户态需要将这个环形队列映射到用户空间来)
- 设置情况
  - XDP_RX_RING 和 XDP_TX_RING 至少要被 set 一个
  - 如果 flag 为 XDP_SHARED_UMEM
    - XDP_UMEM_REG 不要被 set,内核会自动从 sxdp_shared_umem_fd 对应的套接字中找 umem
    - 如果两个 socket 的队列 ID 和网卡 index 都相同,则 XDP_UMEM_FILL_RING 和 XDP_UMEM_COMPLETION_RING 也不要 set
    - 否则,还是需要 set XDP_UMEM_FILL_RING 和 XDP_UMEM_COMPLETION_RING 的(不过在内核代码里我没找到相关的校验,欢迎大家指正)
内核中的xsk
涉及到的代码文件基本位于 net/xdp 目录下
数据结构
socket
/**
 * struct socket - general BSD socket
 * @state: socket state (%SS_CONNECTED, etc)
 * @type: socket type (%SOCK_STREAM, etc)
 * @flags: socket flags (%SOCK_NOSPACE, etc)
 * @ops: protocol specific socket operations
 * @file: File back pointer for gc
 * @sk: internal networking protocol agnostic socket representation
 * @wq: wait queue for several uses
 */
struct socket {
socket_state state;
short type;
unsigned long flags;
struct file *file;
struct sock *sk; // for AF_XDP this points at the sock embedded in an xdp_sock
const struct proto_ops *ops; /* Might change with IPV6_ADDRFORM or MPTCP. */
struct socket_wq wq;
};
xdp_sock
// Kernel-side representation of an AF_XDP socket.
struct xdp_sock {
/* struct sock must be the first member of struct xdp_sock */
struct sock sk;
struct xsk_queue *rx ____cacheline_aligned_in_smp;
struct net_device *dev;
struct xdp_umem *umem;
struct list_head flush_node;
struct xsk_buff_pool *pool;
u16 queue_id;
bool zc; /* zero-copy in use (copied from umem->zc at bind time) */
bool sg; /* scatter-gather / multi-buffer (from XDP_UMEM_SG_FLAG) */
enum {
XSK_READY = 0,
XSK_BOUND,
XSK_UNBOUND,
} state;
struct xsk_queue *tx ____cacheline_aligned_in_smp;
struct list_head tx_list;
/* Protects generic receive. */
spinlock_t rx_lock;
/* Statistics */
u64 rx_dropped;
u64 rx_queue_full;
/* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current
 * packet, the partially built skb is saved here so that packet building can resume in next
 * call of __xsk_generic_xmit().
 */
struct sk_buff *skb;
struct list_head map_list;
/* Protects map_list */
spinlock_t map_list_lock;
/* Protects multiple processes in the control path */
struct mutex mutex;
struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};
xsk_queue
// Kernel-side bookkeeping for one ring; ring points at the shared xdp_ring.
struct xsk_queue {
u32 ring_mask; // nentries - 1
u32 nentries;
u32 cached_prod;
u32 cached_cons;
struct xdp_ring *ring;
u64 invalid_descs; // statistics: invalid descriptors seen
u64 queue_empty_descs; // statistics: times the queue ran empty
size_t ring_vmalloc_size; // size of the vmalloc'ed ring area (checked in xsk_mmap)
};
xdp_ring
相当于父类,其子类包括 xdp_rxtx_ring 和 xdp_umem_ring,用偏移量来获取 desc 数组
// Common ring header shared with user space via mmap; the descriptor array of
// a concrete ring type (xdp_rxtx_ring / xdp_umem_ring) follows it, which is
// why user space locates the fields through the queried offsets.
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
/* Hinder the adjacent cache prefetcher to prefetch the consumer
 * pointer if the producer pointer is touched and vice versa.
 */
u32 pad1 ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
u32 pad2 ____cacheline_aligned_in_smp;
u32 flags;
u32 pad3 ____cacheline_aligned_in_smp;
};
xdp_rxtx_ring
/* Used for the RX and TX queues for packets */
struct xdp_rxtx_ring {
struct xdp_ring ptrs;
struct xdp_desc desc[] ____cacheline_aligned_in_smp; /* flexible descriptor array after the header */
};
xdp_umem_ring
/* Used for the fill and completion queues for buffers */
struct xdp_umem_ring {
struct xdp_ring ptrs;
u64 desc[] ____cacheline_aligned_in_smp; /* one u64 entry per slot after the header */
};
函数
xsk_setsockopt
输入参数
- level
- SOL_XDP
- optname
- XDP_RX_RING
- XDP_TX_RING
- XDP_UMEM_REG
- XDP_UMEM_FILL_RING
- XDP_UMEM_COMPLETION_RING
// Handles the SOL_XDP options that configure rings and register the umem;
// each is only valid while the socket is still XSK_READY, i.e. before bind().
static int xsk_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
int err;
if (level != SOL_XDP)
return -ENOPROTOOPT;
switch (optname) {
case XDP_RX_RING:
case XDP_TX_RING:
{
struct xsk_queue **q;
int entries;
if (optlen < sizeof(entries))
return -EINVAL;
if (copy_from_sockptr(&entries, optval, sizeof(entries)))
return -EFAULT;
mutex_lock(&xs->mutex);
// Only allowed before bind().
if (xs->state != XSK_READY) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
// Allocate the ring with the requested number of entries.
err = xsk_init_queue(entries, q, false);
if (!err && optname == XDP_TX_RING)
/* Tx needs to be explicitly woken up the first time */
xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
mutex_unlock(&xs->mutex);
return err;
}
case XDP_UMEM_REG:
{
size_t mr_size = sizeof(struct xdp_umem_reg);
struct xdp_umem_reg mr = {};
struct xdp_umem *umem;
// Accept both the old (v1) and the current layout of xdp_umem_reg.
if (optlen < sizeof(struct xdp_umem_reg_v1))
return -EINVAL;
else if (optlen < sizeof(mr))
mr_size = sizeof(struct xdp_umem_reg_v1);
if (copy_from_sockptr(&mr, optval, mr_size))
return -EFAULT;
mutex_lock(&xs->mutex);
// A umem may be registered once only, and only before bind().
if (xs->state != XSK_READY || xs->umem) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
umem = xdp_umem_create(&mr);
if (IS_ERR(umem)) {
mutex_unlock(&xs->mutex);
return PTR_ERR(umem);
}
/* Make sure umem is ready before it can be seen by others */
smp_wmb();
WRITE_ONCE(xs->umem, umem);
mutex_unlock(&xs->mutex);
return 0;
}
case XDP_UMEM_FILL_RING:
case XDP_UMEM_COMPLETION_RING:
{
struct xsk_queue **q;
int entries;
if (optlen < sizeof(entries))
return -EINVAL;
if (copy_from_sockptr(&entries, optval, sizeof(entries)))
return -EFAULT;
mutex_lock(&xs->mutex);
if (xs->state != XSK_READY) {
mutex_unlock(&xs->mutex);
return -EBUSY;
}
// fq_tmp/cq_tmp only hold the rings until bind() hands them
// over to the buffer pool.
q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
&xs->cq_tmp;
err = xsk_init_queue(entries, q, true);
mutex_unlock(&xs->mutex);
return err;
}
default:
break;
}
return -ENOPROTOOPT;
}
xsk_getsockopt
输入参数
- level
- SOL_XDP
- optname
- XDP_STATISTICS
- XDP_MMAP_OFFSETS
- XDP_OPTIONS(获取flags)
// Handles the read-only SOL_XDP options: statistics, ring mmap offsets,
// and socket options/flags.
static int xsk_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
int len;
if (level != SOL_XDP)
return -ENOPROTOOPT;
if (get_user(len, optlen))
return -EFAULT;
if (len < 0)
return -EINVAL;
switch (optname) {
case XDP_STATISTICS:
{
struct xdp_statistics stats = {};
bool extra_stats = true;
size_t stats_size;
// Accept both the v1 and the current layout of xdp_statistics.
if (len < sizeof(struct xdp_statistics_v1)) {
return -EINVAL;
} else if (len < sizeof(stats)) {
extra_stats = false;
stats_size = sizeof(struct xdp_statistics_v1);
} else {
stats_size = sizeof(stats);
}
mutex_lock(&xs->mutex);
stats.rx_dropped = xs->rx_dropped;
if (extra_stats) {
stats.rx_ring_full = xs->rx_queue_full;
stats.rx_fill_ring_empty_descs =
xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
} else {
// The v1 layout folds ring-full drops into rx_dropped.
stats.rx_dropped += xs->rx_queue_full;
}
stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
mutex_unlock(&xs->mutex);
if (copy_to_user(optval, &stats, stats_size))
return -EFAULT;
if (put_user(stats_size, optlen))
return -EFAULT;
return 0;
}
case XDP_MMAP_OFFSETS:
{
struct xdp_mmap_offsets off;
struct xdp_mmap_offsets_v1 off_v1;
bool flags_supported = true;
void *to_copy;
if (len < sizeof(off_v1))
return -EINVAL;
else if (len < sizeof(off))
flags_supported = false;
if (flags_supported) {
/* xdp_ring_offset is identical to xdp_ring_offset_v1
 * except for the flags field added to the end.
 */
xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
&off.rx);
xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
&off.tx);
xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
&off.fr);
xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
&off.cr);
// Flags offsets are relative to the start of each ring structure.
off.rx.flags = offsetof(struct xdp_rxtx_ring,
ptrs.flags);
off.tx.flags = offsetof(struct xdp_rxtx_ring,
ptrs.flags);
off.fr.flags = offsetof(struct xdp_umem_ring,
ptrs.flags);
off.cr.flags = offsetof(struct xdp_umem_ring,
ptrs.flags);
len = sizeof(off);
to_copy = &off;
} else {
xsk_enter_rxtx_offsets(&off_v1.rx);
xsk_enter_rxtx_offsets(&off_v1.tx);
xsk_enter_umem_offsets(&off_v1.fr);
xsk_enter_umem_offsets(&off_v1.cr);
len = sizeof(off_v1);
to_copy = &off_v1;
}
if (copy_to_user(optval, to_copy, len))
return -EFAULT;
if (put_user(len, optlen))
return -EFAULT;
return 0;
}
case XDP_OPTIONS:
{
struct xdp_options opts = {};
if (len < sizeof(opts))
return -EINVAL;
mutex_lock(&xs->mutex);
if (xs->zc)
opts.flags |= XDP_OPTIONS_ZEROCOPY;
mutex_unlock(&xs->mutex);
len = sizeof(opts);
if (copy_to_user(optval, &opts, len))
return -EFAULT;
if (put_user(len, optlen))
return -EFAULT;
return 0;
}
default:
break;
}
return -EOPNOTSUPP;
}
xsk_mmap
把 q 中的 ring 映射出来
// Maps the ring selected by the caller's mmap page offset into the VMA.
static int xsk_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma)
{
loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
unsigned long size = vma->vm_end - vma->vm_start;
struct xdp_sock *xs = xdp_sk(sock->sk);
int state = READ_ONCE(xs->state);
struct xsk_queue *q = NULL;
if (state != XSK_READY && state != XSK_BOUND)
return -EBUSY;
// Select the queue by the page offset passed to mmap().
if (offset == XDP_PGOFF_RX_RING) {
q = READ_ONCE(xs->rx);
} else if (offset == XDP_PGOFF_TX_RING) {
q = READ_ONCE(xs->tx);
} else {
/* Matches the smp_wmb() in XDP_UMEM_REG */
smp_rmb();
// Before bind() the fill/completion rings still live in
// fq_tmp/cq_tmp; afterwards they belong to the buffer pool.
if (offset == XDP_UMEM_PGOFF_FILL_RING)
q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
READ_ONCE(xs->pool->fq);
else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
READ_ONCE(xs->pool->cq);
}
if (!q)
return -EINVAL;
/* Matches the smp_wmb() in xsk_init_queue */
smp_rmb();
// The requested mapping must not exceed the allocated ring area.
if (size > q->ring_vmalloc_size)
return -EINVAL;
return remap_vmalloc_range(vma, q->ring, 0);
}
xsk_bind
// Binds an AF_XDP socket to a (device, queue) pair: creates its own buffer
// pool, or shares one via XDP_SHARED_UMEM, and hands the staged fill/comp
// rings over to the pool.
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
struct sock *sk = sock->sk;
struct xdp_sock *xs = xdp_sk(sk);
struct net_device *dev;
int bound_dev_if;
u32 flags, qid;
int err = 0;
if (addr_len < sizeof(struct sockaddr_xdp))
return -EINVAL;
if (sxdp->sxdp_family != AF_XDP)
return -EINVAL;
flags = sxdp->sxdp_flags;
// Reject unknown bind flags.
if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
XDP_USE_NEED_WAKEUP | XDP_USE_SG))
return -EINVAL;
bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
return -EINVAL;
rtnl_lock();
mutex_lock(&xs->mutex);
// A socket can only be bound once.
if (xs->state != XSK_READY) {
err = -EBUSY;
goto out_release;
}
dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
if (!dev) {
err = -ENODEV;
goto out_release;
}
// At least one of the rx and tx rings must have been set up.
if (!xs->rx && !xs->tx) {
err = -EINVAL;
goto out_unlock;
}
qid = sxdp->sxdp_queue_id;
if (flags & XDP_SHARED_UMEM) {
struct xdp_sock *umem_xs;
struct socket *sock;
if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
(flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
/* Cannot specify flags for shared sockets. */
err = -EINVAL;
goto out_unlock;
}
if (xs->umem) {
/* We have already our own. */
err = -EINVAL;
goto out_unlock;
}
// Look up the already-bound socket named by sxdp_shared_umem_fd
// and take the umem from it.
sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
if (IS_ERR(sock)) {
err = PTR_ERR(sock);
goto out_unlock;
}
umem_xs = xdp_sk(sock->sk);
if (!xsk_is_bound(umem_xs)) {
err = -EBADF;
sockfd_put(sock);
goto out_unlock;
}
if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
/* Share the umem with another socket on another qid
 * and/or device.
 */
xs->pool = xp_create_and_assign_umem(xs,
umem_xs->umem);
if (!xs->pool) {
err = -ENOMEM;
sockfd_put(sock);
goto out_unlock;
}
err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
qid);
if (err) {
xp_destroy(xs->pool);
xs->pool = NULL;
sockfd_put(sock);
goto out_unlock;
}
} else {
/* Share the buffer pool with the other socket. */
if (xs->fq_tmp || xs->cq_tmp) {
/* Do not allow setting your own fq or cq. */
err = -EINVAL;
sockfd_put(sock);
goto out_unlock;
}
xp_get_pool(umem_xs->pool);
xs->pool = umem_xs->pool;
/* If underlying shared umem was created without Tx
 * ring, allocate Tx descs array that Tx batching API
 * utilizes
 */
if (xs->tx && !xs->pool->tx_descs) {
err = xp_alloc_tx_descs(xs->pool, xs);
if (err) {
xp_put_pool(xs->pool);
xs->pool = NULL;
sockfd_put(sock);
goto out_unlock;
}
}
}
xdp_get_umem(umem_xs->umem);
WRITE_ONCE(xs->umem, umem_xs->umem);
sockfd_put(sock);
} else if (!xs->umem || !xsk_validate_queues(xs)) {
// Without XDP_SHARED_UMEM, a registered umem (plus the required
// rings, checked by xsk_validate_queues) is mandatory.
err = -EINVAL;
goto out_unlock;
} else {
/* This xsk has its own umem. */
xs->pool = xp_create_and_assign_umem(xs, xs->umem);
if (!xs->pool) {
err = -ENOMEM;
goto out_unlock;
}
err = xp_assign_dev(xs->pool, dev, qid, flags);
if (err) {
xp_destroy(xs->pool);
xs->pool = NULL;
goto out_unlock;
}
}
/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
xs->fq_tmp = NULL;
xs->cq_tmp = NULL;
xs->dev = dev;
xs->zc = xs->umem->zc;
xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
xs->queue_id = qid;
xp_add_xsk(xs->pool, xs);
out_unlock:
if (err) {
dev_put(dev);
} else {
/* Matches smp_rmb() in bind() for shared umem
 * sockets, and xsk_is_bound().
 */
smp_wmb();
WRITE_ONCE(xs->state, XSK_BOUND);
}
out_release:
mutex_unlock(&xs->mutex);
rtnl_unlock();
return err;
}