Analysis of the AF_XDP creation process

libxdp

Data structures

umem

struct xsk_umem {
    // When xsk_umem__create_with_fd is called, the fill ring and comp ring it creates are stored in fill_save and comp_save
    // When xsk_socket__create is called, fill_save and comp_save are passed on to xsk_socket__create_shared automatically; once xsk_socket__create_shared succeeds, fill_save and comp_save are reset to NULL
	struct xsk_ring_prod *fill_save;
	struct xsk_ring_cons *comp_save;
    // Points to the memory area allocated for the umem
	char *umem_area;
    // Configuration of the umem
	struct xsk_umem_config config;
    // File descriptor of the socket: either a newly created one or the fd that was passed in
	int fd;
    // Reference count; a umem produced by xsk_umem__create_with_fd starts with refcount 0
	int refcount;
    // List of contexts (ctx); each (netns_cookie, ifindex, queue_id) combination identifies one context, and the fill ring and comp ring are shared within a context
	struct list_head ctx_list;
    // Whether the socket behind fd already has an rx_ring or a tx_ring
	bool rx_ring_setup_done;
	bool tx_ring_setup_done;
};

xsk_ring_prod (producer ring) and xsk_ring_cons (consumer ring)

#define DEFINE_XSK_RING(name) \
struct name { \
	__u32 cached_prod; \
	__u32 cached_cons; \
	__u32 mask; \
	__u32 size; \
	__u32 *producer; \
	__u32 *consumer; \
	void *ring; \
	__u32 *flags; \
}

DEFINE_XSK_RING(xsk_ring_prod);
DEFINE_XSK_RING(xsk_ring_cons);
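
In each of these structs, producer and consumer point into the ring header shared with the kernel via mmap, ring points at the descriptor array, and mask (size - 1) wraps a free-running index into a slot. A minimal sketch of how a slot is addressed, assuming the libxdp header <xdp/xsk.h>; this is essentially what helpers such as xsk_ring_prod__fill_addr and xsk_ring_cons__rx_desc do internally:

#include <linux/if_xdp.h>
#include <xdp/xsk.h>

/* Fill/comp ring entries are 64-bit umem addresses; rx/tx ring entries are
 * struct xdp_desc. idx is a free-running index (e.g. returned by
 * xsk_ring_prod__reserve); the mask wraps it into the ring. */
static __u64 *fill_slot(struct xsk_ring_prod *fill, __u32 idx)
{
	__u64 *addrs = fill->ring;

	return &addrs[idx & fill->mask];
}

static const struct xdp_desc *rx_slot(const struct xsk_ring_cons *rx, __u32 idx)
{
	const struct xdp_desc *descs = rx->ring;

	return &descs[idx & rx->mask];
}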

xsk_umem_config

struct xsk_umem_config {
    // Size of the fill ring (number of entries)
	__u32 fill_size;
    // Size of the comp ring (number of entries)
	__u32 comp_size;
    // Frame (chunk) size
	__u32 frame_size;
    // Headroom reserved at the front of each frame; if set, packet data is not stored from the very start of each frame
	__u32 frame_headroom;
	__u32 flags;
};
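
For reference, a config equivalent to passing usr_config as NULL; the XSK_*__DEFAULT_* constants below are the defaults defined in xsk.h (2048-entry rings, 4096-byte frames, no headroom):

#include <xdp/xsk.h>

/* Explicit spelling of the default umem configuration. */
static const struct xsk_umem_config default_umem_cfg = {
	.fill_size      = XSK_RING_PROD__DEFAULT_NUM_DESCS,
	.comp_size      = XSK_RING_CONS__DEFAULT_NUM_DESCS,
	.frame_size     = XSK_UMEM__DEFAULT_FRAME_SIZE,
	.frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM,
	.flags          = XSK_UMEM__DEFAULT_FLAGS,
};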

xsk_ctx

struct xsk_ctx {
	struct xsk_ring_prod *fill;
	struct xsk_ring_cons *comp;
	struct xsk_umem *umem;
	__u32 queue_id;
	int refcount;
	int ifindex;
	__u64 netns_cookie;
	int xsks_map_fd;
	struct list_head list;
	struct xdp_program *xdp_prog;
	int refcnt_map_fd;
	char ifname[IFNAMSIZ];
};

Functions

xsk_umem__create

/**
 * xsk_umem__create - create a umem
 * @umem_ptr: umem pointer (on success it will point to the newly created umem)
 * @umem_area: user-space address of the umem (memory the user allocated, e.g. via mmap)
 * @size: size of the umem area in bytes (size of the memory the user allocated, e.g. via mmap)
 * @fill: fill ring (creating a umem automatically creates a fill ring and a completion ring; output parameter)
 * @comp: completion ring (creating a umem automatically creates a fill ring and a completion ring; output parameter)
 * @usr_config: user configuration (if NULL, the default configuration is loaded)
 */
int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
		     __u64 size, struct xsk_ring_prod *fill,
		     struct xsk_ring_cons *comp,
		     const struct xsk_umem_config *usr_config)
{
	// Pass -1 as the socket fd (xsk_umem__create_with_fd will then create a socket); all other arguments are forwarded unchanged
	return xsk_umem__create_with_fd(umem_ptr, -1, umem_area, size,
					fill, comp, usr_config);
}
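
A minimal usage sketch (NUM_FRAMES and the helper name are illustrative choices made here; the only hard requirement is that the umem area is page aligned):

#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include <xdp/xsk.h>

#define NUM_FRAMES 4096

static int create_umem_example(struct xsk_umem **umem, void **area,
			       struct xsk_ring_prod *fill,
			       struct xsk_ring_cons *comp)
{
	size_t size = (size_t)NUM_FRAMES * XSK_UMEM__DEFAULT_FRAME_SIZE;
	int err;

	/* The umem area must be page aligned. */
	if (posix_memalign(area, getpagesize(), size))
		return -ENOMEM;

	/* NULL config -> defaults; fill/comp receive the newly created rings. */
	err = xsk_umem__create(umem, *area, size, fill, comp, NULL);
	if (err)
		free(*area);
	return err;
}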

xsk_umem__create_with_fd

/**
 * xsk_umem__create_with_fd - create a umem on the given socket fd
 * @umem_ptr: umem pointer (on success it will point to the newly created umem)
 * @fd: socket fd (if < 0, a new socket is created; otherwise the given fd is used and stored in umem.fd)
 * @umem_area: user-space address of the umem (memory the user allocated, e.g. via mmap)
 * @size: size of the umem area in bytes (size of the memory the user allocated, e.g. via mmap)
 * @fill: fill ring (creating a umem automatically creates a fill ring and a completion ring; output parameter)
 * @comp: completion ring (creating a umem automatically creates a fill ring and a completion ring; output parameter)
 * @usr_config: user configuration (if NULL, the default configuration is loaded)
 */
int xsk_umem__create_with_fd(struct xsk_umem **umem_ptr, int fd,
			     void *umem_area, __u64 size,
			     struct xsk_ring_prod *fill,
			     struct xsk_ring_cons *comp,
			     const struct xsk_umem_config *usr_config)
{
	// Structure used to register the umem with the kernel
	struct xdp_umem_reg mr;
	struct xsk_umem *umem;
	int err;
	// Validate the arguments
	if (!umem_area || !umem_ptr || !fill || !comp)
		return -EFAULT;
	if (!size && !xsk_page_aligned(umem_area))
		return -EINVAL;

	umem = calloc(1, sizeof(*umem));
	if (!umem)
		return -ENOMEM;

	umem->fd = fd < 0 ? socket(AF_XDP, SOCK_RAW, 0) : fd;
	if (umem->fd < 0) {
		err = -errno;
		goto out_umem_alloc;
	}

	umem->umem_area = umem_area;
	INIT_LIST_HEAD(&umem->ctx_list);
	// Copy usr_config into umem->config (if usr_config is NULL, load the default configuration into umem->config)
	xsk_set_umem_config(&umem->config, usr_config);

	memset(&mr, 0, sizeof(mr));
	mr.addr = (uintptr_t)umem_area;
	mr.len = size;
	mr.chunk_size = umem->config.frame_size;
	mr.headroom = umem->config.frame_headroom;
	mr.flags = umem->config.flags;

	// Register the umem with the kernel for the socket umem->fd (the kernel-side behaviour is analysed later)
	err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	if (err) {
		err = -errno;
		goto out_socket;
	}
	// Create the fill ring and completion ring on the socket umem->fd, using the information in umem->config
	err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
	if (err)
		goto out_socket;

	// Stash the newly created fill ring and completion ring
	umem->fill_save = fill;
	umem->comp_save = comp;
	// Point the caller's pointer at the created umem
	*umem_ptr = umem;
	return 0;

out_socket:
	close(umem->fd);
out_umem_alloc:
	free(umem);
	return err;
}

xsk_create_umem_rings

/**
 * xsk_create_umem_rings() - create the fill ring and the completion ring
 * @umem: only the information in umem->config is used
 * @fd: socket fd used for setsockopt, i.e. the socket on which the fill ring and completion ring are created
 * @fill: fill ring (output parameter)
 * @comp: completion ring (output parameter)
 */
static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
{
	// Receives the mmap offsets; every offset is relative to the start of the ring structure
	struct xdp_mmap_offsets off;
	void *map;
	int err;
	// Set the fill ring size; afterwards the kernel allocates the fill ring structure for the socket behind fd
	err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
			 &umem->config.fill_size,
			 sizeof(umem->config.fill_size));
	if (err)
		return -errno;
	// Set the completion ring size; afterwards the kernel allocates the completion ring structure for the socket behind fd
	err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
			 &umem->config.comp_size,
			 sizeof(umem->config.comp_size));
	if (err)
		return -errno;
	// Get the offsets of the individual fields within each ring
	err = xsk_get_mmap_offsets(fd, &off);
	if (err)
		return -errno;
	// mmap the fill ring allocated in the kernel into map (map now corresponds to the start address of fd's fill ring structure)
	map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_FILL_RING);
	if (map == MAP_FAILED)
		return -errno;
	// Initialize the user-space fill structure; the parts that refer to the kernel fill ring are set through the offsets
	fill->mask = umem->config.fill_size - 1;
	fill->size = umem->config.fill_size;
	fill->producer = map + off.fr.producer;
	fill->consumer = map + off.fr.consumer;
	fill->flags = map + off.fr.flags;
	fill->ring = map + off.fr.desc;
	fill->cached_cons = umem->config.fill_size;

	// mmap the completion ring allocated in the kernel into map (map now corresponds to the start address of fd's completion ring structure)
	map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
		   PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
		   XDP_UMEM_PGOFF_COMPLETION_RING);
	if (map == MAP_FAILED) {
		err = -errno;
		goto out_mmap;
	}
	// Initialize the user-space comp structure; the parts that refer to the kernel completion ring are set through the offsets
	comp->mask = umem->config.comp_size - 1;
	comp->size = umem->config.comp_size;
	comp->producer = map + off.cr.producer;
	comp->consumer = map + off.cr.consumer;
	comp->flags = map + off.cr.flags;
	comp->ring = map + off.cr.desc;

	return 0;

out_mmap:
	munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
	return err;
}

xsk_socket__create

/**
 * xsk_socket__create - create an AF_XDP socket
 * @xsk_ptr: output parameter receiving the created socket
 * @ifname: network interface name
 * @queue_id: interface queue ID
 * @umem: a umem created with xsk_umem__create
 * @rx: rx ring (if not NULL, an rx ring is created and rx is set up)
 * @tx: tx ring (if not NULL, a tx ring is created and tx is set up)
 * @usr_config: configuration
 */
int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
{
	if (!umem)
		return -EFAULT;
	// xsk_socket__create_shared additionally takes a fill ring and a comp ring; pass in the ones stashed in the umem
	return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
					 rx, tx, umem->fill_save,
					 umem->comp_save, usr_config);
}
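
Continuing the sketch above, creating a socket on queue 0 of an interface (the interface name and ring sizes are illustrative; passing NULL as usr_config would likewise load defaults):

#include <string.h>
#include <xdp/xsk.h>

static int create_socket_example(struct xsk_socket **xsk, struct xsk_umem *umem,
				 struct xsk_ring_cons *rx, struct xsk_ring_prod *tx)
{
	struct xsk_socket_config cfg;

	memset(&cfg, 0, sizeof(cfg));
	cfg.rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
	cfg.tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
	/* cfg.bind_flags could add XDP_COPY, XDP_ZEROCOPY or XDP_USE_NEED_WAKEUP. */

	/* Consumes the fill/comp rings staged in the umem by xsk_umem__create. */
	return xsk_socket__create(xsk, "eth0", 0, umem, rx, tx, &cfg);
}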

xsk_socket__create_shared

/**
 * xsk_socket__create_shared - create an AF_XDP socket
 * @xsk_ptr: output parameter receiving the created socket
 * @ifname: network interface name
 * @queue_id: interface queue ID
 * @umem: a umem created with xsk_umem__create
 * @rx: rx ring (if not NULL, an rx ring is created and rx is set up)
 * @tx: tx ring (if not NULL, a tx ring is created and tx is set up)
 * @fill: must not be NULL if no matching ctx exists yet (needed to create the ctx)
 * @comp: must not be NULL if no matching ctx exists yet (needed to create the ctx)
 * @usr_config: configuration
 */
int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
{
	bool rx_setup_done = false, tx_setup_done = false;
	void *rx_map = NULL, *tx_map = NULL;
	struct sockaddr_xdp sxdp = {};
	struct xdp_mmap_offsets off;
	struct xsk_socket *xsk;
	struct xsk_ctx *ctx;
	int err, ifindex;
	__u64 netns_cookie;
	socklen_t optlen;
	bool unmap;

	if (!umem || !xsk_ptr || !(rx || tx))
		return -EFAULT;

	xsk = calloc(1, sizeof(*xsk));
	if (!xsk)
		return -ENOMEM;
	// Copy usr_config into xsk->config (load defaults if it is NULL); the parameters are also validated here
	err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
	if (err)
		goto out_xsk_alloc;

	ifindex = if_nametoindex(ifname);
	if (!ifindex) {
		err = -errno;
		goto out_xsk_alloc;
	}
	// A umem that has just been created has umem->refcount == 0
	if (umem->refcount++ > 0) {
		xsk->fd = socket(AF_XDP, SOCK_RAW, 0);
		if (xsk->fd < 0) {
			err = -errno;
			goto out_xsk_alloc;
		}
	} else {
		// If the refcount was 0, reuse the socket fd already created for the umem
		xsk->fd = umem->fd;
		rx_setup_done = umem->rx_ring_setup_done;
		tx_setup_done = umem->tx_ring_setup_done;
	}

	// Get the netns_cookie identifying the network namespace
	optlen = sizeof(netns_cookie);
	err = getsockopt(xsk->fd, SOL_SOCKET, SO_NETNS_COOKIE, &netns_cookie, &optlen);
	if (err) {
		if (errno != ENOPROTOOPT) {
			err = -errno;
			goto out_socket;
		}
		netns_cookie = INIT_NS;
	}

	// Look up the ctx; one ctx corresponds to one (netns_cookie, ifindex, queue_id) combination, and each such combination needs its own pair of fill and comp rings
	// If a ctx is found, its refcount is incremented
	ctx = xsk_get_ctx(umem, netns_cookie, ifindex, queue_id);
	if (!ctx) {
		// No ctx was found, so one has to be created; if fill or comp is NULL there is nowhere to put the newly created rings
		if (!fill || !comp) {
			err = -EFAULT;
			goto out_socket;
		}
		// Create a ctx; see the analysis of xsk_create_ctx below
		ctx = xsk_create_ctx(xsk, umem, netns_cookie, ifindex, ifname, queue_id,
				     fill, comp);
		if (!ctx) {
			err = -ENOMEM;
			goto out_socket;
		}
	}
	xsk->ctx = ctx;
	// rx is not NULL and rx_setup_done is false (always false for a freshly created socket; when reusing the umem's socket it reflects umem->rx_ring_setup_done)
	if (rx && !rx_setup_done) {
		// Set the rx ring size
		err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
				 &xsk->config.rx_size,
				 sizeof(xsk->config.rx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		// When the umem's socket is being reused, record umem->rx_ring_setup_done = true
		if (xsk->fd == umem->fd)
			umem->rx_ring_setup_done = true;

	}
	// tx is not NULL and tx_setup_done is false (always false for a freshly created socket; when reusing the umem's socket it reflects umem->tx_ring_setup_done)
	if (tx && !tx_setup_done) {
		// Set the tx ring size
		err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
				 &xsk->config.tx_size,
				 sizeof(xsk->config.tx_size));
		if (err) {
			err = -errno;
			goto out_put_ctx;
		}
		// When the umem's socket is being reused, record umem->tx_ring_setup_done = true
		if (xsk->fd == umem->fd)
			umem->tx_ring_setup_done = true;
	}
	// Get the mmap offsets, used below to map the user-space rx ring and tx ring
	err = xsk_get_mmap_offsets(xsk->fd, &off);
	if (err) {
		err = -errno;
		goto out_put_ctx;
	}
	// If rx is not NULL, mmap the rx ring and set up rx
	if (rx) {
		rx_map = mmap(NULL, off.rx.desc +
			      xsk->config.rx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_RX_RING);
		if (rx_map == MAP_FAILED) {
			err = -errno;
			goto out_put_ctx;
		}

		rx->mask = xsk->config.rx_size - 1;
		rx->size = xsk->config.rx_size;
		rx->producer = rx_map + off.rx.producer;
		rx->consumer = rx_map + off.rx.consumer;
		rx->flags = rx_map + off.rx.flags;
		rx->ring = rx_map + off.rx.desc;
		rx->cached_prod = *rx->producer;
		rx->cached_cons = *rx->consumer;
	}
	xsk->rx = rx;
	// If tx is not NULL, mmap the tx ring and set up tx
	if (tx) {
		tx_map = mmap(NULL, off.tx.desc +
			      xsk->config.tx_size * sizeof(struct xdp_desc),
			      PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			      xsk->fd, XDP_PGOFF_TX_RING);
		if (tx_map == MAP_FAILED) {
			err = -errno;
			goto out_mmap_rx;
		}

		tx->mask = xsk->config.tx_size - 1;
		tx->size = xsk->config.tx_size;
		tx->producer = tx_map + off.tx.producer;
		tx->consumer = tx_map + off.tx.consumer;
		tx->flags = tx_map + off.tx.flags;
		tx->ring = tx_map + off.tx.desc;
		tx->cached_prod = *tx->producer;
		/* cached_cons is r->size bigger than the real consumer pointer
		 * See xsk_prod_nb_free
		 */
		tx->cached_cons = *tx->consumer + xsk->config.tx_size;
	}
	xsk->tx = tx;

	sxdp.sxdp_family = PF_XDP;
	sxdp.sxdp_ifindex = ctx->ifindex;
	sxdp.sxdp_queue_id = ctx->queue_id;
	if (umem->refcount > 1) {
		sxdp.sxdp_flags |= XDP_SHARED_UMEM;
		sxdp.sxdp_shared_umem_fd = umem->fd;
	} else {
		sxdp.sxdp_flags = xsk->config.bind_flags;
	}
	// This bind can be understood as binding the previously configured umem (or the shared one), fill and comp to the socket (until now fill and comp were only stashed; the kernel actually creates the buffer pool at this point)
	err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
	if (err) {
		err = -errno;
		goto out_mmap_tx;
	}
	// Unless program loading is inhibited, automatically load the default XDP program (not essential here and not covered in this article)
	if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
		err = __xsk_setup_xdp_prog(xsk, NULL);
		if (err)
			goto out_mmap_tx;
	}
	// Point the caller's pointer at the created socket
	*xsk_ptr = xsk;
	// The stashed fill and comp have now been consumed; any further rings have to come from a new ctx
	umem->fill_save = NULL;
	umem->comp_save = NULL;
	return 0;

out_mmap_tx:
	if (tx)
		munmap(tx_map, off.tx.desc +
		       xsk->config.tx_size * sizeof(struct xdp_desc));
out_mmap_rx:
	if (rx)
		munmap(rx_map, off.rx.desc +
		       xsk->config.rx_size * sizeof(struct xdp_desc));
out_put_ctx:
	unmap = umem->fill_save != fill;
	xsk_put_ctx(ctx, unmap);
out_socket:
	if (--umem->refcount)
		close(xsk->fd);
out_xsk_alloc:
	free(xsk);
	return err;
}

xsk_create_ctx

/**
 * xsk_create_ctx - create a new context
 * @xsk: xsk_socket
 * @umem: xsk_umem
 * @netns_cookie: identifies the network namespace
 * @ifindex: interface index
 * @ifname: interface name
 * @queue_id: queue ID
 * @fill: fill ring (will be written to; output parameter)
 * @comp: comp ring (will be written to; output parameter)
 */
static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, __u64 netns_cookie, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
{
	struct xsk_ctx *ctx;
	int err;

	ctx = calloc(1, sizeof(*ctx));
	if (!ctx)
		return NULL;
	// Check whether the stashed fill and comp have already been consumed
	if (!umem->fill_save) {
		// Already consumed: create a new fill ring and comp ring
		err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
		if (err) {
			free(ctx);
			return NULL;
		}
	} else if (umem->fill_save != fill || umem->comp_save != comp) {
		// Not consumed yet: copy the previously stashed rings into the caller's structs
		memcpy(fill, umem->fill_save, sizeof(*fill));
		memcpy(comp, umem->comp_save, sizeof(*comp));
	}

	ctx->netns_cookie = netns_cookie;
	ctx->ifindex = ifindex;
	ctx->refcount = 1;
	ctx->umem = umem;
	ctx->queue_id = queue_id;
	memcpy(ctx->ifname, ifname, IFNAMSIZ - 1);
	ctx->ifname[IFNAMSIZ - 1] = '\0';

	ctx->fill = fill;
	ctx->comp = comp;
	list_add(&ctx->list, &umem->ctx_list);
	return ctx;
}

xsk_ring_prod__reserve

XDP_ALWAYS_INLINE __u32 xsk_ring_prod__reserve(struct xsk_ring_prod *prod, __u32 nb, __u32 *idx)
{
	if (xsk_prod_nb_free(prod, nb) < nb)
		return 0;

	*idx = prod->cached_prod;
	prod->cached_prod += nb;

	return nb;
}
XDP_ALWAYS_INLINE __u32 xsk_prod_nb_free(struct xsk_ring_prod *r, __u32 nb)
{
	__u32 free_entries = r->cached_cons - r->cached_prod;

	if (free_entries >= nb)
		return free_entries;

	/* Refresh the local tail pointer.
	 * cached_cons is r->size bigger than the real consumer pointer so
	 * that this addition can be avoided in the more frequently
	 * executed code that computes free_entries in the beginning of
	 * this function. Without this optimization it would have been
	 * free_entries = r->cached_prod - r->cached_cons + r->size.
	 */
	r->cached_cons = __atomic_load_n(r->consumer, __ATOMIC_ACQUIRE);
	r->cached_cons += r->size;

	return r->cached_cons - r->cached_prod;
}
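
The typical producer-side pattern built on these helpers is reserve, write, submit. A sketch that refills the fill ring with free frame addresses (frame_addrs and n are assumed to come from the application's own frame accounting):

#include <xdp/xsk.h>

/* Hand n free frames (identified by their umem-relative addresses) back to the
 * kernel via the fill ring. Returns the number of frames actually queued. */
static __u32 refill_fill_ring(struct xsk_ring_prod *fill,
			      const __u64 *frame_addrs, __u32 n)
{
	__u32 idx, i;

	/* Reserve n slots; returns 0 if fewer than n are free. */
	if (xsk_ring_prod__reserve(fill, n, &idx) != n)
		return 0;

	for (i = 0; i < n; i++)
		*xsk_ring_prod__fill_addr(fill, idx + i) = frame_addrs[i];

	/* Publish the entries: this is what advances *fill->producer. */
	xsk_ring_prod__submit(fill, n);
	return n;
}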

Q&A

  • How are fill and comp mapped, and what are their offsets about?

    • See the description of the kernel structure xdp_umem_ring below
  • How are rx and tx mapped, and what are their offsets about?

    • See the description of the kernel structure xdp_rxtx_ring below
  • About setsockopt (a raw-syscall sketch of the full sequence follows this list)

    • Each option can be set only once, and all of them must be set before bind
    • When XDP_RX_RING, XDP_TX_RING, XDP_UMEM_FILL_RING or XDP_UMEM_COMPLETION_RING is set, the kernel allocates memory according to the ring size and creates a ring buffer (which user space later has to mmap into its own address space)
    • Which options must be set
      • At least one of XDP_RX_RING and XDP_TX_RING must be set
      • If the XDP_SHARED_UMEM flag is used
        • Do not set XDP_UMEM_REG; the umem is taken automatically from the socket referenced by sxdp_shared_umem_fd
          • If the two sockets use the same queue ID and the same interface index
            • Do not set XDP_UMEM_FILL_RING or XDP_UMEM_COMPLETION_RING either
          • Otherwise, XDP_UMEM_FILL_RING and XDP_UMEM_COMPLETION_RING still have to be set (although I could not find the corresponding check in the kernel code; corrections are welcome)
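
To make these ordering rules concrete, here is a condensed sketch of the raw syscall sequence for a socket with its own umem, mirroring what the libxdp functions above perform (error handling omitted; the ring size and chunk size are illustrative):

#include <stdint.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>
#include <linux/if_xdp.h>

/* Condensed order of operations for a non-shared socket; error handling omitted. */
static void setup_order_sketch(void *umem_area, __u64 umem_size, int ifindex)
{
	int fd = socket(AF_XDP, SOCK_RAW, 0);
	struct xdp_umem_reg mr = {
		.addr = (__u64)(uintptr_t)umem_area,
		.len = umem_size,
		.chunk_size = 4096,
	};
	int ring_size = 2048;
	struct sockaddr_xdp sxdp = {
		.sxdp_family = AF_XDP,
		.sxdp_ifindex = ifindex,
		.sxdp_queue_id = 0,
	};

	/* 1. Register the umem and size the rings: each option at most once,
	 *    all of it before bind. */
	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_size, sizeof(ring_size));
	setsockopt(fd, SOL_XDP, XDP_TX_RING, &ring_size, sizeof(ring_size));

	/* 2. mmap each ring using the offsets from XDP_MMAP_OFFSETS and the
	 *    XDP_PGOFF_RX_RING / XDP_UMEM_PGOFF_FILL_RING page offsets (omitted here). */

	/* 3. bind: this is where the kernel creates the buffer pool. */
	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
}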

xsk in the kernel

The code involved lives mostly under the net/xdp directory of the kernel tree

Data structures

socket

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
 *  @type: socket type (%SOCK_STREAM, etc)
 *  @flags: socket flags (%SOCK_NOSPACE, etc)
 *  @ops: protocol specific socket operations
 *  @file: File back pointer for gc
 *  @sk: internal networking protocol agnostic socket representation
 *  @wq: wait queue for several uses
 */
struct socket {
	socket_state		state;

	short			type;

	unsigned long		flags;

	struct file		*file;
	struct sock		*sk; // here, sk can point to an xdp_sock
	const struct proto_ops	*ops; /* Might change with IPV6_ADDRFORM or MPTCP. */

	struct socket_wq	wq;
};

xdp_sock

struct xdp_sock {
	/* struct sock must be the first member of struct xdp_sock */
	struct sock sk;
	struct xsk_queue *rx ____cacheline_aligned_in_smp;
	struct net_device *dev;
	struct xdp_umem *umem;
	struct list_head flush_node;
	struct xsk_buff_pool *pool;
	u16 queue_id;
	bool zc;
	bool sg;
	enum {
		XSK_READY = 0,
		XSK_BOUND,
		XSK_UNBOUND,
	} state;

	struct xsk_queue *tx ____cacheline_aligned_in_smp;
	struct list_head tx_list;
	/* Protects generic receive. */
	spinlock_t rx_lock;

	/* Statistics */
	u64 rx_dropped;
	u64 rx_queue_full;

	/* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current
	 * packet, the partially built skb is saved here so that packet building can resume in next
	 * call of __xsk_generic_xmit().
	 */
	struct sk_buff *skb;

	struct list_head map_list;
	/* Protects map_list */
	spinlock_t map_list_lock;
	/* Protects multiple processes in the control path */
	struct mutex mutex;
	struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */
	struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */
};

xsk_queue

struct xsk_queue {
	u32 ring_mask;
	u32 nentries;
	u32 cached_prod;
	u32 cached_cons;
	struct xdp_ring *ring;
	u64 invalid_descs;
	u64 queue_empty_descs;
	size_t ring_vmalloc_size;
};

xdp_ring

Its "subclasses" are xdp_rxtx_ring and xdp_umem_ring; the desc array is reached via offsets (see the sketch after these two struct definitions)

struct xdp_ring {
	u32 producer ____cacheline_aligned_in_smp;
	/* Hinder the adjacent cache prefetcher to prefetch the consumer
	 * pointer if the producer pointer is touched and vice versa.
	 */
	u32 pad1 ____cacheline_aligned_in_smp;
	u32 consumer ____cacheline_aligned_in_smp;
	u32 pad2 ____cacheline_aligned_in_smp;
	u32 flags;
	u32 pad3 ____cacheline_aligned_in_smp;
};

xdp_rxtx_ring

/* Used for the RX and TX queues for packets */
struct xdp_rxtx_ring {
	struct xdp_ring ptrs;
	struct xdp_desc desc[] ____cacheline_aligned_in_smp;
};

xdp_umem_ring

/* Used for the fill and completion queues for buffers */
struct xdp_umem_ring {
	struct xdp_ring ptrs;
	u64 desc[] ____cacheline_aligned_in_smp;
};
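
The offsets that xsk_getsockopt reports for XDP_MMAP_OFFSETS (below) are offsetof() values into these two structs, which is why a single mmap per ring lets user space reach producer, consumer, flags and the desc array. From memory, the xsk_enter_rxtx_offsets / xsk_enter_umem_offsets helpers called there look essentially like this (consult net/xdp/xsk.c for the authoritative version):

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
	ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
	ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
	ring->desc = offsetof(struct xdp_umem_ring, desc);
}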

Functions

xsk_setsockopt

Input parameters

  • level
    • SOL_XDP
  • optname
    • XDP_RX_RING
    • XDP_TX_RING
    • XDP_UMEM_REG
    • XDP_UMEM_FILL_RING
    • XDP_UMEM_COMPLETION_RING
static int xsk_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int err;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	switch (optname) {
	case XDP_RX_RING:
	case XDP_TX_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}
		q = (optname == XDP_TX_RING) ? &xs->tx : &xs->rx;
		err = xsk_init_queue(entries, q, false);
		if (!err && optname == XDP_TX_RING)
			/* Tx needs to be explicitly woken up the first time */
			xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
		mutex_unlock(&xs->mutex);
		return err;
	}
	case XDP_UMEM_REG:
	{
		size_t mr_size = sizeof(struct xdp_umem_reg);
		struct xdp_umem_reg mr = {};
		struct xdp_umem *umem;

		if (optlen < sizeof(struct xdp_umem_reg_v1))
			return -EINVAL;
		else if (optlen < sizeof(mr))
			mr_size = sizeof(struct xdp_umem_reg_v1);

		if (copy_from_sockptr(&mr, optval, mr_size))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY || xs->umem) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		umem = xdp_umem_create(&mr);
		if (IS_ERR(umem)) {
			mutex_unlock(&xs->mutex);
			return PTR_ERR(umem);
		}

		/* Make sure umem is ready before it can be seen by others */
		smp_wmb();
		WRITE_ONCE(xs->umem, umem);
		mutex_unlock(&xs->mutex);
		return 0;
	}
	case XDP_UMEM_FILL_RING:
	case XDP_UMEM_COMPLETION_RING:
	{
		struct xsk_queue **q;
		int entries;

		if (optlen < sizeof(entries))
			return -EINVAL;
		if (copy_from_sockptr(&entries, optval, sizeof(entries)))
			return -EFAULT;

		mutex_lock(&xs->mutex);
		if (xs->state != XSK_READY) {
			mutex_unlock(&xs->mutex);
			return -EBUSY;
		}

		q = (optname == XDP_UMEM_FILL_RING) ? &xs->fq_tmp :
			&xs->cq_tmp;
		err = xsk_init_queue(entries, q, true);
		mutex_unlock(&xs->mutex);
		return err;
	}
	default:
		break;
	}

	return -ENOPROTOOPT;
}

xsk_getsockopt

Input parameters

  • level
    • SOL_XDP
  • optname
    • XDP_STATISTICS
    • XDP_MMAP_OFFSETS
    • XDP_OPTIONS (get the flags)
static int xsk_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	int len;

	if (level != SOL_XDP)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case XDP_STATISTICS:
	{
		struct xdp_statistics stats = {};
		bool extra_stats = true;
		size_t stats_size;

		if (len < sizeof(struct xdp_statistics_v1)) {
			return -EINVAL;
		} else if (len < sizeof(stats)) {
			extra_stats = false;
			stats_size = sizeof(struct xdp_statistics_v1);
		} else {
			stats_size = sizeof(stats);
		}

		mutex_lock(&xs->mutex);
		stats.rx_dropped = xs->rx_dropped;
		if (extra_stats) {
			stats.rx_ring_full = xs->rx_queue_full;
			stats.rx_fill_ring_empty_descs =
				xs->pool ? xskq_nb_queue_empty_descs(xs->pool->fq) : 0;
			stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
		} else {
			stats.rx_dropped += xs->rx_queue_full;
		}
		stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
		stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
		mutex_unlock(&xs->mutex);

		if (copy_to_user(optval, &stats, stats_size))
			return -EFAULT;
		if (put_user(stats_size, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_MMAP_OFFSETS:
	{
		struct xdp_mmap_offsets off;
		struct xdp_mmap_offsets_v1 off_v1;
		bool flags_supported = true;
		void *to_copy;

		if (len < sizeof(off_v1))
			return -EINVAL;
		else if (len < sizeof(off))
			flags_supported = false;

		if (flags_supported) {
			/* xdp_ring_offset is identical to xdp_ring_offset_v1
			 * except for the flags field added to the end.
			 */
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.rx);
			xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
					       &off.tx);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.fr);
			xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
					       &off.cr);
			off.rx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.tx.flags = offsetof(struct xdp_rxtx_ring,
						ptrs.flags);
			off.fr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);
			off.cr.flags = offsetof(struct xdp_umem_ring,
						ptrs.flags);

			len = sizeof(off);
			to_copy = &off;
		} else {
			xsk_enter_rxtx_offsets(&off_v1.rx);
			xsk_enter_rxtx_offsets(&off_v1.tx);
			xsk_enter_umem_offsets(&off_v1.fr);
			xsk_enter_umem_offsets(&off_v1.cr);

			len = sizeof(off_v1);
			to_copy = &off_v1;
		}

		if (copy_to_user(optval, to_copy, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	case XDP_OPTIONS:
	{
		struct xdp_options opts = {};

		if (len < sizeof(opts))
			return -EINVAL;

		mutex_lock(&xs->mutex);
		if (xs->zc)
			opts.flags |= XDP_OPTIONS_ZEROCOPY;
		mutex_unlock(&xs->mutex);

		len = sizeof(opts);
		if (copy_to_user(optval, &opts, len))
			return -EFAULT;
		if (put_user(len, optlen))
			return -EFAULT;

		return 0;
	}
	default:
		break;
	}

	return -EOPNOTSUPP;
}

xsk_mmap

Maps the ring held in q into user space

static int xsk_mmap(struct file *file, struct socket *sock,
		    struct vm_area_struct *vma)
{
	loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
	unsigned long size = vma->vm_end - vma->vm_start;
	struct xdp_sock *xs = xdp_sk(sock->sk);
	int state = READ_ONCE(xs->state);
	struct xsk_queue *q = NULL;

	if (state != XSK_READY && state != XSK_BOUND)
		return -EBUSY;

	if (offset == XDP_PGOFF_RX_RING) {
		q = READ_ONCE(xs->rx);
	} else if (offset == XDP_PGOFF_TX_RING) {
		q = READ_ONCE(xs->tx);
	} else {
		/* Matches the smp_wmb() in XDP_UMEM_REG */
		smp_rmb();
		if (offset == XDP_UMEM_PGOFF_FILL_RING)
			q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
						 READ_ONCE(xs->pool->fq);
		else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
			q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
						 READ_ONCE(xs->pool->cq);
	}

	if (!q)
		return -EINVAL;

	/* Matches the smp_wmb() in xsk_init_queue */
	smp_rmb();
	if (size > q->ring_vmalloc_size)
		return -EINVAL;

	return remap_vmalloc_range(vma, q->ring, 0);
}

xsk_bind

static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
	struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
	struct sock *sk = sock->sk;
	struct xdp_sock *xs = xdp_sk(sk);
	struct net_device *dev;
	int bound_dev_if;
	u32 flags, qid;
	int err = 0;

	if (addr_len < sizeof(struct sockaddr_xdp))
		return -EINVAL;
	if (sxdp->sxdp_family != AF_XDP)
		return -EINVAL;

	flags = sxdp->sxdp_flags;
	if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
		      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
		return -EINVAL;

	bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
		return -EINVAL;

	rtnl_lock();
	mutex_lock(&xs->mutex);
	if (xs->state != XSK_READY) {
		err = -EBUSY;
		goto out_release;
	}

	dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
	if (!dev) {
		err = -ENODEV;
		goto out_release;
	}

	if (!xs->rx && !xs->tx) {
		err = -EINVAL;
		goto out_unlock;
	}

	qid = sxdp->sxdp_queue_id;

	if (flags & XDP_SHARED_UMEM) {
		struct xdp_sock *umem_xs;
		struct socket *sock;

		if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
		    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
			/* Cannot specify flags for shared sockets. */
			err = -EINVAL;
			goto out_unlock;
		}

		if (xs->umem) {
			/* We have already our own. */
			err = -EINVAL;
			goto out_unlock;
		}

		sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
		if (IS_ERR(sock)) {
			err = PTR_ERR(sock);
			goto out_unlock;
		}

		umem_xs = xdp_sk(sock->sk);
		if (!xsk_is_bound(umem_xs)) {
			err = -EBADF;
			sockfd_put(sock);
			goto out_unlock;
		}

		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
			/* Share the umem with another socket on another qid
			 * and/or device.
			 */
			xs->pool = xp_create_and_assign_umem(xs,
							     umem_xs->umem);
			if (!xs->pool) {
				err = -ENOMEM;
				sockfd_put(sock);
				goto out_unlock;
			}

			err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
						   qid);
			if (err) {
				xp_destroy(xs->pool);
				xs->pool = NULL;
				sockfd_put(sock);
				goto out_unlock;
			}
		} else {
			/* Share the buffer pool with the other socket. */
			if (xs->fq_tmp || xs->cq_tmp) {
				/* Do not allow setting your own fq or cq. */
				err = -EINVAL;
				sockfd_put(sock);
				goto out_unlock;
			}

			xp_get_pool(umem_xs->pool);
			xs->pool = umem_xs->pool;

			/* If underlying shared umem was created without Tx
			 * ring, allocate Tx descs array that Tx batching API
			 * utilizes
			 */
			if (xs->tx && !xs->pool->tx_descs) {
				err = xp_alloc_tx_descs(xs->pool, xs);
				if (err) {
					xp_put_pool(xs->pool);
					xs->pool = NULL;
					sockfd_put(sock);
					goto out_unlock;
				}
			}
		}

		xdp_get_umem(umem_xs->umem);
		WRITE_ONCE(xs->umem, umem_xs->umem);
		sockfd_put(sock);
	} else if (!xs->umem || !xsk_validate_queues(xs)) {
		err = -EINVAL;
		goto out_unlock;
	} else {
		/* This xsk has its own umem. */
		xs->pool = xp_create_and_assign_umem(xs, xs->umem);
		if (!xs->pool) {
			err = -ENOMEM;
			goto out_unlock;
		}

		err = xp_assign_dev(xs->pool, dev, qid, flags);
		if (err) {
			xp_destroy(xs->pool);
			xs->pool = NULL;
			goto out_unlock;
		}
	}

	/* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
	xs->fq_tmp = NULL;
	xs->cq_tmp = NULL;

	xs->dev = dev;
	xs->zc = xs->umem->zc;
	xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
	xs->queue_id = qid;
	xp_add_xsk(xs->pool, xs);

out_unlock:
	if (err) {
		dev_put(dev);
	} else {
		/* Matches smp_rmb() in bind() for shared umem
		 * sockets, and xsk_is_bound().
		 */
		smp_wmb();
		WRITE_ONCE(xs->state, XSK_BOUND);
	}
out_release:
	mutex_unlock(&xs->mutex);
	rtnl_unlock();
	return err;
}
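
Tying this back to the libxdp side: a sketch of adding a second socket that shares the umem but serves a different queue. Because the (netns_cookie, ifindex, queue_id) combination differs, a fresh fill/comp pair must be supplied so that a new ctx is created in user space and, in the kernel, a new buffer pool via xp_create_and_assign_umem + xp_assign_dev_shared. The interface name and queue number are illustrative:

#include <xdp/xsk.h>

/* Second socket on the same umem, but on queue 1: because the queue differs,
 * xsk_socket__create_shared needs a new fill/comp pair for the new ctx. */
static int add_shared_socket_example(struct xsk_socket **xsk2, struct xsk_umem *umem,
				     struct xsk_ring_cons *rx2, struct xsk_ring_prod *tx2,
				     struct xsk_ring_prod *fill2, struct xsk_ring_cons *comp2)
{
	return xsk_socket__create_shared(xsk2, "eth0", 1, umem, rx2, tx2,
					 fill2, comp2, NULL);
}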