流量控制Netlink接口

数据结构

Netlink消息: tcmsg

/*
 * Traffic-control message header. In the requests built in this file it
 * sits directly after the struct nlmsghdr, and the kernel handlers obtain
 * it with NLMSG_DATA().
 */
struct tcmsg
{
	unsigned char	tcm_family; // the tc requests shown here always set AF_UNSPEC
	unsigned char	tcm__pad1; // padding, carries no meaning
	unsigned short	tcm__pad2;
	int	tcm_ifindex; // index of the network device the request refers to
	__u32 tcm_handle; // handle of the object being operated on; may be 0 to let the kernel choose
	__u32 tcm_parent; // handle of the parent node
	__u32 tcm_info; // meaning depends on the object; filter requests pack priority (high 16 bits) and protocol (low 16 bits)
};

系统初始化

在初始化时,向路由套接字注册了qdisc、class和filter的命令处理函数。

/*
 * Init-time setup for the packet scheduler: registers the pfifo and bfifo
 * qdiscs, creates the "psched" proc entry and hooks the qdisc and class
 * message types into rtnetlink.
 *
 * Always returns 0; the results of the individual registration calls are
 * not checked here.
 */
static int __init pktsched_init(void)
{
	// Register the two FIFO queueing disciplines: pfifo and bfifo.
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	// Create the "psched" proc entry for init_net.
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);

	// Register the qdisc and class handlers with rtnetlink.
	// NEWQDISC is served by tc_modify_qdisc; DELQDISC and GETQDISC share
	// tc_get_qdisc; all three class message types share tc_ctl_tclass.
	// Only the GET types additionally register a dump callback.
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
	return 0;
}
subsys_initcall(pktsched_init);

/*
 * Init-time setup for the classifier API: hooks the three filter message
 * types into rtnetlink. NEW, DEL and GET are all handled by
 * tc_ctl_tfilter(); GET additionally registers tc_dump_tfilter() as its
 * dump callback.
 *
 * Always returns 0.
 */
static int __init tc_filter_init(void)
{
	// RTM_NEWTFILTER previously had an empty handler argument, which does
	// not compile; it uses the same handler as DEL and GET.
	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, tc_dump_tfilter);
	return 0;
}
subsys_initcall(tc_filter_init);

qdisc的Netlink接口

如初始化所示,排队规则支持添加、删除、查询三个接口,在看内核态实现之前,先来看看用户态tc(8)对qdisc命令的处理。

tc命令行格式

tc  [ OPTIONS ] qdisc [ add | change | replace | link | delete ] dev DEV [ parent qdisc-id | root ] \
    [ handle qdisc-id ] qdisc [ qdisc specific parameters ]
  • parent和root参数指定了qdisc的parent句柄;root表示是根qdisc,否则parent应该是某个class ID;
  • handle参数指定了qdisc自己的句柄,qdisc句柄永远都是只有主号码;
  • qdisc参数指定具体qdisc的名字,如"htb";

用户态qdisc命令: do_qdisc()

/*
 * Entry point for "tc qdisc ...": dispatches on the sub-command word in
 * argv[0] and forwards the remaining words to the matching handler.
 *
 * With no words at all the qdiscs are listed. Returns whatever the chosen
 * handler returns, 0 after printing usage for "help", or -1 for an
 * unknown sub-command.
 */
int do_qdisc(int argc, char **argv)
{
	char *subcmd;
	char **rest;
	int nrest;

	if (argc < 1)
		return tc_qdisc_list(0, NULL);

	subcmd = *argv;
	rest = argv + 1;
	nrest = argc - 1;

	/* The modifying sub-commands differ only in message type and flags. */
	if (!matches(subcmd, "add"))
		return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_EXCL|NLM_F_CREATE, nrest, rest);
	if (!matches(subcmd, "change"))
		return tc_qdisc_modify(RTM_NEWQDISC, 0, nrest, rest);
	if (!matches(subcmd, "replace"))
		return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE, nrest, rest);
	if (!matches(subcmd, "link"))
		return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_REPLACE, nrest, rest);
	if (!matches(subcmd, "delete"))
		return tc_qdisc_modify(RTM_DELQDISC, 0, nrest, rest);

	/* Three spellings of the listing sub-command. */
	if (!matches(subcmd, "list") ||
	    !matches(subcmd, "show") ||
	    !matches(subcmd, "lst"))
		return tc_qdisc_list(nrest, rest);

	if (!matches(subcmd, "help")) {
		usage();
		return 0;
	}

	fprintf(stderr, "Command \"%s\" is unknown, try \"tc qdisc help\".\n", subcmd);
	return -1;
}

根据命令(增删改、查)分别调用不同函数处理,重点看tc_qdisc_modify()的实现。

修改类命令处理: tc_qdisc_modify()

/*
 * Build one qdisc request from the command line and send it to the kernel.
 *
 * @cmd:   value stored in nlmsg_type (do_qdisc() passes RTM_NEWQDISC or
 *         RTM_DELQDISC)
 * @flags: extra netlink flags, OR-ed with NLM_F_REQUEST
 * @argc, @argv: command-line words following the sub-command
 *
 * Returns 0 on success, -1 for a command-line error detected here, 1 when
 * the qdisc-specific option parser fails or the device cannot be found,
 * and 2 when rtnl_talk() fails.
 *
 * NOTE(review): NEXT_ARG(), NEXT_ARG_FWD(), duparg(), invarg() and usage()
 * are defined elsewhere; several branches fall through after calling them,
 * so they presumably do not return on error — confirm.
 */
static int tc_qdisc_modify(int cmd, unsigned int flags, int argc, char **argv)
{
	struct qdisc_util *q = NULL;
	struct tc_estimator est = {};
	struct {
		struct tc_sizespec	szopts;
		__u16			*data;
	} stab = {};
	char  d[16] = {}; // network device name from the command line
	char  k[16] = {}; // qdisc kind (name) from the command line
	struct {
		struct nlmsghdr	n;
		struct tcmsg		t;
		char			buf[TCA_BUF_MAX];
	} req = {
		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.n.nlmsg_flags = NLM_F_REQUEST | flags,
		.n.nlmsg_type = cmd,
		.t.tcm_family = AF_UNSPEC,
	};

	while (argc > 0) {
		if (strcmp(*argv, "dev") == 0) { // network device name
			NEXT_ARG();
			if (d[0])
				duparg("dev", *argv);
			strncpy(d, *argv, sizeof(d)-1);
		} else if (strcmp(*argv, "handle") == 0) { // handle of the qdisc itself
			__u32 handle;
			if (req.t.tcm_handle)
				duparg("handle", *argv);
			NEXT_ARG();
			if (get_qdisc_handle(&handle, *argv))
				invarg("invalid qdisc ID", *argv);
			req.t.tcm_handle = handle;
		} else if (strcmp(*argv, "root") == 0) { // root qdisc: parent becomes TC_H_ROOT
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_ROOT;
		} else if (strcmp(*argv, "clsact") == 0) {
			// "clsact" fixes parent, handle and kind in one go and ends
			// the generic parsing loop.
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"clsact\" is a duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_CLSACT;
			strncpy(k, "clsact", sizeof(k) - 1);
			q = get_qdisc_kind(k);
			req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
			NEXT_ARG_FWD();
			break;
		} else if (strcmp(*argv, "ingress") == 0) { // ingress qdisc: same shortcut as "clsact"
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"ingress\" is a duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_INGRESS;
			strncpy(k, "ingress", sizeof(k) - 1);
			q = get_qdisc_kind(k);
			req.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
			NEXT_ARG_FWD();
			break;
		} else if (strcmp(*argv, "parent") == 0) { // explicit parent handle
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_parent)
				duparg("parent", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("invalid parent ID", *argv);
			req.t.tcm_parent = handle;
		} else if (matches(*argv, "estimator") == 0) {
			if (parse_estimator(&argc, &argv, &est))
				return -1;
		} else if (matches(*argv, "stab") == 0) {
			if (parse_size_table(&argc, &argv, &stab.szopts) < 0)
				return -1;
			// Skips the argc--/argv++ at the bottom of the loop;
			// parse_size_table() receives &argc/&argv and presumably
			// advances them itself — confirm.
			continue;
		} else if (matches(*argv, "help") == 0) {
			usage();
		} else {
			// Anything else is taken as the qdisc kind; the remaining
			// words are left for the kind-specific parser below.
			strncpy(k, *argv, sizeof(k)-1);
			q = get_qdisc_kind(k);
			argc--; argv++;
			break;
		}
		argc--; argv++;
	}
	if (k[0]) // add the qdisc kind attribute
		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
	if (est.ewma_log)
		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));

	if (q) {
		// Let the specific qdisc implementation parse its own options.
		if (q->parse_qopt) {
			if (q->parse_qopt(q, argc, argv, &req.n))
				return 1;
		} else if (argc) {
			fprintf(stderr, "qdisc '%s' does not support option parsing\n", k);
			return -1;
		}
	} else {
		// No qdisc kind was resolved: any leftover words are an error.
		if (argc) {
			if (matches(*argv, "help") == 0)
				usage();
			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc qdisc help\".\n", *argv);
			return -1;
		}
	}

	if (check_size_table_opts(&stab.szopts)) {
		struct rtattr *tail;
		if (tc_calc_size_table(&stab.szopts, &stab.data) < 0) {
			fprintf(stderr, "failed to calculate size table.\n");
			return -1;
		}
		// TCA_STAB is a nested attribute: remember where it starts, append
		// the children, then patch its length to cover them.
		tail = NLMSG_TAIL(&req.n);
		addattr_l(&req.n, sizeof(req), TCA_STAB, NULL, 0);
		addattr_l(&req.n, sizeof(req), TCA_STAB_BASE, &stab.szopts, sizeof(stab.szopts));
		if (stab.data)
			addattr_l(&req.n, sizeof(req), TCA_STAB_DATA, stab.data,
				  stab.szopts.tsize * sizeof(__u16));
		tail->rta_len = (void *)NLMSG_TAIL(&req.n) - (void *)tail;
		if (stab.data)
			free(stab.data);
	}

	if (d[0])  { // translate the device name into an interface index
		int idx;

		ll_init_map(&rth);
		idx = ll_name_to_index(d);
		if (idx == 0) {
			fprintf(stderr, "Cannot find device \"%s\"\n", d);
			return 1;
		}
		req.t.tcm_ifindex = idx;
	}
	// Send the request to the kernel.
	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
		return 2;
	return 0;
}

新建&修改qdisc: tc_modify_qdisc()

内核对RTM_NEWQDISC命令的处理。

/*
 * rtnetlink handler registered for RTM_NEWQDISC: creates a qdisc, replaces
 * the qdisc attached at a parent, or changes the parameters of an existing
 * qdisc, depending on tcm_parent, tcm_handle and the NLM_F_* flags.
 *
 * @skb: request buffer; used for the namespace check and for notifications
 * @n:   netlink header of the request, followed by a struct tcmsg
 * @arg: unused here
 *
 * Returns 0 on success or a negative errno. When qdisc_create() reports
 * -EAGAIN the whole request is processed again from the "replay" label.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1]; // attributes parsed from the request
	struct net_device *dev;
	u32 clid; // "class id": the parent handle taken from tcm_parent
	struct Qdisc *q, *p;
	int err;

	// Only requests from the initial network namespace are accepted.
	if (net != &init_net)
		return -EINVAL;

replay:
	/* Reinit, just in case something touches this. */
	// Locate the tc header inside the netlink message.
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent; // 0 means no parent was specified
	q = p = NULL;
	// Look up the target network device.
	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	// Parse the netlink attributes into tca[].
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	// Two cases, depending on whether a parent was given:
	// - clid != 0: find the qdisc currently attached at that parent (the
	//   leaf of a class, the rx-queue qdisc for TC_H_INGRESS, or the qdisc
	//   of tx queue 0 for TC_H_ROOT);
	// - clid == 0: tcm_handle is mandatory and the qdisc is looked up by
	//   handle.
	if (clid) {
		if (clid != TC_H_ROOT) { // parent is not the root
			if (clid != TC_H_INGRESS) {
				// The major number of the parent class ID is used as
				// the handle of the qdisc that owns the class.
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				// q is the leaf qdisc of class clid inside p.
				q = qdisc_leaf(p, clid);
			} else { // ingress: use the qdisc of the rx queue
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			// Root: take the qdisc of tx queue 0.
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;
		// The body runs when nothing (or only a handle-0 qdisc) is attached
		// at the parent, when the request carries no handle, or when the
		// requested handle differs from the attached qdisc's handle.
		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				// Something else is attached and REPLACE was not requested.
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				// A qdisc handle must have a zero minor number.
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				// No qdisc with that handle on the device: create one.
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				// Refuse to graft a qdisc under itself or into a loop.
				if (q == p || (p && check_loop(q, p, 0)))
					return -ELOOP;
				// An existing qdisc gets grafted: take a reference.
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		// Without a parent the handle is the only way to name the qdisc.
		if (!tcm->tcm_handle)
			return -EINVAL;
		// Look the qdisc up on the device by handle.
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters (the "change" command) */
	// The qdisc to change was not found.
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	// The requested kind does not match the qdisc found.
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	// Apply the new parameters.
	err = qdisc_change(q, tca);
	if (err == 0) // on success, send a notification about the qdisc
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	// Creating requires the CREATE flag.
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		// Ingress: create on the rx queue; tcm_parent (TC_H_INGRESS) is
		// passed as both parent and handle.
		q = qdisc_create(dev, &dev->rx_queue, tcm->tcm_parent, tcm->tcm_parent, tca, &err);
	else
		// Otherwise create on tx queue 0 with the requested handle.
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0), tcm->tcm_parent, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN) // run the whole request again
			goto replay;
		return err;
	}

graft:
	// Attach q at the parent; on failure drop the qdisc again.
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}
	return 0;
}

创建qdisc对象: qdisc_create()

@dev、dev_queue:关联的设备和队列;
@parent: 新建qdisc的parent class ID,对于根qdisc,该值应该是TC_H_ROOT;
@handle: 新建qdisc的句柄,如果为0,则由内核自动分配一个;
@tca: 新建qdisc的参数;
@errp: 出参,创建失败(函数返回NULL)时通过它返回错误码
/*
 * Allocate and initialise a new qdisc of the kind named by tca[TCA_KIND].
 *
 * @dev, @dev_queue: device and queue the qdisc is created for
 * @parent: stored in sch->parent
 * @handle: handle for the new qdisc; TC_H_INGRESS marks an ingress qdisc,
 *          0 asks for a handle from qdisc_alloc_handle()
 * @tca:    parsed netlink attributes (KIND, OPTIONS, STAB, RATE are used)
 * @errp:   receives the error code when NULL is returned
 *
 * Returns the new qdisc after adding it with qdisc_list_add(), or NULL
 * with *errp set.
 */
static struct Qdisc *qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND]; // kind (name) attribute used to find the qdisc ops
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	// Find the operations registered for this kind; unknown kinds fail
	// with -ENOENT.
	ops = qdisc_lookup_ops(kind);
	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	// Allocate the qdisc object.
	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}
	// Remember the parent handle.
	sch->parent = parent;
	// Decide the handle of the new qdisc.
	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		// No handle supplied by the caller: allocate one.
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}
	sch->handle = handle;
	// Run the kind-specific init callback if there is one. A non-zero
	// result skips this block and falls through to err_out3 with err set.
	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		// Handle the rate attribute.
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;
			// Non-root, non-ingress qdiscs use the root sleeping lock;
			// the others use their own lock.
			if ((sch->parent != TC_H_ROOT) && !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);
			// Set up a rate estimator over sch->bstats / sch->rate_est.
			err = gen_new_estimator(&sch->bstats, &sch->rate_est, root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				 * Any broken qdiscs that would require
				 * a ops->reset() here? The qdisc was never
				 * in action so it shouldn't be necessary.
				 */
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		qdisc_list_add(sch);
		return sch;
	}
	// Error unwinding; each label undoes one more step.
	// NOTE(review): dev_put()/module_put() presumably balance references
	// taken by qdisc_alloc()/qdisc_lookup_ops() — confirm.
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}

关联新的qdisc: qdisc_graft()

该函数将新的qdisc关联到设备(作为设备的根qdisc),或者关联到指定的parent class(用classid指定),并且释放旧的qdisc(old参数指定,适用于替换场景)。

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * With parent == NULL the qdisc is attached directly to the device queues
 * (every tx queue, or only the rx queue for an ingress qdisc) and 0 is
 * returned. Otherwise the parent's class operations do the grafting and
 * the result is -EINVAL when the parent has no class operations or the
 * class cannot be found, else whatever cops->graft() returns.
 *
 * "new" may be NULL; tc_get_qdisc() calls it that way for deletion.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, struct sk_buff *skb,
    struct nlmsghdr *n, u32 classid, struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;

	if (parent == NULL) {
		// Attach the qdisc directly to the device queue(s).
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		// If either the old or the new qdisc is an ingress qdisc, only
		// the single rx queue is touched.
		if ((q && q->flags & TCQ_F_INGRESS) || (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}
		// Deactivate the device while the qdiscs are swapped.
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		for (i = 0; i < num_q; i++) {
			// Pick the queue to operate on.
			struct netdev_queue *dev_queue = &dev->rx_queue;
			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);
			// Attach at device-queue level; returns the previous qdisc.
			old = dev_graft_qdisc(dev_queue, new);
			// Every queue after the first takes an extra reference on
			// the shared new qdisc.
			if (new && i > 0)
				atomic_inc(&new->refcnt);
			// Send the notification and dispose of the old qdisc.
			notify_and_destroy(skb, n, classid, old, new);
		}
		// Reactivate the device.
		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		// Attach the new qdisc below the parent qdisc.

		// Class operations of the parent qdisc.
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		err = -EINVAL;
		if (cops) {
			// Resolve the class, let graft() swap its child, then drop
			// the class reference again.
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				// parent: owning qdisc; cl: class whose child is
				// replaced; new: qdisc to attach; old: out-parameter
				// receiving the previous child.
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			// Send the notification and dispose of the old qdisc.
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
dev_graft_qdisc()

将qdisc指定为队列的根qdisc,并返回队列原来的根qdisc。

/* Attach toplevel qdisc to device queue. */
static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	// 复位旧的qdisc
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	// 将新的qdisc指定为队列新的qdisc
	if (qdisc == NULL)
	    // 该逻辑可以使得当传入的qdisc为NULL时,可以实现删除qdisc的效果
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);
    // 返回队列原来的根qdisc
	return oqdisc;
}

修改qdisc配置: qdisc_change()

qdisc_change()用于变更某个qdisc的配置参数。

/*
 * Apply new configuration attributes to an existing qdisc.
 *
 * @sch: qdisc to modify
 * @tca: parsed netlink attributes; OPTIONS, STAB and RATE are consulted
 *
 * Returns 0 on success, -EINVAL when options are supplied but the qdisc
 * has no change() callback, the callback's error code, or the error from
 * qdisc_get_stab(). The size table is always replaced, so omitting
 * TCA_STAB clears it.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct nlattr *opts = tca[TCA_OPTIONS];
	struct qdisc_size_table *new_stab = NULL;
	int rc;

	/* Kind-specific options go through the qdisc's own change() hook. */
	if (opts != NULL) {
		if (!sch->ops->change)
			return -EINVAL;
		rc = sch->ops->change(sch, opts);
		if (rc != 0)
			return rc;
	}

	if (tca[TCA_STAB] != NULL) {
		new_stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(new_stab))
			return PTR_ERR(new_stab);
	}

	/* Release the old size table and install the new one (may be NULL). */
	qdisc_put_stab(sch->stab);
	sch->stab = new_stab;

	if (tca[TCA_RATE] != NULL) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}

	return 0;
}

获取&删除qdisc: tc_get_qdisc()

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n); // 指向tc消息开始
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent; // parent class索引
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if (net != &init_net)
		return -EINVAL;
    // 根据索引找到网络设备
	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;
    // 解析Netlink消息参数,保存到tca中
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

    // 和tc_modify_qdisc()中的逻辑类似,都是为了找到要操作的qdisc
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		// 搜索失败,没有此qdisc
		if (!q)
			return -ENOENT;
        // 句柄要匹配
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}
    // 检查qdisc的名字是否匹配
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

    // 根据消息类型决定是删除qdisc,还是查询qdisc
	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}

class的Netlink接口

从上面看到,class的创建、删除以及查询在内核都是同一个接口tc_ctl_tclass(),不过还是先来看看用户态tc(8)的class命令。

tc命令行格式

tc [ OPTIONS ] class [ add | change | replace | delete ] dev DEV parent qdisc-id [ classid class-id ] \
   qdisc [ qdisc specific parameters ]
  • parent参数指定class的parent,可以是qdisc的句柄,也可以是其它class句柄;
  • classid参数指定要操作的class的句柄;
  • qdisc参数指定操作的class是属于哪个qdisc,指定的是qdisc的名字;

注意: class一定是作用于某个qdisc的,其具体的参数需要参考qdisc的使用指导。

用户态class命令: do_class()

和do_qdisc()非常的像,我们重点看tc_class_modify()的处理。

/*
 * Entry point for "tc class ...": dispatches on the sub-command word in
 * argv[0] and forwards the remaining words to the matching handler.
 *
 * With no words at all the classes are listed. Returns whatever the chosen
 * handler returns, 0 after printing usage for "help", or -1 for an
 * unknown sub-command.
 */
int do_class(int argc, char **argv)
{
	char *subcmd;
	char **rest;
	int nrest;

	if (argc < 1)
		return tc_class_list(0, NULL);

	subcmd = *argv;
	rest = argv + 1;
	nrest = argc - 1;

	/* The modifying sub-commands differ only in message type and flags. */
	if (!matches(subcmd, "add"))
		return tc_class_modify(RTM_NEWTCLASS, NLM_F_EXCL|NLM_F_CREATE, nrest, rest);
	if (!matches(subcmd, "change"))
		return tc_class_modify(RTM_NEWTCLASS, 0, nrest, rest);
	if (!matches(subcmd, "replace"))
		return tc_class_modify(RTM_NEWTCLASS, NLM_F_CREATE, nrest, rest);
	if (!matches(subcmd, "delete"))
		return tc_class_modify(RTM_DELTCLASS, 0, nrest, rest);
#if 0
	if (!matches(subcmd, "get"))
		return tc_class_get(RTM_GETTCLASS, 0, nrest, rest);
#endif

	/* Three spellings of the listing sub-command. */
	if (!matches(subcmd, "list") ||
	    !matches(subcmd, "show") ||
	    !matches(subcmd, "lst"))
		return tc_class_list(nrest, rest);

	if (!matches(subcmd, "help")) {
		usage();
		return 0;
	}

	fprintf(stderr, "Command \"%s\" is unknown, try \"tc class help\".\n", subcmd);
	return -1;
}

修改类处理命令: tc_class_modify()

/*
 * Build one class request from the command line and send it to the kernel.
 *
 * @cmd:   value stored in nlmsg_type (do_class() passes RTM_NEWTCLASS or
 *         RTM_DELTCLASS)
 * @flags: extra netlink flags, OR-ed with NLM_F_REQUEST
 * @argc, @argv: command-line words following the sub-command
 *
 * Returns 0 on success, -1 for a command-line error detected here, 1 when
 * the qdisc is classless, its class-option parser fails or the device
 * cannot be found, and 2 when rtnl_talk() fails.
 *
 * NOTE(review): NEXT_ARG(), duparg(), invarg() and usage() are defined
 * elsewhere; several branches fall through after calling them, so they
 * presumably do not return on error — confirm.
 */
static int tc_class_modify(int cmd, unsigned int flags, int argc, char **argv)
{
	struct {
		struct nlmsghdr	n;
		struct tcmsg		t;
		char			buf[4096];
	} req = {
		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.n.nlmsg_flags = NLM_F_REQUEST | flags,
		.n.nlmsg_type = cmd,
		.t.tcm_family = AF_UNSPEC,
	};
	struct qdisc_util *q = NULL;
	struct tc_estimator est = {};
	char  d[16] = {}; // network device name from the command line
	char  k[16] = {}; // qdisc kind (name) the class belongs to

	while (argc > 0) {
		if (strcmp(*argv, "dev") == 0) { // network device name
			NEXT_ARG();
			if (d[0])
				duparg("dev", *argv);
			strncpy(d, *argv, sizeof(d)-1);
		} else if (strcmp(*argv, "classid") == 0) { // ID of the class itself
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_handle)
				duparg("classid", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("invalid class ID", *argv);
			req.t.tcm_handle = handle;
		} else if (strcmp(*argv, "handle") == 0) {
			// Classes are named with "classid"; "handle" is rejected.
			fprintf(stderr, "Error: try \"classid\" instead of \"handle\"\n");
			return -1;
		} else if (strcmp(*argv, "root") == 0) {
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"root\" is duplicate parent ID.\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_ROOT;
		} else if (strcmp(*argv, "parent") == 0) { // parent ID
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_parent)
				duparg("parent", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("invalid parent ID", *argv);
			// The parent may be a qdisc handle or another class ID.
			req.t.tcm_parent = handle;
		} else if (matches(*argv, "estimator") == 0) {
			if (parse_estimator(&argc, &argv, &est))
				return -1;
		} else if (matches(*argv, "help") == 0) {
			usage();
		} else {
			// Anything else is taken as the qdisc kind; the remaining
			// words are left for the kind-specific parser below.
			strncpy(k, *argv, sizeof(k)-1);
			q = get_qdisc_kind(k);
			argc--; argv++;
			break;
		}
		argc--; argv++;
	}

	if (k[0])
		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
	if (est.ewma_log)
		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));

	if (q) {
		// Let the qdisc implementation parse its class-specific options;
		// a qdisc without parse_copt cannot have classes.
		if (q->parse_copt == NULL) {
			fprintf(stderr, "Error: Qdisc \"%s\" is classless.\n", k);
			return 1;
		}
		if (q->parse_copt(q, argc, argv, &req.n))
			return 1;
	} else {
		if (argc) {
			if (matches(*argv, "help") == 0)
				usage();
			// Terminate the message with a newline, as the qdisc and
			// filter variants of this error do.
			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc class help\".\n", *argv);
			return -1;
		}
	}

	if (d[0])  { // translate the device name into an interface index
		ll_init_map(&rth);
		if ((req.t.tcm_ifindex = ll_name_to_index(d)) == 0) {
			fprintf(stderr, "Cannot find device \"%s\"\n", d);
			return 1;
		}
	}
	// Send the request to the kernel.
	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
		return 2;
	return 0;
}

增删改查class: tc_ctl_tclass()

/*
 * rtnetlink handler shared by RTM_NEWTCLASS, RTM_DELTCLASS and
 * RTM_GETTCLASS.
 *
 * Works out which qdisc the class belongs to from tcm_parent/tcm_handle,
 * looks the class up through the qdisc's class operations, and then
 * creates/changes, deletes or reports it depending on the message type.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n); // tc header inside the netlink message
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent; // parent handle: may name a qdisc or a class
	u32 clid = tcm->tcm_handle; // handle of the class itself
	u32 qid = TC_H_MAJ(clid); // handle of the qdisc the class belongs to
	int err;

	// Only requests from the initial network namespace are accepted.
	if (net != &init_net)
		return -EINVAL;
	// Look up the target network device.
	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;
	// Parse the netlink attributes into tca[].
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */
	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		// Reconcile the major numbers of parent and class.
		u32 qid1 = TC_H_MAJ(pid);
		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		// No qdisc given: default to the qdisc on tx queue 0.
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc by its handle; it must already exist. */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	// A qdisc without class operations cannot hold classes.
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class: complete clid with the qdisc's major number. */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	// Ask the qdisc for the class identified by clid. A non-zero result
	// is an opaque class token passed to the later callbacks and released
	// with put() at "out"; zero means the class was not found.
	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) { // class not found: only acceptable when creating
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else { // class found
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			// The class exists but the caller asked for exclusivity.
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			// Delete the class and send a notification on success.
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			// Report the class back as an RTM_NEWTCLASS message.
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}
	// Change class clid of qdisc q; with cl == 0 this is the create path.
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	// Release the class reference obtained from get().
	if (cl)
		cops->put(q, cl);
	return err;
}

filter的Netlink接口

如初始化所示,filter的增加、删除、修改和查询都是由tc_ctl_tfilter()函数完成,同样先看用户态tc(8)的filter命令实现。

tc命令行格式

# 一行命令格式如下:
 tc [ OPTIONS ] filter [ add | change | replace | delete | get ] dev DEV [ parent qdisc-id | root ] \
    [ handle filter-id ] protocol protocol prio priority filtertype [ filtertype specific parameters ] \
    flowid flow-id
  • handle参数指定filter的句柄;不同filter对该字段的解释不同;
  • protocol参数指定协议,取值ETH_P_IP等值,表示数据包所属的三层报文;
  • prio参数指定filter优先级,值越小,优先级越高,匹配时优先级高的会优先匹配;
  • filtertype参数为具体的filter名字,如u32等;

用户态filter命令: do_filter()

重点看tc_filter_modify()的处理。

/*
 * Entry point for "tc filter ...": dispatches on the sub-command word in
 * argv[0] and forwards the remaining words to the matching handler.
 *
 * With no words at all the filters are listed. Returns whatever the chosen
 * handler returns, 0 after printing usage for "help", or -1 for an
 * unknown sub-command.
 */
int do_filter(int argc, char **argv)
{
	char *subcmd;
	char **rest;
	int nrest;

	if (argc < 1)
		return tc_filter_list(0, NULL);

	subcmd = *argv;
	rest = argv + 1;
	nrest = argc - 1;

	/* The modifying sub-commands differ only in message type and flags. */
	if (!matches(subcmd, "add"))
		return tc_filter_modify(RTM_NEWTFILTER, NLM_F_EXCL|NLM_F_CREATE, nrest, rest);
	if (!matches(subcmd, "change"))
		return tc_filter_modify(RTM_NEWTFILTER, 0, nrest, rest);
	if (!matches(subcmd, "replace"))
		return tc_filter_modify(RTM_NEWTFILTER, NLM_F_CREATE, nrest, rest);
	if (!matches(subcmd, "delete"))
		return tc_filter_modify(RTM_DELTFILTER, 0, nrest, rest);
	if (!matches(subcmd, "get"))
		return tc_filter_get(RTM_GETTFILTER, 0, nrest, rest);

	/* Three spellings of the listing sub-command. */
	if (!matches(subcmd, "list") ||
	    !matches(subcmd, "show") ||
	    !matches(subcmd, "lst"))
		return tc_filter_list(nrest, rest);

	if (!matches(subcmd, "help")) {
		usage();
		return 0;
	}

	fprintf(stderr, "Command \"%s\" is unknown, try \"tc filter help\".\n", subcmd);
	return -1;
}

修改类处理命令: tc_filter_modify()

/*
 * Build one filter request from the command line and send it to the
 * kernel.
 *
 * @cmd:   value stored in nlmsg_type (do_filter() passes RTM_NEWTFILTER or
 *         RTM_DELTFILTER)
 * @flags: extra netlink flags, OR-ed with NLM_F_REQUEST
 * @argc, @argv: command-line words following the sub-command
 *
 * Returns 0 on success (or after "help" inside the option loop), -1 for a
 * command-line error detected here, 1 when the filter-specific parser
 * fails or the device cannot be found, and 2 when rtnl_talk() fails.
 *
 * NOTE(review): NEXT_ARG(), duparg(), invarg() and usage() are defined
 * elsewhere; several branches fall through after calling them, so they
 * presumably do not return on error — confirm.
 */
static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
{
	struct {
		struct nlmsghdr	n;
		struct tcmsg		t;
		char			buf[MAX_MSG];
	} req = {
		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.n.nlmsg_flags = NLM_F_REQUEST | flags,
		.n.nlmsg_type = cmd,
		.t.tcm_family = AF_UNSPEC,
	};
	struct filter_util *q = NULL;
	__u32 prio = 0; // priority defaults to 0
	__u32 protocol = 0; // protocol defaults to 0
	int protocol_set = 0;
	__u32 chain_index;
	int chain_index_set = 0;
	char *fhandle = NULL;
	char  d[16] = {};
	char  k[16] = {};
	struct tc_estimator est = {};

	// When creating a filter the protocol defaults to ETH_P_ALL instead.
	if (cmd == RTM_NEWTFILTER && flags & NLM_F_CREATE)
		protocol = htons(ETH_P_ALL);

	while (argc > 0) {
		if (strcmp(*argv, "dev") == 0) { // network device name
			NEXT_ARG();
			if (d[0])
				duparg("dev", *argv);
			strncpy(d, *argv, sizeof(d)-1);
		} else if (strcmp(*argv, "root") == 0) { // attach to the root: parent is TC_H_ROOT
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_ROOT;
		} else if (strcmp(*argv, "ingress") == 0) { // parent is clsact's ingress minor
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"ingress\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
		} else if (strcmp(*argv, "egress") == 0) { // parent is clsact's egress minor
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"egress\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
		} else if (strcmp(*argv, "parent") == 0) { // qdisc or class the filter attaches to
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_parent)
				duparg("parent", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("Invalid parent ID", *argv);
			req.t.tcm_parent = handle;
		} else if (strcmp(*argv, "handle") == 0) { // filter handle; kept as text and interpreted by the filter-specific parser
			NEXT_ARG();
			if (fhandle)
				duparg("handle", *argv);
			fhandle = *argv;
		} else if (matches(*argv, "preference") == 0 || matches(*argv, "priority") == 0) { // priority, at most 0xFFFF
			NEXT_ARG();
			if (prio)
				duparg("priority", *argv);
			if (get_u32(&prio, *argv, 0) || prio > 0xFFFF)
				invarg("invalid priority value", *argv);
		} else if (matches(*argv, "protocol") == 0) { // protocol, converted by ll_proto_a2n()
			__u16 id;
			NEXT_ARG();
			if (protocol_set)
				duparg("protocol", *argv);
			if (ll_proto_a2n(&id, *argv))
				invarg("invalid protocol", *argv);
			protocol = id;
			protocol_set = 1;
		} else if (matches(*argv, "chain") == 0) {
			NEXT_ARG();
			if (chain_index_set)
				duparg("chain", *argv);
			if (get_u32(&chain_index, *argv, 0))
				invarg("invalid chain index value", *argv);
			chain_index_set = 1;
		} else if (matches(*argv, "estimator") == 0) {
			if (parse_estimator(&argc, &argv, &est) < 0)
				return -1;
		} else if (matches(*argv, "help") == 0) {
			usage();
			return 0;
		} else {
			// Anything else is taken as the filter kind; the remaining
			// words are left for the kind-specific parser below.
			strncpy(k, *argv, sizeof(k)-1);
			q = get_filter_kind(k);
			argc--; argv++;
			break;
		}
		argc--; argv++;
	}
	// tcm_info: priority in the high 16 bits, protocol in the low 16 bits.
	req.t.tcm_info = TC_H_MAKE(prio<<16, protocol);

	if (chain_index_set)
		addattr32(&req.n, sizeof(req), TCA_CHAIN, chain_index);
	if (k[0]) // add the filter kind attribute
		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);

	if (q) {
		// Let the specific filter implementation parse its own options,
		// including the textual handle.
		if (q->parse_fopt(q, fhandle, argc, argv, &req.n))
			return 1;
	} else {
		// Without a filter kind nobody can interpret the handle.
		if (fhandle) {
			fprintf(stderr, "Must specify filter type when using \"handle\"\n");
			return -1;
		}
		if (argc) {
			if (matches(*argv, "help") == 0)
				usage();
			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc filter help\".\n", *argv);
			return -1;
		}
	}
	if (est.ewma_log)
		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));

	if (d[0])  { // translate the device name into an interface index
		ll_init_map(&rth);
		req.t.tcm_ifindex = ll_name_to_index(d);
		if (req.t.tcm_ifindex == 0) {
			fprintf(stderr, "Cannot find device \"%s\"\n", d);
			return 1;
		}
	}
	// Send the request to the kernel.
	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) {
		fprintf(stderr, "We have an error talking to the kernel\n");
		return 2;
	}
	return 0;
}

内核态增删改查filter: tc_ctl_tfilter()

/*
 * tc_ctl_tfilter - rtnetlink handler for filter requests
 * (RTM_NEWTFILTER / RTM_DELTFILTER / RTM_GETTFILTER).
 *
 * @skb: received netlink skb; used to find the sender's network namespace
 *       and passed on to tfilter_notify()
 * @n:   netlink message header; the payload starts with a struct tcmsg,
 *       followed by the TCA_* attributes
 * @arg: not referenced in the visible code
 *
 * Flow: decode priority/protocol from tcm_info, resolve device -> qdisc ->
 * (optional) class -> filter chain, locate or create the tcf_proto for the
 * requested priority, then dispatch on the message type.
 *
 * Returns 0 on success, otherwise a negative errno (either chosen here or
 * propagated from the filter's callbacks / tfilter_notify()). If the final
 * error is -EAGAIN the whole request is processed again from "replay".
 */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr *tca[TCA_MAX + 1];
	spinlock_t *root_lock;
	struct tcmsg *t;
	u32 protocol;
	u32 prio;
	u32 nprio;
	u32 parent;
	struct net_device *dev;
	struct Qdisc  *q;
	struct tcf_proto **back, **chain;
	struct tcf_proto *tp;
	struct tcf_proto_ops *tp_ops;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	unsigned long fh;
	int err;

	/* Only requests coming from the initial network namespace are accepted */
	if (net != &init_net)
		return -EINVAL;

replay:
	t = NLMSG_DATA(n);
	// Extract the priority and protocol fields from tcm_info
	protocol = TC_H_MIN(t->tcm_info);
	prio = TC_H_MAJ(t->tcm_info);
	/* nprio remembers what the user actually asked for (0 = "auto") */
	nprio = prio;
	parent = t->tcm_parent;
	/* Reset on every replay so errout only releases a class we really took */
	cl = 0;

	if (prio == 0) { // prio == 0 asks the kernel to choose a priority
		/* If no priority is given, user wants we allocated it. */
		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
			return -ENOENT;
		/* Placeholder used only for the chain search below; the real
		 * value is picked later via tcf_auto_prio() because nprio is 0 */
		prio = TC_H_MAKE(0x80000000U, 0U);
	}

	// Look up the network device the request refers to
	dev = __dev_get_by_index(&init_net, t->tcm_ifindex);
	if (dev == NULL)
		return -ENODEV;
	// Parse the netlink attributes that follow the tcmsg header into tca[]
	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	// Find the qdisc
	if (!parent) {
		// No parent given: attach the filter to the device's root qdisc
		// (qdisc_sleeping of TX queue 0) and use its handle as parent
		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0);
		q = dev_queue->qdisc_sleeping;
		parent = q->handle;
	} else {
		// Whether parent is a qdisc handle or a class handle, its major
		// number is always the handle of the owning qdisc
		q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
		if (q == NULL)
			return -EINVAL;
	}
	// A classless qdisc (no cl_ops) cannot have filters attached; fail
	if ((cops = q->ops->cl_ops) == NULL)
		return -EINVAL;

	// A non-zero minor number in parent means the filter is attached to a
	// class; look that class up (released again via cops->put() at errout)
	if (TC_H_MIN(parent)) {
		cl = cops->get(q, parent);
		if (cl == 0)
			return -ENOENT;
	}

	// Fetch the filter chain of the qdisc or of its class; when a class is
	// targeted, cl already refers to that class
	chain = cops->tcf_chain(q, cl);
	err = -EINVAL;
	if (chain == NULL)
		goto errout;

	/* Check the chain for existence of proto-tcf with this priority */
	// This walk over the chain serves two purposes:
	// 1. find the tcf_proto to operate on (there may be none) and the
	//    insertion point for a new one; entries are kept in ascending prio
	//    order, so the walk stops at the first entry with prio >= ours
	// 2. reject conflicts: an entry with the same prio is an error when the
	//    priority was auto-requested (nprio == 0) or when a non-zero
	//    requested protocol differs from the entry's protocol
	// Note that the filter handle is not what identifies the tcf_proto: the
	// kernel goes from the parent to the chain and then matches on priority
	// (and protocol) within that chain.
	// After the loop, back points at the link slot for unlink/insert and tp
	// is the matching entry or NULL.
	for (back = chain; (tp=*back) != NULL; back = &tp->next) {
		if (tp->prio >= prio) {
			if (tp->prio == prio) {
				if (!nprio || (tp->protocol != protocol && protocol))
					goto errout;
			} else
				tp = NULL;
			break;
		}
	}

	root_lock = qdisc_root_sleeping_lock(q);
	// Within one chain, filters sharing priority and protocol are grouped
	// under a single tcf_proto object
	if (tp == NULL) {
		/* Proto-tcf does not exist, create new one */
		// Kind (TCA_KIND) and protocol are mandatory here; err is still
		// -EINVAL from above
		if (tca[TCA_KIND] == NULL || !protocol)
			goto errout;
		// Only RTM_NEWTFILTER with NLM_F_CREATE may create a new entry
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
			goto errout;


		/* Create new proto tcf */
		// Allocate a zeroed tcf_proto and initialise it
		err = -ENOBUFS;
		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
		if (tp == NULL)
			goto errout;
		err = -ENOENT;
		// Find the filter operations table by kind name
		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
		if (tp_ops == NULL) { // module auto-load attempt omitted from this excerpt
            ...
			kfree(tp);
			goto errout;
		}
		tp->ops = tp_ops;
		tp->protocol = protocol;
		tp->prio = nprio ? : tcf_auto_prio(*back); // kernel picks a priority if none was given
		tp->q = q;
		tp->classify = tp_ops->classify;
		tp->classid = parent;
		// Run the classifier's init() callback
		err = tp_ops->init(tp);
		if (err != 0) {
			/* presumably drops the module reference taken by the
			 * ops lookup -- confirm against tcf_proto_lookup_ops() */
			module_put(tp_ops->owner);
			kfree(tp);
			goto errout;
		}
		// Link the new tcf_proto into the chain at the position found above
		// NOTE(review): this happens before change() below runs, and no
		// path visible in this function unlinks tp again if change()
		// fails -- confirm whether an unconfigured tp can stay chained.
		spin_lock_bh(root_lock);
		tp->next = *back;
		*back = tp;
		spin_unlock_bh(root_lock);

	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
		/* Existing entry is of a different kind; err is still -EINVAL */
		goto errout;

	// Ask the classifier's get() callback for the filter with this handle
	fh = tp->ops->get(tp, t->tcm_handle);
	if (fh == 0) {
		/* No such filter. A delete with handle 0 removes the whole
		 * tcf_proto: unlink it, notify, then destroy it */
		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
			spin_lock_bh(root_lock);
			*back = tp->next;
			spin_unlock_bh(root_lock);
			// Send the delete notification, then destroy the tcf_proto
			tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
			tcf_destroy(tp);
			err = 0;
			goto errout;
		}
		/* Otherwise only a create request may proceed to change() */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags & NLM_F_CREATE))
			goto errout;
	} else {
		/* Filter exists: dispatch on the request type */
		switch (n->nlmsg_type) {
		case RTM_NEWTFILTER:
			/* NLM_F_EXCL forbids touching an existing filter;
			 * without it, fall out of the switch to change() */
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto errout;
			break;
		case RTM_DELTFILTER:
			err = tp->ops->delete(tp, fh);
			if (err == 0)
				tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
			goto errout;
		case RTM_GETTFILTER:
			/* The reply to a get is sent as an RTM_NEWTFILTER message */
			err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
			goto errout;
		default:
			err = -EINVAL;
			goto errout;
		}
	}
	// Call the classifier's change() callback to create/update the filter
	// from the attributes; the resulting filter is returned through fh
	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
	if (err == 0)
		tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);

errout:
	/* Release the class obtained via cops->get(), if any */
	if (cl)
		cops->put(q, cl);
	if (err == -EAGAIN)
		/* Replay the request. */
		goto replay;
	return err;
}
  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值