数据结构
Netlink消息: tcmsg
/* Traffic-control header carried as the payload of tc Netlink messages. */
struct tcmsg
{
	unsigned char tcm_family; // per the original notes, always AF_UNSPEC for tc
	unsigned char tcm__pad1; // padding, carries no meaning
	unsigned short tcm__pad2; // presumably padding as well (named like tcm__pad1)
	int tcm_ifindex; // index of the associated network device
	__u32 tcm_handle; // handle of the object to operate on; 0 lets the kernel choose
	__u32 tcm_parent; // handle of the parent node
	__u32 tcm_info; // free-form; the filter code in this file packs priority (upper 16 bits) and protocol (lower 16 bits) into it
};
系统初始化
在初始化时,向路由套接字注册了qdisc、class和filter的命令处理函数。
/*
 * Initialisation of the packet scheduler core: registers the two built-in
 * FIFO qdiscs, creates the "psched" proc entry and registers the qdisc and
 * class message handlers with rtnetlink.
 *
 * Always returns 0; the results of the individual registration calls are
 * not checked here.
 */
static int __init pktsched_init(void)
{
	// Register the two default queueing disciplines: pfifo and bfifo
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	// Create the /proc/net/psched file
	proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
	// Register the qdisc and class handlers with the routing Netlink protocol.
	// Note that DELQDISC shares tc_get_qdisc() with GETQDISC, and all three
	// class message types share tc_ctl_tclass(). The last argument is
	// presumably the dump callback (only the GET types supply one).
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
	return 0;
}
subsys_initcall(pktsched_init);
/*
 * Register the filter message handlers with rtnetlink.
 *
 * All three filter message types (new, delete, get) are served by
 * tc_ctl_tfilter(); only RTM_GETTFILTER additionally supplies a dump
 * callback. Always returns 0.
 */
static int __init tc_filter_init(void)
{
	/*
	 * RTM_NEWTFILTER must name its handler explicitly: an empty argument is
	 * a syntax error, and tc_ctl_tfilter() is the function that implements
	 * the RTM_NEWTFILTER create/change path.
	 */
	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter, tc_dump_tfilter);
	return 0;
}
subsys_initcall(tc_filter_init);
qdisc的Netlink接口
如初始化所示,排队规则支持添加、删除、查询三个接口,在看内核态实现之前,先来看看用户态tc(8)对qdisc命令的处理。
tc命令行格式
tc [ OPTIONS ] qdisc [ add | change | replace | link | delete ] dev DEV [ parent qdisc-id | root ] \
[ handle qdisc-id ] qdisc [ qdisc specific parameters ]
- parent和root参数指定了qdisc的parent句柄;root表示是根qdisc,否则parent应该是某个class ID;
- handle参数指定了qdisc自己的句柄,qdisc句柄永远都是只有主号码;
- qdisc参数指定具体qdisc的名字,如"htb";
用户态qdisc命令: do_qdisc()
/*
 * Entry point for "tc qdisc ...": dispatches on the sub-command word.
 *
 * With no arguments the qdiscs are listed. The modifying sub-commands all go
 * through tc_qdisc_modify() and differ only in message type and Netlink
 * flags. Returns whatever the selected handler returns, 0 after printing
 * help, or -1 for an unknown sub-command.
 */
int do_qdisc(int argc, char **argv)
{
	/* Modifying sub-commands, tried in this order (first match wins). */
	static const struct {
		const char *verb;
		int msg_type;
		unsigned int nl_flags;
	} edit_cmds[] = {
		{ "add",     RTM_NEWQDISC, NLM_F_EXCL|NLM_F_CREATE },
		{ "change",  RTM_NEWQDISC, 0 },
		{ "replace", RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE },
		{ "link",    RTM_NEWQDISC, NLM_F_REPLACE },
		{ "delete",  RTM_DELQDISC, 0 },
	};
	unsigned int idx;

	if (argc < 1)
		return tc_qdisc_list(0, NULL);

	for (idx = 0; idx < sizeof(edit_cmds) / sizeof(edit_cmds[0]); idx++) {
		if (matches(*argv, edit_cmds[idx].verb) == 0)
			return tc_qdisc_modify(edit_cmds[idx].msg_type,
					       edit_cmds[idx].nl_flags,
					       argc-1, argv+1);
	}

	if (matches(*argv, "list") == 0 ||
	    matches(*argv, "show") == 0 ||
	    matches(*argv, "lst") == 0)
		return tc_qdisc_list(argc-1, argv+1);

	if (matches(*argv, "help") == 0) {
		usage();
		return 0;
	}

	fprintf(stderr, "Command \"%s\" is unknown, try \"tc qdisc help\".\n", *argv);
	return -1;
}
根据命令(增删改、查)分别调用不同函数处理,重点看tc_qdisc_modify()的实现。
修改类命令处理: tc_qdisc_modify()
/*
 * Build and send the Netlink request for the modifying "tc qdisc"
 * sub-commands.
 *
 * @cmd:   Netlink message type, stored in nlmsg_type
 * @flags: extra Netlink flags, OR-ed with NLM_F_REQUEST
 * @argc, @argv: remaining command-line words after the sub-command
 *
 * Returns 0 on success, -1 or 1 on argument / lookup errors, and 2 when
 * the exchange with the kernel (rtnl_talk) fails.
 */
static int tc_qdisc_modify(int cmd, unsigned int flags, int argc, char **argv)
{
	struct qdisc_util *q = NULL;
	struct tc_estimator est = {};
	struct {
		struct tc_sizespec szopts;
		__u16 *data;
	} stab = {};
	char d[16] = {}; // network device name taken from the command line
	char k[16] = {}; // qdisc kind name
	// Request buffer: Netlink header, tcmsg header, then room for attributes
	struct {
		struct nlmsghdr n;
		struct tcmsg t;
		char buf[TCA_BUF_MAX];
	} req = {
		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.n.nlmsg_flags = NLM_F_REQUEST | flags,
		.n.nlmsg_type = cmd,
		.t.tcm_family = AF_UNSPEC,
	};
	// Generic options are consumed here; the first unrecognised word is taken
	// as the qdisc kind and ends the loop.
	while (argc > 0) {
		if (strcmp(*argv, "dev") == 0) { // network device argument
			NEXT_ARG();
			if (d[0])
				duparg("dev", *argv);
			// d is zero-initialised and at most sizeof(d)-1 bytes are copied,
			// so it stays NUL-terminated
			strncpy(d, *argv, sizeof(d)-1);
		} else if (strcmp(*argv, "handle") == 0) { // qdisc handle
			__u32 handle;
			if (req.t.tcm_handle)
				duparg("handle", *argv);
			NEXT_ARG();
			if (get_qdisc_handle(&handle, *argv))
				invarg("invalid qdisc ID", *argv);
			req.t.tcm_handle = handle;
		} else if (strcmp(*argv, "root") == 0) { // operate on the root qdisc: parent becomes TC_H_ROOT
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_ROOT;
		} else if (strcmp(*argv, "clsact") == 0) {
			// "clsact" fixes parent, kind and handle in one go and ends
			// generic parsing
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"clsact\" is a duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_CLSACT;
			strncpy(k, "clsact", sizeof(k) - 1);
			q = get_qdisc_kind(k);
			req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
			NEXT_ARG_FWD();
			break;
		} else if (strcmp(*argv, "ingress") == 0) { // operate on the receive-side qdisc
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"ingress\" is a duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_INGRESS;
			strncpy(k, "ingress", sizeof(k) - 1);
			q = get_qdisc_kind(k);
			req.t.tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0);
			NEXT_ARG_FWD();
			break;
		} else if (strcmp(*argv, "parent") == 0) { // parent handle
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_parent)
				duparg("parent", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("invalid parent ID", *argv);
			req.t.tcm_parent = handle;
		} else if (matches(*argv, "estimator") == 0) {
			if (parse_estimator(&argc, &argv, &est))
				return -1;
		} else if (matches(*argv, "stab") == 0) {
			if (parse_size_table(&argc, &argv, &stab.szopts) < 0)
				return -1;
			// skips the argc--/argv++ at the bottom of the loop;
			// presumably parse_size_table() already advanced past its words
			continue;
		} else if (matches(*argv, "help") == 0) {
			usage();
		} else {
			// Anything else is the name of the concrete qdisc kind; the
			// remaining words are left for the kind-specific parser
			strncpy(k, *argv, sizeof(k)-1);
			q = get_qdisc_kind(k);
			argc--; argv++;
			break;
		}
		argc--; argv++;
	}
	if (k[0]) // add the qdisc kind-name attribute
		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
	if (est.ewma_log)
		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
	if (q) {
		// Let the concrete qdisc implementation parse its own parameters
		if (q->parse_qopt) {
			if (q->parse_qopt(q, argc, argv, &req.n))
				return 1;
		} else if (argc) {
			fprintf(stderr, "qdisc '%s' does not support option parsing\n", k);
			return -1;
		}
	} else {
		// No qdisc kind was given: any leftover words are an error
		if (argc) {
			if (matches(*argv, "help") == 0)
				usage();
			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc qdisc help\".\n", *argv);
			return -1;
		}
	}
	if (check_size_table_opts(&stab.szopts)) {
		struct rtattr *tail;
		if (tc_calc_size_table(&stab.szopts, &stab.data) < 0) {
			fprintf(stderr, "failed to calculate size table.\n");
			return -1;
		}
		// TCA_STAB is a nested attribute: remember where it starts, append
		// the children, then patch its length to cover them
		tail = NLMSG_TAIL(&req.n);
		addattr_l(&req.n, sizeof(req), TCA_STAB, NULL, 0);
		addattr_l(&req.n, sizeof(req), TCA_STAB_BASE, &stab.szopts, sizeof(stab.szopts));
		if (stab.data)
			addattr_l(&req.n, sizeof(req), TCA_STAB_DATA, stab.data,
			stab.szopts.tsize * sizeof(__u16));
		tail->rta_len = (void *)NLMSG_TAIL(&req.n) - (void *)tail;
		if (stab.data)
			free(stab.data);
	}
	if (d[0]) { // translate the device name into the interface index to operate on
		int idx;
		ll_init_map(&rth);
		idx = ll_name_to_index(d);
		if (idx == 0) {
			fprintf(stderr, "Cannot find device \"%s\"\n", d);
			return 1;
		}
		req.t.tcm_ifindex = idx;
	}
	// Talk to the kernel
	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
		return 2;
	return 0;
}
新建&修改qdisc: tc_modify_qdisc()
内核对RTM_NEWQDISC命令的处理。
/*
 * Kernel handler for RTM_NEWQDISC: create a new qdisc, graft an existing one
 * to a new parent, or change the parameters of an existing qdisc, depending
 * on the tcmsg header fields and the Netlink flags.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1]; // attributes supplied by userspace
	struct net_device *dev;
	u32 clid; // short for "class id": the parent of the qdisc
	struct Qdisc *q, *p;
	int err;
	// Only the initial network namespace is handled
	if (net != &init_net)
		return -EINVAL;
replay:
	/* Reinit, just in case something touches this. */
	// Locate the tc header inside the Netlink message
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent; // parent class ID; 0 means no parent was specified
	q = p = NULL;
	// Find the network device to configure
	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;
	// Parse the Netlink attributes into the tca array
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;
	// Two cases, depending on whether a parent class ID was given:
	// 1. not given: tcm_handle is mandatory and identifies the qdisc to change;
	// 2. given: the qdisc to modify is the one attached under that parent.
	// clid is usually non-zero, because even a root qdisc request carries the
	// "root" argument, in which case clid is TC_H_ROOT
	if (clid) {
		// Use the parent class ID to find the qdisc to modify
		if (clid != TC_H_ROOT) { // parent is not the root
			if (clid != TC_H_INGRESS) {
				// Look up the parent's qdisc by the major number of the
				// class ID (by naming convention that is the handle of
				// the qdisc owning the class)
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				// The qdisc to modify is then the leaf qdisc of that class
				q = qdisc_leaf(p, clid);
			} else { // the receive-queue qdisc is the target
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			// The root qdisc is the target: take it from tx queue 0.
			// (The root qdisc is therefore always a transmit-side qdisc.)
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;
		// 1. q == NULL: only the default qdisc is attached at this point (add case);
		// 2. tcm->tcm_handle == 0: the kernel should allocate the handle (add case);
		// 3. q->handle != tcm->tcm_handle: a different qdisc is named (replace case)
		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				// a qdisc handle has a major number only
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				// refuse to graft a qdisc underneath itself
				if (q == p || (p && check_loop(q, p, 0)))
					return -ELOOP;
				// the existing qdisc gains another user before being grafted
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;
				/* This magic test requires explanation.
				*
				* We know, that some child q is already
				* attached to this parent and have choice:
				* either to change it or to create/graft new one.
				*
				* 1. We are allowed to create/graft only
				* if CREATE and REPLACE flags are set.
				*
				* 2. If EXCL is set, requestor wanted to say,
				* that qdisc tcm_handle is not expected
				* to exist, so that we choose create/graft too.
				*
				* 3. The last case is when no flags are set.
				* Alas, it is sort of hole in API, we
				* cannot decide what to do unambiguously.
				* For now we select create/graft, if
				* user gave KIND, which does not match existing.
				*/
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				(n->nlmsg_flags&NLM_F_REPLACE) &&
				((n->nlmsg_flags&NLM_F_EXCL) ||
				(tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		// A handle is mandatory to identify the qdisc to change
		if (!tcm->tcm_handle)
			return -EINVAL;
		// Search the device's currently configured qdiscs for that handle
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}
	/* Change qdisc parameters (the "change" command path) */
	// The qdisc to change was not found
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	// Handle and kind name do not match
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	// Apply the new configuration parameters
	err = qdisc_change(q, tca);
	if (err == 0) // notify the userspace caller of the change (Netlink message)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;
create_n_graft:
	// Creating without the CREATE flag is an error
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		// Create a qdisc for the receive queue; tcm_parent (TC_H_INGRESS) is
		// passed as both parent and handle
		q = qdisc_create(dev, &dev->rx_queue, tcm->tcm_parent, tcm->tcm_parent, tca, &err);
	else
		// Create a qdisc for the transmit queue
		q = qdisc_create(dev, netdev_get_tx_queue(dev, 0), tcm->tcm_parent, tcm->tcm_handle, tca, &err);
	if (q == NULL) {
		if (err == -EAGAIN) // run the whole request again
			goto replay;
		return err;
	}
graft:
	// Replace the old qdisc with the new one
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}
	return 0;
}
创建qdisc对象: qdisc_create()
@dev、dev_queue:关联的设备和队列;
@parent: 新建qdisc的parent class ID,对于根qdisc,该值应该是TC_H_ROOT;
@handle: 新建qdisc的句柄,如果为0,则由内核自动分配一个;
@tca: 新建qdisc的参数;
@errp: 返回值
/*
 * Allocate and initialise a new qdisc.
 *
 * @dev, @dev_queue: device and queue the qdisc is associated with
 * @parent: parent class ID stored in the new qdisc
 * @handle: handle for the new qdisc; 0 asks for one to be allocated,
 *          TC_H_INGRESS marks an ingress qdisc
 * @tca:    parsed Netlink attributes (kind, options, stab, rate)
 * @errp:   receives the error code when NULL is returned
 *
 * Returns the new qdisc, or NULL with *errp set on failure.
 */
static struct Qdisc *qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
u32 parent, u32 handle, struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND]; // kind name of the new qdisc, used to look up its ops
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;
	// Find the operations table for this kind; per the original notes,
	// registering a qdisc type amounts to registering its Qdisc_ops
	ops = qdisc_lookup_ops(kind);
	err = -ENOENT;
	if (ops == NULL)
		goto err_out;
	// Allocate the qdisc object
	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}
	// Remember the parent class handle
	sch->parent = parent;
	// Decide the handle of the new qdisc
	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		// Caller did not specify one: allocate it
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}
	sch->handle = handle;
	// Run the kind-specific init callback, if there is one; a non-zero
	// result falls through to the error path below
	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out3;
			}
			sch->stab = stab;
		}
		// Handle the rate attribute
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;
			if ((sch->parent != TC_H_ROOT) && !(sch->flags & TCQ_F_INGRESS))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);
			// Set up the rate estimator over the qdisc's bstats / rate_est
			err = gen_new_estimator(&sch->bstats, &sch->rate_est, root_lock, tca[TCA_RATE]);
			if (err) {
				/*
				* Any broken qdiscs that would require
				* a ops->reset() here? The qdisc was never
				* in action so it shouldn't be necessary.
				*/
				if (ops->destroy)
					ops->destroy(sch);
				goto err_out3;
			}
		}
		qdisc_list_add(sch);
		return sch;
	}
	// Error unwinding: each label undoes one more step
err_out3:
	qdisc_put_stab(sch->stab);
	dev_put(dev);
	// presumably the allocation starts sch->padded bytes before sch - TODO confirm against qdisc_alloc()
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;
}
关联新的qdisc: qdisc_graft()
该函数将新的qdisc关联到设备(作为设备的根qdisc),或者关联到指定的parent class(用classid指定),并且释放旧的qdisc(old参数指定,适用于替换场景)。
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
* to device "dev".
*
* When appropriate send a netlink notification using 'skb'
* and "n".
*
* On success, destroy old qdisc.
*
* "new" may be NULL (the delete path passes NULL). Returns 0 on success or
* a negative errno; -EINVAL when the parent has no class operations or the
* class cannot be found.
*/
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, struct sk_buff *skb,
struct nlmsghdr *n, u32 classid, struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	int err = 0;
	if (parent == NULL) {
		// Install the new qdisc as the root qdisc of the device queues
		unsigned int i, num_q, ingress;
		ingress = 0;
		num_q = dev->num_tx_queues;
		// Ingress: a single (receive) queue is involved
		if ((q && q->flags & TCQ_F_INGRESS) || (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}
		// Deactivate the device while its qdiscs are swapped
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);
		for (i = 0; i < num_q; i++) {
			// Pick the device queue to operate on
			struct netdev_queue *dev_queue = &dev->rx_queue;
			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);
			// Attach the qdisc at the device-queue level
			old = dev_graft_qdisc(dev_queue, new);
			// every queue after the first takes an additional reference
			if (new && i > 0)
				atomic_inc(&new->refcnt);
			// Send the Netlink notification to the userspace caller
			notify_and_destroy(skb, n, classid, old, new);
		}
		// Re-activate the device
		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		// Attach the new qdisc under the parent qdisc.
		// Fetch the parent's class operations
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		err = -EINVAL;
		if (cops) {
			// The class ops' graft() callback performs the attachment
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				// parent is the parent qdisc, new the child to attach,
				// cl identifies the class whose child is replaced, and
				// old receives the previous child (out parameter)
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			}
		}
		if (!err)
			// Send the Netlink notification to the userspace caller
			notify_and_destroy(skb, n, classid, old, new);
	}
	return err;
}
dev_graft_qdisc()
将qdisc指定为队列的根qdisc,并返回队列原来的根qdisc。
/* Attach toplevel qdisc to device queue. */
static struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, struct Qdisc *qdisc)
{
struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
spinlock_t *root_lock;
root_lock = qdisc_lock(oqdisc);
spin_lock_bh(root_lock);
// 复位旧的qdisc
if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
qdisc_reset(oqdisc);
// 将新的qdisc指定为队列新的qdisc
if (qdisc == NULL)
// 该逻辑可以使得当传入的qdisc为NULL时,可以实现删除qdisc的效果
qdisc = &noop_qdisc;
dev_queue->qdisc_sleeping = qdisc;
rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
spin_unlock_bh(root_lock);
// 返回队列原来的根qdisc
return oqdisc;
}
修改qdisc配置: qdisc_change()
qdisc_change()用于变更某个qdisc的配置参数。
/*
 * Apply new configuration attributes to an existing qdisc.
 *
 * @sch: qdisc to reconfigure
 * @tca: parsed Netlink attributes; TCA_OPTIONS, TCA_STAB and TCA_RATE are
 *       looked at
 *
 * Returns 0 on success, -EINVAL when options are given but the qdisc has no
 * change() callback, or the error from change() / qdisc_get_stab().
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct nlattr *opts = tca[TCA_OPTIONS];
	struct nlattr *stab_attr = tca[TCA_STAB];
	struct nlattr *rate_attr = tca[TCA_RATE];
	struct qdisc_size_table *new_stab = NULL;

	if (opts) {
		int rc;

		if (!sch->ops->change)
			return -EINVAL;
		/* Kind-specific reconfiguration. */
		rc = sch->ops->change(sch, opts);
		if (rc)
			return rc;
	}

	if (stab_attr) {
		new_stab = qdisc_get_stab(stab_attr);
		if (IS_ERR(new_stab))
			return PTR_ERR(new_stab);
	}

	/* The old size table is always dropped, even when no new one is given. */
	qdisc_put_stab(sch->stab);
	sch->stab = new_stab;

	if (rate_attr) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      rate_attr);
	}

	return 0;
}
获取&删除qdisc: tc_get_qdisc()
/*
 * Kernel handler shared by RTM_GETQDISC and RTM_DELQDISC: locate the qdisc
 * named by the request, then either report it back or remove it.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n); // start of the tc header
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent; // parent class ID
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;
	// Only the initial network namespace is handled
	if (net != &init_net)
		return -EINVAL;
	// Find the network device by index
	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;
	// Parse the Netlink attributes into tca
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;
	// Same idea as in tc_modify_qdisc(): find the qdisc to operate on
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			struct netdev_queue *dev_queue;
			dev_queue = netdev_get_tx_queue(dev, 0);
			q = dev_queue->qdisc_sleeping;
		}
		// Lookup failed: no such qdisc
		if (!q)
			return -ENOENT;
		// If a handle was given it has to match
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}
	// If a kind name was given it has to match as well
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	// The message type decides between deleting and querying
	if (n->nlmsg_type == RTM_DELQDISC) {
		// deletion requires a parent
		if (!clid)
			return -EINVAL;
		// a qdisc with handle 0 is reported as not found
		if (q->handle == 0)
			return -ENOENT;
		// grafting NULL in place of q removes it
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
class的Netlink接口
从上面看到,class的创建、删除以及查询在内核都是同一个接口tc_ctl_tclass(),不过还是先来看看用户态tc(8)的class命令。
tc命令行格式
tc [ OPTIONS ] class [ add | change | replace | delete ] dev DEV parent qdisc-id [ classid class-id ] \
qdisc [ qdisc specific parameters ]
- parent参数指定class的parent,可以是qdisc的句柄,也可以是其它class句柄;
- classid参数指定要操作的class的句柄;
- qdisc参数指定操作的class是属于哪个qdisc,指定的是qdisc的名字;
注意: class一定是作用于某个qdisc的,其具体的参数需要参考qdisc的使用指导。
用户态class命令: do_class()
和do_qdisc()非常的像,我们重点看tc_class_modify()的处理。
/*
 * Entry point for "tc class ...": dispatches on the sub-command word.
 *
 * With no arguments the classes are listed. The modifying sub-commands all
 * go through tc_class_modify() and differ only in message type and Netlink
 * flags. Returns whatever the selected handler returns, 0 after printing
 * help, or -1 for an unknown sub-command.
 */
int do_class(int argc, char **argv)
{
	/* Modifying sub-commands, tried in this order (first match wins). */
	static const struct {
		const char *verb;
		int msg_type;
		unsigned int nl_flags;
	} edit_cmds[] = {
		{ "add",     RTM_NEWTCLASS, NLM_F_EXCL|NLM_F_CREATE },
		{ "change",  RTM_NEWTCLASS, 0 },
		{ "replace", RTM_NEWTCLASS, NLM_F_CREATE },
		{ "delete",  RTM_DELTCLASS, 0 },
	};
	unsigned int idx;

	if (argc < 1)
		return tc_class_list(0, NULL);

	for (idx = 0; idx < sizeof(edit_cmds) / sizeof(edit_cmds[0]); idx++) {
		if (matches(*argv, edit_cmds[idx].verb) == 0)
			return tc_class_modify(edit_cmds[idx].msg_type,
					       edit_cmds[idx].nl_flags,
					       argc-1, argv+1);
	}

#if 0
	if (matches(*argv, "get") == 0)
		return tc_class_get(RTM_GETTCLASS, 0, argc-1, argv+1);
#endif

	if (matches(*argv, "list") == 0 ||
	    matches(*argv, "show") == 0 ||
	    matches(*argv, "lst") == 0)
		return tc_class_list(argc-1, argv+1);

	if (matches(*argv, "help") == 0) {
		usage();
		return 0;
	}

	fprintf(stderr, "Command \"%s\" is unknown, try \"tc class help\".\n", *argv);
	return -1;
}
修改类处理命令: tc_class_modify()
/*
 * Build and send the Netlink request for the modifying "tc class"
 * sub-commands.
 *
 * @cmd:   Netlink message type, stored in nlmsg_type
 * @flags: extra Netlink flags, OR-ed with NLM_F_REQUEST
 * @argc, @argv: remaining command-line words after the sub-command
 *
 * Returns 0 on success, -1 or 1 on argument / lookup errors, and 2 when
 * the exchange with the kernel (rtnl_talk) fails.
 */
static int tc_class_modify(int cmd, unsigned int flags, int argc, char **argv)
{
	// Request buffer: Netlink header, tcmsg header, then room for attributes
	struct {
		struct nlmsghdr n;
		struct tcmsg t;
		char buf[4096];
	} req = {
		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.n.nlmsg_flags = NLM_F_REQUEST | flags,
		.n.nlmsg_type = cmd,
		.t.tcm_family = AF_UNSPEC,
	};
	struct qdisc_util *q = NULL;
	struct tc_estimator est = {};
	char d[16] = {}; // network device name
	char k[16] = {}; // kind name of the owning qdisc
	// Generic options are consumed here; the first unrecognised word is taken
	// as the kind name and ends the loop.
	while (argc > 0) {
		if (strcmp(*argv, "dev") == 0) { // network device argument
			NEXT_ARG();
			if (d[0])
				duparg("dev", *argv);
			strncpy(d, *argv, sizeof(d)-1);
		} else if (strcmp(*argv, "classid") == 0) { // class ID argument
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_handle)
				duparg("classid", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("invalid class ID", *argv);
			req.t.tcm_handle = handle;
		} else if (strcmp(*argv, "handle") == 0) {
			// classes are addressed with "classid", not "handle"
			fprintf(stderr, "Error: try \"classid\" instead of \"handle\"\n");
			return -1;
		} else if (strcmp(*argv, "root") == 0) {
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"root\" is duplicate parent ID.\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_ROOT;
		} else if (strcmp(*argv, "parent") == 0) { // parent ID argument
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_parent)
				duparg("parent", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("invalid parent ID", *argv);
			// The parent of a root class is a qdisc; for any other class it is a class
			req.t.tcm_parent = handle;
		} else if (matches(*argv, "estimator") == 0) {
			if (parse_estimator(&argc, &argv, &est))
				return -1;
		} else if (matches(*argv, "help") == 0) {
			usage();
		} else {
			// Look up the implementation by kind name; the remaining
			// words are left for the kind-specific parser
			strncpy(k, *argv, sizeof(k)-1);
			q = get_qdisc_kind(k);
			argc--; argv++;
			break;
		}
		argc--; argv++;
	}
	if (k[0])
		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
	if (est.ewma_log)
		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
	if (q) {
		// Let the qdisc's class-option parser handle the kind-specific parameters
		if (q->parse_copt == NULL) {
			fprintf(stderr, "Error: Qdisc \"%s\" is classless.\n", k);
			return 1;
		}
		if (q->parse_copt(q, argc, argv, &req.n))
			return 1;
	} else {
		// No kind was given: any leftover words are an error
		if (argc) {
			if (matches(*argv, "help") == 0)
				usage();
			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc class help\".", *argv);
			return -1;
		}
	}
	if (d[0]) { // translate the device name into an interface index
		ll_init_map(&rth);
		if ((req.t.tcm_ifindex = ll_name_to_index(d)) == 0) {
			fprintf(stderr, "Cannot find device \"%s\"\n", d);
			return 1;
		}
	}
	// Talk to the kernel
	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0)
		return 2;
	return 0;
}
增删改查class: tc_ctl_tclass()
/*
 * Kernel handler shared by RTM_NEWTCLASS, RTM_DELTCLASS and RTM_GETTCLASS:
 * resolves the owning qdisc and the class from the request, then creates /
 * changes, deletes or reports the class through the qdisc's class
 * operations.
 *
 * Returns 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct tcmsg *tcm = NLMSG_DATA(n); // tc header of the Netlink message
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent; // parent handle; may name a qdisc or a class
	u32 clid = tcm->tcm_handle; // handle of this class
	u32 qid = TC_H_MAJ(clid); // handle of the qdisc the class belongs to
	int err;
	// Only the initial network namespace is handled
	if (net != &init_net)
		return -EINVAL;
	// Find the network device
	if ((dev = __dev_get_by_index(&init_net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;
	// Parse the attributes carried by the Netlink message
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;
	/*
	parent == TC_H_UNSPEC - unspecified parent.
	parent == TC_H_ROOT - class is root, which has no parent.
	parent == X:0 - parent is root class.
	parent == X:Y - parent is a node in hierarchy.
	parent == 0:Y - parent is X:Y, where X:0 is qdisc.
	handle == 0:0 - generate handle from kernel pool.
	handle == 0:Y - class is X:Y, where X:0 is qdisc.
	handle == X:Y - clear.
	handle == X:0 - root class.
	*/
	/* Step 1. Determine qdisc handle X:0 */
	dev_queue = netdev_get_tx_queue(dev, 0);
	if (pid != TC_H_ROOT) {
		// Validate and complete pid and qid
		u32 qid1 = TC_H_MAJ(pid);
		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
		/* Now qid is genuine qdisc handle consistent
		both with parent and child.
		TC_H_MAJ(pid) still may be unspecified, complete it now.
		*/
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		// No owning qdisc given: default to the root qdisc of tx queue 0
		if (qid == 0)
			qid = dev_queue->qdisc_sleeping->handle;
	}
	/* OK. Locate qdisc: look it up by handle; it must already exist */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;
	// A qdisc without class operations is classless and cannot carry classes
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;
	/* Now try to get class; per the original notes, userspace effectively supplies only the minor number */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);
	// Ask the class ops' get() callback for the class identified by clid.
	// The result cl is an opaque token passed to the later callbacks; 0 means
	// the class does not exist. A non-zero cl is released with put() at "out",
	// so get() is expected to take a reference.
	if (clid)
		cl = cops->get(q, clid);
	if (cl == 0) { // class does not exist; only acceptable when creating
		// Not a create request, so a missing class is a failure
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else { // class found
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			// Class already exists but the exclusive flag was set: fail
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			// Delete the class
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			// Report the class back to the caller
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}
	// Change class clid under qdisc q, creating it when it does not exist yet
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
out:
	// Drop the reference obtained from get()
	if (cl)
		cops->put(q, cl);
	return err;
}
filter的Netlink接口
如初始化所示,filter的增加、删除、修改、查询都是由tc_ctl_tfilter()函数完成,同样先看用户态tc(8)的filter命令实现。
tc命令行格式
# 一行命令格式如下:
tc [ OPTIONS ] filter [ add | change | replace | delete | get ] dev DEV [ parent qdisc-id | root ] \
[ handle filter-id ] protocol protocol prio priority filtertype [ filtertype specific parameters ] \
flowid flow-id
- handle参数指定filter的句柄;不同filter对该字段的解释不同;
- protocol参数指定协议,取值ETH_P_IP等值,表示数据包所属的三层报文;
- prio参数指定filter优先级,值越小,优先级越高,匹配时优先级高的会优先匹配;
- filtertype参数为具体的filter名字,如u32等;
用户态filter命令: do_filter()
重点看tc_filter_modify()的处理。
/*
 * Entry point for "tc filter ...": dispatches on the sub-command word.
 *
 * With no arguments the filters are listed. The modifying sub-commands all
 * go through tc_filter_modify() and differ only in message type and Netlink
 * flags; "get" has its own handler. Returns whatever the selected handler
 * returns, 0 after printing help, or -1 for an unknown sub-command.
 */
int do_filter(int argc, char **argv)
{
	/* Modifying sub-commands, tried in this order (first match wins). */
	static const struct {
		const char *verb;
		int msg_type;
		unsigned int nl_flags;
	} edit_cmds[] = {
		{ "add",     RTM_NEWTFILTER, NLM_F_EXCL|NLM_F_CREATE },
		{ "change",  RTM_NEWTFILTER, 0 },
		{ "replace", RTM_NEWTFILTER, NLM_F_CREATE },
		{ "delete",  RTM_DELTFILTER, 0 },
	};
	unsigned int idx;

	if (argc < 1)
		return tc_filter_list(0, NULL);

	for (idx = 0; idx < sizeof(edit_cmds) / sizeof(edit_cmds[0]); idx++) {
		if (matches(*argv, edit_cmds[idx].verb) == 0)
			return tc_filter_modify(edit_cmds[idx].msg_type,
						edit_cmds[idx].nl_flags,
						argc-1, argv+1);
	}

	if (matches(*argv, "get") == 0)
		return tc_filter_get(RTM_GETTFILTER, 0, argc-1, argv+1);

	if (matches(*argv, "list") == 0 ||
	    matches(*argv, "show") == 0 ||
	    matches(*argv, "lst") == 0)
		return tc_filter_list(argc-1, argv+1);

	if (matches(*argv, "help") == 0) {
		usage();
		return 0;
	}

	fprintf(stderr, "Command \"%s\" is unknown, try \"tc filter help\".\n", *argv);
	return -1;
}
修改类处理命令: tc_filter_modify()
/*
 * Build and send the Netlink request for the modifying "tc filter"
 * sub-commands.
 *
 * @cmd:   Netlink message type, stored in nlmsg_type
 * @flags: extra Netlink flags, OR-ed with NLM_F_REQUEST
 * @argc, @argv: remaining command-line words after the sub-command
 *
 * Returns 0 on success (or after printing help), -1 or 1 on argument /
 * lookup errors, and 2 when the exchange with the kernel (rtnl_talk) fails.
 */
static int tc_filter_modify(int cmd, unsigned int flags, int argc, char **argv)
{
	// Request buffer: Netlink header, tcmsg header, then room for attributes
	struct {
		struct nlmsghdr n;
		struct tcmsg t;
		char buf[MAX_MSG];
	} req = {
		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
		.n.nlmsg_flags = NLM_F_REQUEST | flags,
		.n.nlmsg_type = cmd,
		.t.tcm_family = AF_UNSPEC,
	};
	struct filter_util *q = NULL;
	__u32 prio = 0; // priority defaults to 0
	__u32 protocol = 0; // protocol defaults to 0
	int protocol_set = 0;
	__u32 chain_index; // only read when chain_index_set is non-zero
	int chain_index_set = 0;
	char *fhandle = NULL;
	char d[16] = {}; // network device name
	char k[16] = {}; // filter kind name
	struct tc_estimator est = {};
	// A newly created filter defaults to protocol ETH_P_ALL
	if (cmd == RTM_NEWTFILTER && flags & NLM_F_CREATE)
		protocol = htons(ETH_P_ALL);
	// Generic options are consumed here; the first unrecognised word is taken
	// as the filter kind and ends the loop.
	while (argc > 0) {
		if (strcmp(*argv, "dev") == 0) { // network device name
			NEXT_ARG();
			if (d[0])
				duparg("dev", *argv);
			strncpy(d, *argv, sizeof(d)-1);
		} else if (strcmp(*argv, "root") == 0) { // attach the filter to the root qdisc
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_ROOT;
		} else if (strcmp(*argv, "ingress") == 0) { // attach the filter to the clsact ingress hook
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"ingress\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS);
		} else if (strcmp(*argv, "egress") == 0) { // attach the filter to the clsact egress hook
			if (req.t.tcm_parent) {
				fprintf(stderr, "Error: \"egress\" is duplicate parent ID\n");
				return -1;
			}
			req.t.tcm_parent = TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS);
		} else if (strcmp(*argv, "parent") == 0) { // qdisc or class to attach to
			__u32 handle;
			NEXT_ARG();
			if (req.t.tcm_parent)
				duparg("parent", *argv);
			if (get_tc_classid(&handle, *argv))
				invarg("Invalid parent ID", *argv);
			req.t.tcm_parent = handle;
		} else if (strcmp(*argv, "handle") == 0) { // filter handle; kept as text and interpreted by the concrete filter
			NEXT_ARG();
			if (fhandle)
				duparg("handle", *argv);
			fhandle = *argv;
		} else if (matches(*argv, "preference") == 0 || matches(*argv, "priority") == 0) { // priority
			NEXT_ARG();
			if (prio)
				duparg("priority", *argv);
			// must fit in 16 bits, since it is packed into the upper half of tcm_info
			if (get_u32(&prio, *argv, 0) || prio > 0xFFFF)
				invarg("invalid priority value", *argv);
		} else if (matches(*argv, "protocol") == 0) { // protocol (ETH_P_* family of values)
			__u16 id;
			NEXT_ARG();
			if (protocol_set)
				duparg("protocol", *argv);
			if (ll_proto_a2n(&id, *argv))
				invarg("invalid protocol", *argv);
			protocol = id;
			protocol_set = 1;
		} else if (matches(*argv, "chain") == 0) {
			NEXT_ARG();
			if (chain_index_set)
				duparg("chain", *argv);
			if (get_u32(&chain_index, *argv, 0))
				invarg("invalid chain index value", *argv);
			chain_index_set = 1;
		} else if (matches(*argv, "estimator") == 0) {
			if (parse_estimator(&argc, &argv, &est) < 0)
				return -1;
		} else if (matches(*argv, "help") == 0) {
			usage();
			return 0;
		} else {
			// Filter kind name; the remaining words are left for the
			// kind-specific parser
			strncpy(k, *argv, sizeof(k)-1);
			q = get_filter_kind(k);
			argc--; argv++;
			break;
		}
		argc--; argv++;
	}
	// tcm_info: upper 16 bits carry the priority, lower 16 bits the protocol
	req.t.tcm_info = TC_H_MAKE(prio<<16, protocol);
	if (chain_index_set)
		addattr32(&req.n, sizeof(req), TCA_CHAIN, chain_index);
	if (k[0]) // add the filter kind-name attribute
		addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
	if (q) {
		// Let the concrete filter parse its own parameters (and the handle text)
		if (q->parse_fopt(q, fhandle, argc, argv, &req.n))
			return 1;
	} else {
		// Without a filter kind, a handle cannot be interpreted
		if (fhandle) {
			fprintf(stderr, "Must specify filter type when using \"handle\"\n");
			return -1;
		}
		if (argc) {
			if (matches(*argv, "help") == 0)
				usage();
			fprintf(stderr, "Garbage instead of arguments \"%s ...\". Try \"tc filter help\".\n", *argv);
			return -1;
		}
	}
	if (est.ewma_log)
		addattr_l(&req.n, sizeof(req), TCA_RATE, &est, sizeof(est));
	if (d[0]) { // fill in the network device index
		ll_init_map(&rth);
		req.t.tcm_ifindex = ll_name_to_index(d);
		if (req.t.tcm_ifindex == 0) {
			fprintf(stderr, "Cannot find device \"%s\"\n", d);
			return 1;
		}
	}
	// Talk to the kernel
	if (rtnl_talk(&rth, &req.n, NULL, 0) < 0) {
		fprintf(stderr, "We have an error talking to the kernel\n");
		return 2;
	}
	return 0;
}
内核态增删改查filter: tc_ctl_tfilter()
/*
 * Kernel handler for the filter message types (RTM_NEWTFILTER,
 * RTM_DELTFILTER, RTM_GETTFILTER): resolves the qdisc / class the filter
 * hangs off, finds or creates the tcf_proto for the requested priority and
 * protocol, then creates / changes, deletes or reports the filter.
 *
 * Returns 0 on success or a negative errno; a result of -EAGAIN at the
 * "errout" label causes the whole request to be replayed.
 *
 * NOTE: this excerpt elides the module auto-loading code (the "..." line).
 */
static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct nlattr *tca[TCA_MAX + 1];
	spinlock_t *root_lock;
	struct tcmsg *t;
	u32 protocol;
	u32 prio;
	u32 nprio;
	u32 parent;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcf_proto **back, **chain;
	struct tcf_proto *tp;
	struct tcf_proto_ops *tp_ops;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	unsigned long fh;
	int err;
	// Only the initial network namespace is handled
	if (net != &init_net)
		return -EINVAL;
replay:
	t = NLMSG_DATA(n);
	// Extract the protocol and priority fields from tcm_info
	protocol = TC_H_MIN(t->tcm_info);
	prio = TC_H_MAJ(t->tcm_info);
	nprio = prio; // remembers whether the user supplied a priority at all
	parent = t->tcm_parent;
	cl = 0;
	if (prio == 0) { // prio 0 asks the kernel to pick a default priority
		/* If no priority is given, user wants we allocated it. */
		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
			return -ENOENT;
		prio = TC_H_MAKE(0x80000000U, 0U);
	}
	// Find the network device
	dev = __dev_get_by_index(&init_net, t->tcm_ifindex);
	if (dev == NULL)
		return -ENODEV;
	// Parse the Netlink attributes
	err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;
	// Find the qdisc
	if (!parent) {
		// No parent given: attach the filter to the root qdisc of tx queue 0
		struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, 0);
		q = dev_queue->qdisc_sleeping;
		parent = q->handle;
	} else {
		// Whether parent names a qdisc or a class, its major number is the qdisc handle
		q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent));
		if (q == NULL)
			return -EINVAL;
	}
	// A classless qdisc cannot carry filters: fail
	if ((cops = q->ops->cl_ops) == NULL)
		return -EINVAL;
	// A non-zero minor number means the filter hangs off a class: fetch that class
	if (TC_H_MIN(parent)) {
		cl = cops->get(q, parent);
		if (cl == 0)
			return -ENOENT;
	}
	// Fetch the filter list of the qdisc, or of the class when cl was resolved above
	chain = cops->tcf_chain(q, cl);
	err = -EINVAL;
	if (chain == NULL)
		goto errout;
	/* Check the chain for existence of proto-tcf with this priority */
	// This walk serves two purposes:
	// 1. find the tcf_proto to operate on (there may be none) together with
	//    the insertion point; entries with smaller prio come first;
	// 2. reject a request whose priority matches an existing entry but whose
	//    protocol differs.
	// Note that the filter handle is not what identifies the tcf_proto: the
	// kernel goes from the parent to the filter list and then matches on
	// priority and protocol.
	for (back = chain; (tp=*back) != NULL; back = &tp->next) {
		if (tp->prio >= prio) {
			if (tp->prio == prio) {
				if (!nprio || (tp->protocol != protocol && protocol))
					goto errout;
			} else
				tp = NULL;
			break;
		}
	}
	root_lock = qdisc_root_sleeping_lock(q);
	// Within one filter list, filters of equal priority and protocol share a
	// single tcf_proto object
	if (tp == NULL) {
		/* Proto-tcf does not exist, create new one */
		// Kind name and protocol are mandatory in this case
		if (tca[TCA_KIND] == NULL || !protocol)
			goto errout;
		// Only a create request may get here
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE))
			goto errout;
		/* Create new proto tcf */
		// Allocate and initialise the tcf_proto object
		err = -ENOBUFS;
		tp = kzalloc(sizeof(*tp), GFP_KERNEL);
		if (tp == NULL)
			goto errout;
		err = -ENOENT;
		// Look up the filter operations by kind name
		tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]);
		if (tp_ops == NULL) { // the attempt to load the classifier module is elided in this excerpt
			...
			kfree(tp);
			goto errout;
		}
		tp->ops = tp_ops;
		tp->protocol = protocol;
		tp->prio = nprio ? : tcf_auto_prio(*back); // no user priority: let the kernel choose one
		tp->q = q;
		tp->classify = tp_ops->classify;
		tp->classid = parent;
		// Run the filter kind's init() callback
		err = tp_ops->init(tp);
		if (err != 0) {
			module_put(tp_ops->owner);
			kfree(tp);
			goto errout;
		}
		// Link the new tcf_proto into the filter list at the insertion point
		spin_lock_bh(root_lock);
		tp->next = *back;
		*back = tp;
		spin_unlock_bh(root_lock);
	} else if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], tp->ops->kind))
		goto errout;
	// Ask the filter kind's get() callback for the filter with this handle
	fh = tp->ops->get(tp, t->tcm_handle);
	if (fh == 0) {
		// Delete with handle 0: remove the whole tcf_proto from the list
		if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) {
			spin_lock_bh(root_lock);
			*back = tp->next;
			spin_unlock_bh(root_lock);
			// Notify, then destroy the tcf_proto
			tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
			tcf_destroy(tp);
			err = 0;
			goto errout;
		}
		// Otherwise a missing filter is only acceptable for a create request
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags & NLM_F_CREATE))
			goto errout;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTFILTER:
			// Filter exists but the exclusive flag was set: fail
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto errout;
			break;
		case RTM_DELTFILTER:
			err = tp->ops->delete(tp, fh);
			if (err == 0)
				tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER);
			goto errout;
		case RTM_GETTFILTER:
			err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
			goto errout;
		default:
			err = -EINVAL;
			goto errout;
		}
	}
	// The filter kind's change() callback applies the parameters; the
	// resulting filter is returned through fh
	err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh);
	if (err == 0)
		tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER);
errout:
	// Drop the class reference obtained from get()
	if (cl)
		cops->put(q, cl);
	if (err == -EAGAIN)
		/* Replay the request. */
		goto replay;
	return err;
}