文章目录
在《策略路由之初始化》中已经看到,策略路由框架在初始化时向路由套接字注册了三个子命令:RTM_NEWRULE、RTM_DELRULE和RTM_GETRULE,分别用于策略路由规则的添加、删除和查询。这篇笔记就来看看内核是如何实现这些接口的,从这些接口的实现中也可以进一步理解策略路由规则相关数据结构中各个字段的含义。
源代码路径 | 说明 |
---|---|
net/core/fib_rules.c | 策略路由非协议相关实现 |
net/ipv4/fib_rules.c | 策略路由的IPv4实现 |
include/net/fib_rules.h | 策略路由相关数据结构定义 |
命令行
以ip命令为例:
// 增加、删除、查询几类命令
ip rule [ list | add | del | flush ] SELECTOR ACTION
SELECTOR := [ from PREFIX ] [ to PREFIX ] [ tos TOS ] [ fwmark FWMARK[/MASK] ] [ iif STRING ] \
[ oif STRING ] [ pref NUMBER ]
ACTION := [ table TABLE_ID ] [ prohibit | reject | unreachable ]
TABLE_ID := [ local | main | default | NUMBER ]
netlink消息内容
策略路由消息头: fib_rule_hdr
策略路由的Netlink消息首部,携带的信息是fib_rule_hdr对象:
/*
 * Fixed header at the start of every policy-routing rule netlink message.
 * fib_nl_newrule() and fib_nl_delrule() obtain it with nlmsg_data(nlh).
 */
struct fib_rule_hdr
{
__u8 family; // address family; used to look up the per-protocol fib_rules_ops
__u8 dst_len; // destination prefix length; FRA_DST is only read when this is non-zero
__u8 src_len; // source prefix length; FRA_SRC is only read when this is non-zero
__u8 tos; // TOS value; the IPv4 configure callback rejects bits outside IPTOS_TOS_MASK
__u8 table; // routing table id; reaches rule->table through frh_get_table()
__u8 res1; /* reserved */
__u8 res2; /* reserved */
__u8 action; // copied to rule->action
__u32 flags; // copied to rule->flags
};
策略路由在netlink中定义了如下属性值,部分属性和fib_rule_hdr中的字段有重复,如果都有指定,那么以属性值为准。
/*
 * Netlink attribute types that may follow struct fib_rule_hdr in a rule
 * message. The handlers index the parsed attribute array (tb[]) with these
 * values. The enumerator order fixes the numeric values, so entries must
 * not be reordered.
 */
enum
{
FRA_UNSPEC,
FRA_DST, /* destination address */
FRA_SRC, /* source address */
FRA_IFNAME, /* interface name; resolved to an ifindex when the rule is added */
FRA_GOTO, /* target to jump to (FR_ACT_GOTO); holds the target rule's preference */
FRA_UNUSED2,
FRA_PRIORITY, /* priority/preference */
FRA_UNUSED3,
FRA_UNUSED4,
FRA_UNUSED5,
FRA_FWMARK, /* mark */
FRA_FLOW, /* flow/class id */
FRA_UNUSED6,
FRA_UNUSED7,
FRA_UNUSED8,
FRA_TABLE, /* Extended table id */
FRA_FWMASK, /* mask for netfilter mark */
__FRA_MAX
};
策略路由添加: fib_nl_newrule()
static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh); // netlink消息内容
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *r, *last = NULL;
struct nlattr *tb[FRA_MAX+1]; // 属性数据
int err = -EINVAL, unresolved = 0;
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
goto errout;
// 根据family找到协议对应的策略路由操作集
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
goto errout;
}
// 解析属性参数
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
if (err < 0)
goto errout;
// 校验源和目的地址长度信息是否和协议族指定的ops->addr_size匹配
err = validate_rulemsg(frh, tb, ops);
if (err < 0)
goto errout;
// 分配策略路由规则对象
rule = kzalloc(ops->rule_size, GFP_KERNEL);
if (rule == NULL) {
err = -ENOMEM;
goto errout;
}
rule->fr_net = hold_net(net);
if (tb[FRA_PRIORITY]) // 如果有指定优先级,那么设定策略路由规则优先级
rule->pref = nla_get_u32(tb[FRA_PRIORITY]);
// 这里,ifindex可能有三种值:0(没有设置);-1(设置了一个不存在的接口);某个接口的索引
if (tb[FRA_IFNAME]) {
struct net_device *dev;
rule->ifindex = -1;
nla_strlcpy(rule->ifname, tb[FRA_IFNAME], IFNAMSIZ);
dev = __dev_get_by_name(net, rule->ifname);
if (dev)
rule->ifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) { // 设置fw mark值
rule->mark = nla_get_u32(tb[FRA_FWMARK]);
if (rule->mark)
/* compatibility: if the mark value is non-zero all bits
* are compared unless a mask is explicitly specified.
*/
rule->mark_mask = 0xFFFFFFFF; // 默认的mask,所有位都有效
}
if (tb[FRA_FWMASK]) // // 设置fw mark mask值
rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);
rule->action = frh->action;
rule->flags = frh->flags;
// 如果fib_rule_hdr和属性中都指定了table,那么以属性为准
rule->table = frh_get_table(frh, tb);
// 没有指定策略路由规则优先级,但是协议族提供了default_pref()回调,
// 那么使用该回调获取规则的默认优先级
if (!rule->pref && ops->default_pref)
rule->pref = ops->default_pref(ops);
err = -EINVAL;
if (tb[FRA_GOTO]) { // 指定了跳转属性
if (rule->action != FR_ACT_GOTO) // action和属性必须要匹配,否则为参数非法
goto errout_free;
rule->target = nla_get_u32(tb[FRA_GOTO]);
// 不允许从优先级低的规则跳转到优先级高的规则,这样会是的匹配过程形成环路
if (rule->target <= rule->pref)
goto errout_free;
// 找到要跳转的规则的指针,可见跳转目标是用优先级标识的
list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref == rule->target) {
rule->ctarget = r;
break;
}
}
// 跳转的目标规则尚不存在,没关系,先标记为unresolved,
// 每次有新规则添加时,会尝试解决这种无效跳转
if (rule->ctarget == NULL)
unresolved = 1;
} else if (rule->action == FR_ACT_GOTO)
goto errout_free;
// 调用协议族接口对协议特有的策略路由规则字段进行设置
err = ops->configure(rule, skb, nlh, frh, tb);
if (err < 0)
goto errout_free;
// 根据优先级由高到底的顺序,为新增规则找到插入位置
list_for_each_entry(r, &ops->rules_list, list) {
if (r->pref > rule->pref)
break;
last = r;
}
// 规则的引用计数+1
fib_rule_get(rule);
// 新增的路由规则可能是原有规则的跳转目标,尝试解决那些规则的无效跳转
if (ops->unresolved_rules) {
/*
* There are unresolved goto rules in the list, check if
* any of them are pointing to this new rule.
*/
list_for_each_entry(r, &ops->rules_list, list) {
if (r->action == FR_ACT_GOTO && r->target == rule->pref) {
BUG_ON(r->ctarget != NULL);
rcu_assign_pointer(r->ctarget, rule);
if (--ops->unresolved_rules == 0)
break;
}
}
}
// 记录表中有多少个跳转规则
if (rule->action == FR_ACT_GOTO)
ops->nr_goto_rules++;
if (unresolved) // 如果最终还是没有解决跳转关系,增加ops中的计数器
ops->unresolved_rules++;
// 将新的策略路由规则插入表中
if (last)
list_add_rcu(&rule->list, &last->list);
else
list_add_rcu(&rule->list, &ops->rules_list);
// 通知用户态,有新的策略路由规则添加了
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
flush_route_cache(ops); // 刷新路由缓存
rules_ops_put(ops);
return 0;
errout_free:
release_net(rule->fr_net);
kfree(rule);
errout:
rules_ops_put(ops);
return err;
}
static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
IPv4默认优先级回调: fib4_rule_default_pref()
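该回调取规则链表中第二条规则的优先级减1作为默认优先级(链表中的第一条通常是优先级为0的local规则);如果链表中不足两条规则,或者第二条规则的优先级为0,则返回0。因此,不指定优先级添加的策略路由规则会排在除第一条规则之外的所有已有规则之前。需要注意,这种方式并不能保证分配到的优先级一定唯一,例如第二条规则的优先级为1时,返回的0就会和第一条规则的优先级(通常为0)重复。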
/*
 * fib4_rule_default_pref - pick a preference for a rule added without one.
 * @ops: IPv4 rule operations whose rules_list is inspected
 *
 * Looks at the second entry of ops->rules_list and returns its preference
 * minus one. Returns 0 when the list holds fewer than two rules or when
 * the second rule's preference is already 0.
 */
static u32 fib4_rule_default_pref(struct fib_rules_ops *ops)
{
	struct list_head *head = &ops->rules_list;
	struct list_head *second;
	struct fib_rule *second_rule;

	if (list_empty(head))
		return 0;

	/* Only one rule present: there is no second entry to derive from. */
	second = head->next->next;
	if (second == head)
		return 0;

	second_rule = list_entry(second, struct fib_rule, list);
	if (!second_rule->pref)
		return 0;

	return second_rule->pref - 1;
}
IPv4策略路由配置回调: fib4_rule_configure()
该回调根据消息首部frh和属性数组tb,设置IPv4协议特有的策略路由规则字段。
/*
 * fib4_rule_configure - fill in the IPv4-specific part of a new rule.
 * @rule: rule being created; it is accessed as a struct fib4_rule
 * @skb:  netlink request buffer (its socket selects the network namespace)
 * @nlh:  netlink message header (not used here)
 * @frh:  fixed rule header from the request
 * @tb:   parsed FRA_* attributes
 *
 * Returns 0 on success, -EINVAL when frh->tos has bits set outside
 * IPTOS_TOS_MASK, or -ENOBUFS when no empty routing table could be
 * obtained for a table-lookup rule that named no table.
 */
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct nlmsghdr *nlh, struct fib_rule_hdr *frh,
			       struct nlattr **tb)
{
	struct net *net = sock_net(skb->sk);
	struct fib4_rule *r4 = (struct fib4_rule *) rule;

	if (frh->tos & ~IPTOS_TOS_MASK)
		return -EINVAL;

	/*
	 * A table-lookup rule without a table id gets a table from
	 * fib_empty_table(), and that table's id is stored in the rule.
	 */
	if (rule->table == RT_TABLE_UNSPEC && rule->action == FR_ACT_TO_TBL) {
		struct fib_table *tbl = fib_empty_table(net);

		if (tbl == NULL)
			return -ENOBUFS;

		rule->table = tbl->tb_id;
	}

	/* Source selector: the address attribute is read only if a length was given. */
	if (frh->src_len)
		r4->src = nla_get_be32(tb[FRA_SRC]);
	r4->src_len = frh->src_len;
	r4->srcmask = inet_make_mask(r4->src_len);

	/* Destination selector, handled the same way. */
	if (frh->dst_len)
		r4->dst = nla_get_be32(tb[FRA_DST]);
	r4->dst_len = frh->dst_len;
	r4->dstmask = inet_make_mask(r4->dst_len);

#ifdef CONFIG_NET_CLS_ROUTE
	if (tb[FRA_FLOW])
		r4->tclassid = nla_get_u32(tb[FRA_FLOW]);
#endif

	r4->tos = frh->tos;

	return 0;
}
策略路由的删除: fib_nl_delrule()
static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
{
struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rules_ops *ops = NULL;
struct fib_rule *rule, *tmp;
struct nlattr *tb[FRA_MAX+1];
int err = -EINVAL;
// 前面几步和fib_nl_newrule()完全相同
if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
goto errout;
ops = lookup_rules_ops(net, frh->family);
if (ops == NULL) {
err = -EAFNOSUPPORT;
goto errout;
}
err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy);
if (err < 0)
goto errout;
err = validate_rulemsg(frh, tb, ops);
if (err < 0)
goto errout;
// 遍历策略路由规则链表,寻找要删除的策略路由规则
list_for_each_entry(rule, &ops->rules_list, list) {
// 如果删除条件中指定了action,则action要相等
if (frh->action && (frh->action != rule->action))
continue;
// 如果删除条件中指定了table,则table要相同
if (frh->table && (frh_get_table(frh, tb) != rule->table))
continue;
// 如果删除条件中指定了优先级,那么优先级要相等
if (tb[FRA_PRIORITY] && (rule->pref != nla_get_u32(tb[FRA_PRIORITY])))
continue;
// 如果删除条件中指定了网络设备对象,那么要相同
if (tb[FRA_IFNAME] && nla_strcmp(tb[FRA_IFNAME], rule->ifname))
continue;
// 如果删除条件中指定了fwmark,那么fwmark要相同
if (tb[FRA_FWMARK] && (rule->mark != nla_get_u32(tb[FRA_FWMARK])))
continue;
if (tb[FRA_FWMASK] && (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK])))
continue;
// 调用协议族的compare()回调,比较协议族自己的扩展字段是否相同
if (!ops->compare(rule, frh, tb))
continue;
// 不可删除路由不可以被删除
if (rule->flags & FIB_RULE_PERMANENT) {
err = -EPERM;
goto errout;
}
// 到此,策略路由规则匹配了,将该规则从链表中摘除
list_del_rcu(&rule->list);
// 递减goto类型的路由规则计数器
if (rule->action == FR_ACT_GOTO)
ops->nr_goto_rules--;
/*
* Check if this rule is a target to any of them. If so,
* disable them. As this operation is eventually very
* expensive, it is only performed if goto rules have
* actually been added.
*/
// 该规则有可能是其它规则的goto target,处理这些指向关系和计数器
if (ops->nr_goto_rules > 0) {
list_for_each_entry(tmp, &ops->rules_list, list) {
if (tmp->ctarget == rule) {
rcu_assign_pointer(tmp->ctarget, NULL);
ops->unresolved_rules++;
}
}
}
synchronize_rcu();
// 向用户态通知策略路由规则删除事件
notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
fib_rule_put(rule);
// 刷新路由缓存
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
}
err = -ENOENT;
errout:
rules_ops_put(ops);
return err;
}
IPv4策略路由规则比较回调: fib4_rule_compare()
就是逐个字段进行比较。
/*
 * fib4_rule_compare - test whether an IPv4 rule matches a request.
 * @rule: existing rule; it is accessed as a struct fib4_rule
 * @frh:  fixed rule header from the request
 * @tb:   parsed FRA_* attributes
 *
 * Only selectors present in the request are compared. A zero src_len,
 * dst_len or tos, or a missing FRA_FLOW attribute, matches anything.
 *
 * Returns 1 when every given selector matches the rule, 0 otherwise.
 */
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	struct fib4_rule *r4 = (struct fib4_rule *) rule;

	/* Source selector: both prefix length and address must agree. */
	if (frh->src_len) {
		if (r4->src_len != frh->src_len)
			return 0;
		if (r4->src != nla_get_be32(tb[FRA_SRC]))
			return 0;
	}

	/* Destination selector: both prefix length and address must agree. */
	if (frh->dst_len) {
		if (r4->dst_len != frh->dst_len)
			return 0;
		if (r4->dst != nla_get_be32(tb[FRA_DST]))
			return 0;
	}

	if (frh->tos && (r4->tos != frh->tos))
		return 0;

#ifdef CONFIG_NET_CLS_ROUTE
	if (tb[FRA_FLOW] && (r4->tclassid != nla_get_u32(tb[FRA_FLOW])))
		return 0;
#endif

	return 1;
}