fwmark 是一个用于标识数据包的标记,它可以用来标识不同的数据流。它可以用来指定某些特定的数据包通过特定的路由表,或者用于指定某些特定的数据包应该使用特定的策略,例如指定某些数据包应该使用特定的 QoS 策略。
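带掩码的 fwmark(写作 mark/mask)通常被用来指定某些特定的数据包应该使用特定的策略,而不是所有的数据包都使用特定的策略。在这种情况下,掩码用于指定 mark 中哪些比特位参与比较:只有掩码中为 1 的位才会被比较,其余位被忽略,即匹配条件为 (数据包mark & mask) == mark。例如 0x400/0x400 只检查 0x400 这一位是否置位,而不关心其它位的取值。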
1 iptables
1.1 MARK
MARK 是 iptables 的一个目标,用于给数据包打上指定mark
--set-mark value 设置数据包的mark值。
--and-mark value 数据包的mark值和value进行按位与运算。
--or-mark value 数据包的mark值和value进行按位或运算。
//打标记
iptables -t mangle -A PREROUTING -j MARK --set-mark 33
//or-mark 打标记
iptables -t mangle -A PREROUTING -j MARK --or-mark 0x400
//匹配标记
iptables -t nat -A PREROUTING -m mark --mark 33 -j ACCEPT
//掩码匹配标记
iptables -t nat -A PREROUTING -m mark --mark 0x400/0x400 -j ACCEPT
1.2 CONNMARK
CONNMARK 是 iptables 的一个目标,用于给一个网络连接标记一个连接 mark,也就是网络连接的整个过程的交互包都被该连接mark标记。
--set-mark value[/mask] 给链接跟踪记录打标记。
--save-mark [--mask mask] 将数据包上的mark设置到连接上。
--restore-mark [--mask mask] 将连接mark设置到数据包中
//打标记
iptables -t mangle -A PREROUTING -j CONNMARK --set-mark 4
//save
iptables -t mangle -A PREROUTING -j CONNMARK --save-mark
//restore
iptables -t mangle -A PREROUTING -j CONNMARK --restore-mark
1.3 kernel match code(xt_mark and xt_connmark)
1.3.1 iptables 钩子函数具体执行
接之前 iptables 四表在每个钩子点的具体执行函数
//kernel/msm-5.4/net/ipv4/netfilter/ip_tables.c
/*
 * Excerpt of the IPv4 table traversal routine. Parts are elided with "//...",
 * so locals such as hook, ip, acpar, verdict, table_base, private, indev,
 * outdev and addend are declared/initialised in code not shown here.
 *
 * Visible flow: start at the first rule for the current hook; for each rule
 * check the IP-header part, then every extension match, then run the target.
 * Returns NF_DROP if acpar.hotdrop was set, otherwise the last verdict.
 */
unsigned int
ipt_do_table(struct sk_buff *skb,
const struct nf_hook_state *state,
struct xt_table *table)
{
//...
/* Locate the first rule of this hook via private->hook_entry[hook]. */
e = get_entry(table_base, private->hook_entry[hook]);
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
struct xt_counters *counter;
WARN_ON(!e);
/* Generic IP-header check; on failure move on to the next rule. */
if (!ip_packet_match(ip, indev, outdev,
&e->ip, acpar.fragoff)) {
no_match:
e = ipt_next_entry(e);
continue;
}
/*
 * Run each extension match attached to the rule (e.g. mark_mt from
 * xt_mark.c); a single false result skips the whole rule.
 */
xt_ematch_foreach(ematch, e) {
acpar.match = ematch->u.kernel.match;
acpar.matchinfo = ematch->data;
if (!acpar.match->match(skb, &acpar))
goto no_match;
}
t = ipt_get_target_c(e);
WARN_ON(!t->u.kernel.target);
/* Standard target? */
//...
/* Extension target: hand the packet to the target's callback. */
acpar.target = t->u.kernel.target;
acpar.targinfo = t->data;
verdict = t->u.kernel.target->target(skb, &acpar);
if (verdict == XT_CONTINUE) {
/* Target might have changed stuff. */
ip = ip_hdr(skb);
e = ipt_next_entry(e);
} else {
/* Verdict */
break;
}
} while (!acpar.hotdrop);
xt_write_recseq_end(addend);
local_bh_enable();
/* hotdrop overrides whatever verdict was computed. */
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
}
假设现在在 PREROUTING 点去执行 nat 表的操作,其实就是去调用 nat 表的钩子函数,一直向下调用,其实也就是执行上述函数。
/* Furniture shopping... */
/*
 * Descriptor of one table. The rules themselves are reached through
 * `private` (a struct xt_table_info).
 */
struct xt_table {
struct list_head list;
/* What hooks you will enter on */
unsigned int valid_hooks;
/* Man behind the curtain... */
struct xt_table_info *private;
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
u_int8_t af; /* address/protocol family */
int priority; /* hook order */
/* called when table is needed in the given netns */
int (*table_init)(struct net *net);
/* A unique name... */
const char name[XT_TABLE_MAXNAMELEN];
};
/* The table itself */
/*
 * Per-table rule storage. hook_entry[] holds one value per netfilter hook;
 * ipt_do_table() passes hook_entry[hook] to get_entry() to find the first
 * rule for that hook.
 */
struct xt_table_info {
/* Size per table */
unsigned int size;
/* Number of entries: FIXME. --RR */
unsigned int number;
/* Initial number of entries. Needed for module usage count */
unsigned int initial_entries;
/* Entry points and underflows */
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
/*
 * Number of user chains. Since tables cannot have loops, at most
 * @stacksize jumps (number of user chains) can possibly be made.
 */
unsigned int stacksize;
void ***jumpstack;
/* Trailing variable-length area; per the notes below, the rules live here. */
unsigned char entries[0] __aligned(8);
};
每个表都有对应的 xt_table 注册到了网络命名空间中(struct net),xt_table 有一个 xt_table_info 结构体成员 private,相关的规则也是添加到 private 的 entries 字段中。
/* This structure defines each of the firewall rules. Consists of 3
parts which are 1) general IP header stuff 2) match specific
stuff 3) the target to perform if the rule matches */
struct ipt_entry {
/* Part 1: generic IP-header criteria (checked by ip_packet_match()). */
struct ipt_ip ip;
/* Mark with fields that we care about. */
unsigned int nfcache;
/* Size of ipt_entry + matches */
__u16 target_offset;
/* Size of ipt_entry + matches + target */
__u16 next_offset;
/* Back pointer */
unsigned int comefrom;
/* Packet and byte counters. */
struct xt_counters counters;
/* The matches (if any), then the target. */
unsigned char elems[0];
};
ipt_do_table 函数的具体执行流程如下:
根据特定的钩子点作为偏移,从 private 中直接定位到特定钩子点的第一个 entry(struct ipt_entry),以此为起点开始遍历。
首先判断数据包是否符合被处理的条件,例如判断 in-dev 和 out-dev 等。
接着从 entry 中首先遍历获取到 matches(struct xt_entry_match),其中最为关键的成员就是 match(struct xt_match)。xt_match 处理完以后再从 entry 中获取 xt_entry_target(如果有的话),其中最为关键的成员是 target(struct xt_target),再通过 xt_target 来处理数据包。最后将处理结果返回。
/*
 * Header of one match inside a rule. Excerpt: the "//user" line marks an
 * elided union member (presumably the user-space view of the same header).
 */
struct xt_entry_match {
union {
//user
struct {
__u16 match_size;
/* Used inside the kernel */
struct xt_match *match;
} kernel;
/* Total length */
__u16 match_size;
} u;
/* Match-specific data; ipt_do_table() passes it on as acpar.matchinfo. */
unsigned char data[0];
};
/*
 * Header of the target inside a rule. Excerpt: one union member is elided
 * ("//...").
 */
struct xt_entry_target {
union {
//...
struct {
__u16 target_size;
/* Used inside the kernel */
struct xt_target *target;
} kernel;
/* Total length */
__u16 target_size;
} u;
/* Target-specific data; ipt_do_table() passes it on as acpar.targinfo. */
unsigned char data[0];
};
接下来重点分析 xt_match 和 xt_target。
Netfilter 一个很重要的思想就是将 netfilter 作为一个协议无关的框架,表现在内核源码树中单独建立 net/netfilter 目录,在 net/netfilter 下的匹配和目标模块文件名称以“xt_”开头。为了和 iptables 兼容,这些文件中使用了宏 MODULE_ALIAS 来声明模块的别名。所有扩展程序的名称也是以 xt 开头。所以 mark 和 connmark 也必然会出现在 net/netfilter 目录。
1.3.2 MARK 和 CONNMARK
接下来主要看 xt_mark 和 xt_connmark 这两个文件。
//kernel/msm-5.4/net/netfilter/xt_mark.c
/*
 * Registration descriptor for the "MARK" target (revision 2): the callback
 * is mark_tg() and the per-rule data is a struct xt_mark_tginfo2.
 */
static struct xt_target mark_tg_reg __read_mostly = {
.name = "MARK",
.revision = 2,
.family = NFPROTO_UNSPEC,
.target = mark_tg,
.targetsize = sizeof(struct xt_mark_tginfo2),
.me = THIS_MODULE,
};
/*
 * Registration descriptor for the "mark" match (revision 1): the callback
 * is mark_mt() and the per-rule data is a struct xt_mark_mtinfo1.
 */
static struct xt_match mark_mt_reg __read_mostly = {
.name = "mark",
.revision = 1,
.family = NFPROTO_UNSPEC,
.match = mark_mt,
.matchsize = sizeof(struct xt_mark_mtinfo1),
.me = THIS_MODULE,
};
/*
 * MARK target: rewrite skb->mark from the rule's (mark, mask) pair.
 * Always returns XT_CONTINUE, so ipt_do_table() proceeds to the next rule.
 */
static unsigned int
mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_mark_tginfo2 *info = par->targinfo;
/* Clear the bits selected by mask, then XOR the rule's mark into the result. */
skb->mark = (skb->mark & ~info->mask) ^ info->mark;
return XT_CONTINUE;
}
/*
 * mark match: compare only the bits of skb->mark selected by info->mask
 * against info->mark; the result is XOR-ed with info->invert.
 */
static bool
mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_mark_mtinfo1 *info = par->matchinfo;
return ((skb->mark & info->mask) == info->mark) ^ info->invert;
}
mark_tg 是为数据包加上mark。
mark_mt 是根据 mark 来进行匹配,匹配成功返回 true,否则返回false。
//kernel/msm-5.4/net/netfilter/xt_connmark.c
/*
 * Registration descriptors for the "CONNMARK" target, one per revision:
 * revision 1 uses connmark_tg() with struct xt_connmark_tginfo1,
 * revision 2 uses connmark_tg_v2() with struct xt_connmark_tginfo2.
 * Both share the same checkentry/destroy callbacks.
 */
static struct xt_target connmark_tg_reg[] __read_mostly = {
{
.name = "CONNMARK",
.revision = 1,
.family = NFPROTO_UNSPEC,
.checkentry = connmark_tg_check,
.target = connmark_tg,
.targetsize = sizeof(struct xt_connmark_tginfo1),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,
},
{
.name = "CONNMARK",
.revision = 2,
.family = NFPROTO_UNSPEC,
.checkentry = connmark_tg_check,
.target = connmark_tg_v2,
.targetsize = sizeof(struct xt_connmark_tginfo2),
.destroy = connmark_tg_destroy,
.me = THIS_MODULE,
}
};
/*
 * Registration descriptor for the "connmark" match (revision 1): the
 * callback is connmark_mt() and the per-rule data is a
 * struct xt_connmark_mtinfo1.
 */
static struct xt_match connmark_mt_reg __read_mostly = {
.name = "connmark",
.revision = 1,
.family = NFPROTO_UNSPEC,
.checkentry = connmark_mt_check,
.match = connmark_mt,
.matchsize = sizeof(struct xt_connmark_mtinfo1),
.destroy = connmark_mt_destroy,
.me = THIS_MODULE,
};
/*
 * Common worker for both CONNMARK target revisions.
 *
 * Looks up the conntrack entry of the packet and, depending on info->mode,
 * sets the connection mark, copies the packet mark to the connection, or
 * copies the connection mark to the packet. The value is shifted right or
 * left by info->shift_bits according to info->shift_dir.
 *
 * Always returns XT_CONTINUE; a packet with no conntrack entry is left
 * untouched.
 */
static unsigned int
connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
{
enum ip_conntrack_info ctinfo;
u_int32_t new_targetmark;
struct nf_conn *ct;
u_int32_t newmark;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
return XT_CONTINUE;
switch (info->mode) {
case XT_CONNMARK_SET:
/* ct->mark: clear the ctmask bits, XOR in ctmark, then shift the result. */
newmark = (ct->mark & ~info->ctmask) ^ info->ctmark;
if (info->shift_dir == D_SHIFT_RIGHT)
newmark >>= info->shift_bits;
else
newmark <<= info->shift_bits;
/* Write and raise the IPCT_MARK event only when the value changes. */
if (ct->mark != newmark) {
ct->mark = newmark;
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_SAVE:
/* Packet -> connection: take the nfmask bits of skb->mark, shift them ... */
new_targetmark = (skb->mark & info->nfmask);
if (info->shift_dir == D_SHIFT_RIGHT)
new_targetmark >>= info->shift_bits;
else
new_targetmark <<= info->shift_bits;
/* ... and XOR them into ct->mark after clearing its ctmask bits. */
newmark = (ct->mark & ~info->ctmask) ^
new_targetmark;
if (ct->mark != newmark) {
ct->mark = newmark;
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
case XT_CONNMARK_RESTORE:
/* Connection -> packet: take the ctmask bits of ct->mark, shift them ... */
new_targetmark = (ct->mark & info->ctmask);
if (info->shift_dir == D_SHIFT_RIGHT)
new_targetmark >>= info->shift_bits;
else
new_targetmark <<= info->shift_bits;
/* ... and XOR them into skb->mark after clearing its nfmask bits. */
newmark = (skb->mark & ~info->nfmask) ^
new_targetmark;
skb->mark = newmark;
break;
}
return XT_CONTINUE;
}
/*
 * Revision-1 CONNMARK target: convert the tginfo1 rule data into a tginfo2
 * and delegate to connmark_tg_shift().
 */
static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_connmark_tginfo1 *info_unused_doc_placeholder);
/*
 * Revision-2 CONNMARK target: the rule data is already a tginfo2, so it is
 * passed straight to connmark_tg_shift().
 */
static unsigned int
connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_connmark_tginfo2 *info = par->targinfo;
return connmark_tg_shift(skb, info);
}
/*
 * connmark match: like mark_mt(), but compares the mark stored on the
 * packet's conntrack entry (ct->mark) instead of skb->mark.
 * Returns false when the packet has no conntrack entry; in that case
 * info->invert is not applied.
 */
static bool
connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_connmark_mtinfo1 *info = par->matchinfo;
enum ip_conntrack_info ctinfo;
const struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
if (ct == NULL)
return false;
return ((ct->mark & info->mask) == info->mark) ^ info->invert;
}
connmark_mt 跟 mark_mt 大致差不多,但 connmark_mt 是根据 nf_conn 的 mark 来进行匹配,匹配成功返回 true,否则返回false。
connmark_tg 的处理则是分了三种情况,XT_CONNMARK_SET,XT_CONNMARK_SAVE,XT_CONNMARK_RESTORE。SET 是给连接打 mark,SAVE 是将数据包的 mark 配置到连接中,RESTORE 是将连接 mark 配置到数据包中。
1.4 Android set iptables
http://aospxref.com/android-12.0.0_r3/xref/system/netd/server/Controllers.cpp#278
2 策略路由
2.1 ip rule
anakin:/ # ip rule list
0: from all lookup local
200: from all lookup main
10000: from all fwmark 0xc0000/0xd0000 lookup legacy_system
11000: from all iif lo oif dummy0 uidrange 0-0 lookup dummy0
11000: from all iif lo oif eth0.40 uidrange 0-0 lookup eth0.40
11000: from all iif lo oif eth0.50 uidrange 0-0 lookup eth0.50
11000: from all iif lo oif eth0.51 uidrange 0-0 lookup eth0.51
15010: from all fwmark 0x10064/0x1ffff iif lo uidrange 10049-10049 lookup eth0.40
16000: from all fwmark 0x10063/0x1ffff iif lo lookup local_network
16000: from all fwmark 0x10064/0x1ffff iif lo lookup eth0.40
16000: from all fwmark 0x10065/0x1ffff iif lo lookup eth0.50
16000: from all fwmark 0x10066/0x1ffff iif lo lookup eth0.51
17000: from all iif lo oif dummy0 lookup dummy0
17000: from all iif lo oif eth0.40 lookup eth0.40
17000: from all iif lo oif eth0.50 lookup eth0.50
17000: from all iif lo oif eth0.51 lookup eth0.51
18000: from all fwmark 0x0/0x10000 lookup legacy_system
19000: from all fwmark 0x0/0x10000 lookup legacy_network
20000: from all fwmark 0x0/0x10000 lookup local_network
22010: from all fwmark 0x64/0x1ffff iif lo uidrange 10049-10049 lookup eth0.40
23000: from all fwmark 0x64/0x1ffff iif lo lookup eth0.40
23000: from all fwmark 0x65/0x1ffff iif lo lookup eth0.50
23000: from all fwmark 0x66/0x1ffff iif lo lookup eth0.51
27010: from all fwmark 0x0/0xffff iif lo uidrange 10049-10049 lookup eth0.40
29000: from all fwmark 0x0/0xffff iif lo lookup eth0.50
32000: from all unreachable
2.3 kernel code
in : ip_rcv_finish -> ip_rcv_finish_core -> ip_route_input_noref -> … -> fib_lookup -> fib_rules_lookup
out : ip_queue_xmit -> __ip_queue_xmit -> ip_route_output_ports -> … -> fib_lookup -> fib_rules_lookup
//kernel/msm-5.4/net/core/fib_rules.c
/*
 * Excerpt of the policy-routing rule walk (the action handling, unlock and
 * return paths are elided with "//...").
 * Visible part: under rcu_read_lock(), iterate ops->rules_list and skip
 * every rule for which fib_rule_match() returns 0.
 */
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
int flags, struct fib_lookup_arg *arg)
{
struct fib_rule *rule;
int err;
rcu_read_lock();
list_for_each_entry_rcu(rule, &ops->rules_list, list) {
/* Label used by code in the elided part. */
jumped:
if (!fib_rule_match(rule, ops, fl, flags, arg))
continue;
//...
}
//...
}
/*
 * Decide whether flow `fl` matches `rule`.
 *
 * Each generic selector (iif, oif, fwmark, tunnel id, l3mdev, uid range) is
 * checked in turn; the first mismatch jumps to `out` with ret still 0.
 * If all pass, the family-specific ops->match() gives the result.
 * The final result is negated when the rule has FIB_RULE_INVERT set.
 */
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
{
int ret = 0;
/* iifindex/oifindex of 0 means the rule does not constrain that interface. */
if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
goto out;
if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
goto out;
/*
 * fwmark check: XOR exposes the differing bits, mark_mask keeps only the
 * bits the rule cares about. Non-zero means a mismatch. With
 * mark_mask == 0 this test never rejects.
 */
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
goto out;
if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
goto out;
if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
goto out;
/* The flow's uid must lie within [uid_range.start, uid_range.end]. */
if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
uid_gt(fl->flowi_uid, rule->uid_range.end))
goto out;
ret = ops->match(rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}
2.4 Android 添加 ip rules
http://aospxref.com/android-12.0.0_r3/xref/system/netd/server/RouteController.cpp#1150
http://aospxref.com/android-12.0.0_r3/xref/system/netd/server/RouteController.cpp#1188
2.5 Android 对网络流量进行mark
https://blog.csdn.net/woai110120130/article/details/119479392
http://aospxref.com/android-12.0.0_r3/xref/system/netd/server/FwmarkServer.cpp#101
// Excerpt of the netd fwmark request handler (parts are elided with "//...",
// so `command` and `permission` are set up in code not shown here, and the
// final return is not visible).
//
// Visible flow: read the socket's current mark with getsockopt(SO_MARK),
// adjust fields of the Fwmark according to command.cmdId, store the
// permission bits, then write the mark back with setsockopt(SO_MARK).
// Returns -errno if either socket call fails.
int FwmarkServer::processClient(SocketClient* client, int* socketFd) {
//...
Fwmark fwmark;
socklen_t fwmarkLen = sizeof(fwmark.intValue);
// Start from the mark currently set on the client's socket.
if (getsockopt(*socketFd, SOL_SOCKET, SO_MARK, &fwmark.intValue, &fwmarkLen) == -1) {
return -errno;
}
switch (command.cmdId) {
case FwmarkCommand::ON_ACCEPT: {
// Called after a socket accept(). The kernel would've marked the NetId and necessary
// permissions bits, so we just add the rest of the user's permissions here.
permission = static_cast<Permission>(permission | fwmark.permission);
break;
}
case FwmarkCommand::ON_CONNECT: {
// Called before a socket connect() happens. Set an appropriate NetId into the fwmark so
// that the socket routes consistently over that network. Do this even if the socket
// already has a NetId, so that calling connect() multiple times still works.
//
// But if the explicit bit was set, the existing NetId was explicitly preferred (and not
// a case of connect() being called multiple times). Don't reset the NetId in that case.
//
// An "appropriate" NetId is the NetId of a bypassable VPN that applies to the user, or
// failing that, the default network. We'll never set the NetId of a secure VPN here.
// See the comments in the implementation of getNetworkForConnect() for more details.
//
// If the protect bit is set, this could be either a system proxy (e.g.: the dns proxy
// or the download manager) acting on behalf of another user, or a VPN provider. If it's
// a proxy, we shouldn't reset the NetId. If it's a VPN provider, we should set the
// default network's NetId.
//
// There's no easy way to tell the difference between a proxy and a VPN app. We can't
// use PERMISSION_SYSTEM to identify the proxy because a VPN app may also have those
// permissions. So we use the following heuristic:
//
// If it's a proxy, but the existing NetId is not a VPN, that means the user (that the
// proxy is acting on behalf of) is not subject to a VPN, so the proxy must have picked
// the default network's NetId. So, it's okay to replace that with the current default
// network's NetId (which in all likelihood is the same).
//
// Conversely, if it's a VPN provider, the existing NetId cannot be a VPN. The only time
// we set a VPN's NetId into a socket without setting the explicit bit is here, in
// ON_CONNECT, but we won't do that if the socket has the protect bit set. If the VPN
// provider connect()ed (and got the VPN NetId set) and then called protect(), we
// would've unset the NetId in PROTECT_FROM_VPN below.
//
// So, overall (when the explicit bit is not set but the protect bit is set), if the
// existing NetId is a VPN, don't reset it. Else, set the default network's NetId.
if (!fwmark.explicitlySelected) {
if (!fwmark.protectedFromVpn) {
fwmark.netId = mNetworkController->getNetworkForConnect(client->getUid());
} else if (!mNetworkController->isVirtualNetwork(fwmark.netId)) {
fwmark.netId = mNetworkController->getDefaultNetwork();
}
}
break;
}
//...
}
// Store the (possibly widened) permission bits and write the mark back to the socket.
fwmark.permission = permission;
if (setsockopt(*socketFd, SOL_SOCKET, SO_MARK, &fwmark.intValue,
sizeof(fwmark.intValue)) == -1) {
return -errno;
}
}
通过 setsockopt 设置mark。