先理清几个概念
/proc/sys/net/ipv4/ip_forward
等价于下面的 /proc/sys/net/ipv4/conf/all/forwarding
/proc/sys/net/ipv4/conf/xx/forwarding
针对指定的dev xx进行设置
/proc/sys/net/ipv4/conf/all/forwarding
会对当前所有 device 进行设置;当然,不同的属性下 all 的处理逻辑并不一样,需要逐个属性去看
/proc/sys/net/ipv4/conf/default/forwarding
新创建的 dev(例如重启后重新创建的设备)会根据 default 下的配置进行初始化;对除 forwarding 之外的其他属性,
default 也是相同的作用
配置下发
/*
 * Template sysctl table for the "ip_forward" entry.
 *
 * The single real entry exposes the FORWARDING slot of the global
 * ipv4_devconf as an int (mode 0644) and routes reads/writes through
 * devinet_sysctl_forward().  extra1/extra2 carry the devconf and the
 * network namespace that the handler works on.
 */
static struct ctl_table ctl_forward_entry[] = {
	{
		.procname	= "ip_forward",
		.data		= &ipv4_devconf.data[IPV4_DEVCONF_FORWARDING - 1],
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= devinet_sysctl_forward,
		.extra1		= &ipv4_devconf,
		.extra2		= &init_net,
	},
	{ },	/* empty terminating entry */
};
/*
 * sysctl handler for the IPv4 "forwarding" knobs.
 *
 * proc_dointvec() performs the actual read/write of the integer behind
 * ctl->data.  When a write changed the value, the change is acted upon
 * depending on which slot ctl->data points at:
 *   - the "default" slot: only a netconf notification is sent;
 *   - the "all" slot:     inet_forward_change() is called for the namespace;
 *   - anything else:      treated as a per-device slot, ctl->extra1 being
 *                         the ipv4_devconf embedded in that device's
 *                         struct in_device.
 *
 * ctl->extra2 is the network namespace the entry belongs to.
 * Returns the result of proc_dointvec(), or that of restart_syscall()
 * when the RTNL lock could not be taken.
 */
static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
				  void __user *buffer,
				  size_t *lenp, loff_t *ppos)
{
	int *valp = ctl->data;
	int old_val = *valp;
	loff_t old_pos = *ppos;
	int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	struct net *net;

	/* Reads, and writes that leave the value unchanged, need no follow-up. */
	if (!write || *valp == old_val)
		return ret;

	net = ctl->extra2;

	/* "default" slot: no device is touched, just announce the new value. */
	if (valp == &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
		inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
					    NETCONFA_FORWARDING,
					    NETCONFA_IFINDEX_DEFAULT,
					    net->ipv4.devconf_dflt);
		return ret;
	}

	/*
	 * "all" (which is also what ip_forward points at) or a single device:
	 * both paths run under RTNL.  If the lock is contended, undo the write
	 * and the file position so the restarted syscall starts from scratch.
	 */
	if (!rtnl_trylock()) {
		/* Restore the original values before restarting */
		*valp = old_val;
		*ppos = old_pos;
		return restart_syscall();
	}

	if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
		/* Namespace-wide switch; the per-device work is done in there. */
		inet_forward_change(net);
	} else {
		/*
		 * Per-device entry: extra1 is the devconf embedded in the
		 * device's in_device, so container_of() recovers the device.
		 */
		struct ipv4_devconf *cnf = ctl->extra1;
		struct in_device *idev = container_of(cnf, struct in_device, cnf);

		/* LRO is switched off on the device when forwarding is enabled. */
		if (*valp)
			dev_disable_lro(idev->dev);
		inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
					    NETCONFA_FORWARDING,
					    idev->dev->ifindex,
					    cnf);
	}

	rtnl_unlock();
	rt_cache_flush(net);

	return ret;
}
/proc/sys/net/ipv4/ip_forward
等于 /proc/sys/net/ipv4/conf/all/forwarding
的原因是 tbl[0] 的 data 指向了 all 的 FORWARDING 项(extra1 也指向 all),因此处理函数中 valp == &IPV4_DEVCONF_ALL(net, FORWARDING) 成立,走的是 all 的分支
/*
 * Per-network-namespace initialisation of the IPv4 devconf sysctls
 * (excerpt: the "..." lines mark code left out of this note, including the
 * error labels targeted by the gotos).
 *
 * The visible part duplicates the ctl_forward_entry template, points the
 * copy at the "all" devconf and at this namespace, registers it under
 * "net/ipv4" and records the results in net->ipv4.  Returns 0 on success.
 */
static __net_init int devinet_init_net(struct net *net)
{
int err;
struct ipv4_devconf *all, *dflt;
struct ctl_table *tbl = ctl_forward_entry;
struct ctl_table_header *forw_hdr;
err = -ENOMEM;
all = &ipv4_devconf;
dflt = &ipv4_devconf_dflt;
/* Work on a private copy so the static template itself is not modified. */
tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
if (!tbl)
goto err_alloc_ctl;
...
/*
 * Point the "ip_forward" entry at the FORWARDING slot of "all": the handler
 * compares ctl->data against that slot, so writing ip_forward takes the
 * same path as writing conf/all/forwarding.
 */
tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
tbl[0].extra1 = all;
tbl[0].extra2 = net;
...
err = -ENOMEM;
/* With procname "ip_forward" this becomes net/ipv4/ip_forward. */
forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
if (!forw_hdr)
goto err_reg_ctl;
/* Remember the header and the all/default devconfs in the namespace. */
net->ipv4.forw_hdr = forw_hdr;
net->ipv4.devconf_all = all;
net->ipv4.devconf_dflt = dflt;
return 0;
...
}
转发流程
至此,配置下发完了,来看看如何影响转发流程的
/*
 * Excerpt of the input route lookup, trimmed to the branches relevant to
 * forwarding.  Local declarations, the jump labels and the remainder of the
 * function are not shown in this note ("..." marks omitted code).
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev,
struct fib_result *res)
{
err = fib_lookup(net, &fl4, res, 0);
...
/* Broadcast route: continue to make_route only if IN_DEV_BFORWARD is set. */
if (res->type == RTN_BROADCAST) {
if (IN_DEV_BFORWARD(in_dev))
goto make_route;
goto brd_input;
}
// Local route hit: the packet is addressed to this host, so validate the source and jump out to local_input
if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
0, dev, in_dev, &itag);
if (err < 0)
goto martian_source;
goto local_input;
}
// Reaching here means the route is not a local one: check whether forwarding is enabled on in_dev
if (!IN_DEV_FORWARD(in_dev)) {
err = -EHOSTUNREACH;
goto no_route;
}
相应的错误计数(in_no_route 字段),可以在 /proc/net/stat/rt_cache 中获得
再来看看IN_DEV_FORWARD
宏是怎么做的
/* Forwarding setting of an in_device: reads its FORWARDING devconf entry. */
#define IN_DEV_FORWARD(in_dev)	IN_DEV_CONF_GET((in_dev), FORWARDING)

/* Pastes attr onto IPV4_DEVCONF_ to form the index for ipv4_devconf_get(). */
#define IN_DEV_CONF_GET(in_dev, attr) \
	ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr)

/*
 * Return the per-device configuration value selected by index.
 * The index is 1-based: it is shifted down by one before being used as a
 * subscript into in_dev->cnf.data[].
 */
static inline int ipv4_devconf_get(struct in_device *in_dev, int index)
{
	return in_dev->cnf.data[index - 1];
}
实际就是 配置下发时的值。
以 IPVS 举例:IPVS 的请求包必然要走 IPVS 所在机器的 LOCAL_IN,所以无需开启 ip_forward 参数;在 DR 或者 tunnel 模式下,响应包不需要返回到 IPVS 所在机器,自然也不需要 ip_forward 参数。
针对 NAT 模式,如果没做 SNAT,响应包总是以 src: RS_IP -> dst: CLI_IP 到达 IPVS 所在机器;由于 CLI_IP 不是 IPVS 所在机器的 IP,就会走 ip_forward 转发路径。IPVS 在 ip_forward 路径中挂了钩子,负责把 RS_IP 替换为 VIP,所以在 NAT 模式中,开启 ip_forward 是必要的。