执行完钩子函数后,IP数据包被传送到ip_rcv_finish做进一步处理,这个函数主要功能是做路由选择(kernel/net/ipv4/ip_input.c)
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
else if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
我们现看(kernel/include/net/route.h)ip_route_input_noref()--->(kernel/net/route.c)ip_route_input_common()
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, bool noref)
{
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
struct net *net;
int res;
net = dev_net(dev);
rcu_read_lock();
if (!rt_caching(net))
goto skip_cache;
tos &= IPTOS_RT_MASK;
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
//给进来的包skb查找路由,现在缓冲区的路由hash表中查找,如果没找到,在调用ip_route_input_slow()到路由表中查找。
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->dst.rt_next)) {
if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
(rth->rt_route_iif ^ iif) |
(rth->rt_key_tos ^ tos)) == 0 &&
rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
ipv4_validate_peer(rth);
.....
}
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
rcu_read_unlock();
return res;
}
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flowi4 fl4;
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
struct net * net = dev_net(dev);
....
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
if (res.type != RTN_UNICAST)
goto martian_destination;
err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);//不是发给本机数据,这里处理路由转发
out: return err;
......
local_input:
rth = rt_dst_alloc(net->loopback_dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.input= ip_local_deliver;//这里表示这个数据包是发给本机的,可以传送到传输层
rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
....
}
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
struct rtable* rth = NULL;
int err;
unsigned hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
fib_select_multipath(res);
#endif
/* create a routing cache entry */
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
if (err)
return err;
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
rt_genid(dev_net(rth->dst.dev)));
rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
if (IS_ERR(rth))
return PTR_ERR(rth);
return 0;
}
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
{
struct rtable *rth;
int err;
struct in_device *out_dev;
unsigned int flags = 0;
__be32 spec_dst;
u32 itag;
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
if (net_ratelimit())
pr_crit("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
}
err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
in_dev->dev, &spec_dst, &itag);
if (err < 0) {
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
goto cleanup;
}
if (err)
flags |= RTCF_DIRECTSRC;
if (out_dev == in_dev && err &&
(IN_DEV_SHARED_MEDIA(out_dev) ||
inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
flags |= RTCF_DOREDIRECT;
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
* invalid for proxy arp. DNAT routes are always valid.
*
* Proxy arp feature have been extended to allow, ARP
* replies back to the same interface, to support
* Private VLAN switch technologies. See arp.c.
*/
if (out_dev == in_dev &&
IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
err = -EINVAL;
goto cleanup;
}
}
rth = rt_dst_alloc(out_dev->dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
rth->rt_key_dst = daddr;
rth->rt_key_src = saddr;
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
rth->rt_flags = flags;
rth->rt_type = res->type;
rth->rt_key_tos = tos;
rth->rt_dst = daddr;
rth->rt_src = saddr;
rth->rt_route_iif = in_dev->dev->ifindex;
rth->rt_iif = in_dev->dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
rth->rt_peer_genid = 0;
rth->peer = NULL;
rth->fi = NULL;
//这里设置了dst.input,在ip_rcv_finish最后会调用return dst_input(skb);实际就是调用了 ip_forward;
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
*result = rth;
err = 0;
cleanup:
return err;
}
我们先分析发给本机数据包的情况,即 rth->dst.input= ip_local_deliver
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
__skb_pull(skb, ip_hdrlen(skb)); //(1) 剥离IP头
/* Point into the IP datagram, just past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
int hash, raw;
const struct net_protocol *ipprot;
resubmit:
raw = raw_local_deliver(skb, protocol);
//inet_protos[]是INETSOCKET层全部协议的接收处理函数集的一个数组,
hash = protocol & (MAX_INET_PROTOS - 1);
ipprot = rcu_dereference(inet_protos[hash]);
if (ipprot != NULL) {
int ret;
if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
if (net_ratelimit())
printk("%s: proto %d isn't netns-ready\n",
__func__, protocol);
kfree_skb(skb);
goto out;
}
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);//此时转到传输层处理数据,若为TCP协议则调用tcp_v4_rcv
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
else if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
我们现看(kernel/include/net/route.h)ip_route_input_noref()--->(kernel/net/route.c)ip_route_input_common()
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, bool noref)
{
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
struct net *net;
int res;
net = dev_net(dev);
rcu_read_lock();
if (!rt_caching(net))
goto skip_cache;
tos &= IPTOS_RT_MASK;
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
//给进来的包skb查找路由,现在缓冲区的路由hash表中查找,如果没找到,在调用ip_route_input_slow()到路由表中查找。
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->dst.rt_next)) {
if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
(rth->rt_route_iif ^ iif) |
(rth->rt_key_tos ^ tos)) == 0 &&
rth->rt_mark == skb->mark &&
net_eq(dev_net(rth->dst.dev), net) &&
!rt_is_expired(rth)) {
ipv4_validate_peer(rth);
.....
}
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
rcu_read_unlock();
return res;
}
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct fib_result res;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flowi4 fl4;
unsigned flags = 0;
u32 itag = 0;
struct rtable * rth;
unsigned hash;
__be32 spec_dst;
int err = -EINVAL;
struct net * net = dev_net(dev);
....
if (!IN_DEV_FORWARD(in_dev))
goto e_hostunreach;
if (res.type != RTN_UNICAST)
goto martian_destination;
err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);//不是发给本机数据,这里处理路由转发
out: return err;
......
local_input:
rth = rt_dst_alloc(net->loopback_dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
if (!rth)
goto e_nobufs;
rth->dst.input= ip_local_deliver;//这里表示这个数据包是发给本机的,可以传送到传输层
rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
#endif
....
}
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
struct rtable* rth = NULL;
int err;
unsigned hash;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
fib_select_multipath(res);
#endif
/* create a routing cache entry */
err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
if (err)
return err;
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
rt_genid(dev_net(rth->dst.dev)));
rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
if (IS_ERR(rth))
return PTR_ERR(rth);
return 0;
}
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos,
struct rtable **result)
{
struct rtable *rth;
int err;
struct in_device *out_dev;
unsigned int flags = 0;
__be32 spec_dst;
u32 itag;
/* get a working reference to the output device */
out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
if (out_dev == NULL) {
if (net_ratelimit())
pr_crit("Bug in ip_route_input_slow(). Please report.\n");
return -EINVAL;
}
err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
in_dev->dev, &spec_dst, &itag);
if (err < 0) {
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
goto cleanup;
}
if (err)
flags |= RTCF_DIRECTSRC;
if (out_dev == in_dev && err &&
(IN_DEV_SHARED_MEDIA(out_dev) ||
inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
flags |= RTCF_DOREDIRECT;
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
* invalid for proxy arp. DNAT routes are always valid.
*
* Proxy arp feature have been extended to allow, ARP
* replies back to the same interface, to support
* Private VLAN switch technologies. See arp.c.
*/
if (out_dev == in_dev &&
IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
err = -EINVAL;
goto cleanup;
}
}
rth = rt_dst_alloc(out_dev->dev,
IN_DEV_CONF_GET(in_dev, NOPOLICY),
IN_DEV_CONF_GET(out_dev, NOXFRM));
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
rth->rt_key_dst = daddr;
rth->rt_key_src = saddr;
rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
rth->rt_flags = flags;
rth->rt_type = res->type;
rth->rt_key_tos = tos;
rth->rt_dst = daddr;
rth->rt_src = saddr;
rth->rt_route_iif = in_dev->dev->ifindex;
rth->rt_iif = in_dev->dev->ifindex;
rth->rt_oif = 0;
rth->rt_mark = skb->mark;
rth->rt_gateway = daddr;
rth->rt_spec_dst= spec_dst;
rth->rt_peer_genid = 0;
rth->peer = NULL;
rth->fi = NULL;
//这里设置了dst.input,在ip_rcv_finish最后会调用return dst_input(skb);实际就是调用了 ip_forward;
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
*result = rth;
err = 0;
cleanup:
return err;
}
我们先分析发给本机数据包的情况,即 rth->dst.input= ip_local_deliver
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
__skb_pull(skb, ip_hdrlen(skb)); //(1) 剥离IP头
/* Point into the IP datagram, just past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
int hash, raw;
const struct net_protocol *ipprot;
resubmit:
raw = raw_local_deliver(skb, protocol);
//inet_protos[]是INETSOCKET层全部协议的接收处理函数集的一个数组,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
.gro_receive = tcp4_gro_receive,
.gro_complete = tcp4_gro_complete,
.no_policy = 1,
.netns_ok = 1,
};
在kernel/net/ipv4/af_inet.c的函数inet_init中已经通过inet_add_protocol将tcp_protocol 协议加入到数组inet_protos中
hash = protocol & (MAX_INET_PROTOS - 1);
ipprot = rcu_dereference(inet_protos[hash]);
if (ipprot != NULL) {
int ret;
if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
if (net_ratelimit())
printk("%s: proto %d isn't netns-ready\n",
__func__, protocol);
kfree_skb(skb);
goto out;
}
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);//此时转到传输层处理数据,若为TCP协议则调用tcp_v4_rcv
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}