ip_rcv_finish

ip_rcv_finish  

2013-04-20 14:37:14|  分类: linux-NET |举报 |字号 订阅

执行完钩子函数后,IP数据包被传送到ip_rcv_finish做进一步处理,这个函数主要功能是做路由选择(kernel/net/ipv4/ip_input.c)
static int ip_rcv_finish(struct sk_buff *skb)
{
    const struct iphdr *iph = ip_hdr(skb);
    struct rtable *rt;

    /*
     *    Initialise the virtual path cache for the packet. It describes
     *    how the packet travels inside Linux networking.
     */
    if (skb_dst(skb) == NULL) {
        int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                           iph->tos, skb->dev);
        if (unlikely(err)) {
            if (err == -EHOSTUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INADDRERRORS);
            else if (err == -ENETUNREACH)
                IP_INC_STATS_BH(dev_net(skb->dev),
                        IPSTATS_MIB_INNOROUTES);
            else if (err == -EXDEV)
                NET_INC_STATS_BH(dev_net(skb->dev),
                         LINUX_MIB_IPRPFILTER);
            goto drop;
        }
    }

#ifdef CONFIG_IP_ROUTE_CLASSID
    if (unlikely(skb_dst(skb)->tclassid)) {
        struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
        u32 idx = skb_dst(skb)->tclassid;
        st[idx&0xFF].o_packets++;
        st[idx&0xFF].o_bytes += skb->len;
        st[(idx>>16)&0xFF].i_packets++;
        st[(idx>>16)&0xFF].i_bytes += skb->len;
    }
#endif

    if (iph->ihl > 5 && ip_rcv_options(skb))
        goto drop;

    rt = skb_rtable(skb);
    if (rt->rt_type == RTN_MULTICAST) {
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
                skb->len);
    } else if (rt->rt_type == RTN_BROADCAST)
        IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
                skb->len);

    return dst_input(skb);

drop:
    kfree_skb(skb);
    return NET_RX_DROP;
}
我们先看(kernel/include/net/route.h)ip_route_input_noref()--->(kernel/net/ipv4/route.c)ip_route_input_common()


int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
               u8 tos, struct net_device *dev, bool noref)
{
    struct rtable * rth;
    unsigned    hash;
    int iif = dev->ifindex;
    struct net *net;
    int res;

    net = dev_net(dev);

    rcu_read_lock();

    if (!rt_caching(net))
        goto skip_cache;

    tos &= IPTOS_RT_MASK;
    hash = rt_hash(daddr, saddr, iif, rt_genid(net));
//给进来的包skb查找路由:先在路由缓存的hash表中查找,如果没找到,再调用ip_route_input_slow()到路由表中查找。
    for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
         rth = rcu_dereference(rth->dst.rt_next)) {
        if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
             ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
             (rth->rt_route_iif ^ iif) |
             (rth->rt_key_tos ^ tos)) == 0 &&
            rth->rt_mark == skb->mark &&
            net_eq(dev_net(rth->dst.dev), net) &&
            !rt_is_expired(rth)) {
            ipv4_validate_peer(rth);
           .....

    }
    res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
    rcu_read_unlock();
    return res;
}
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                   u8 tos, struct net_device *dev)
{
    struct fib_result res;
    struct in_device *in_dev = __in_dev_get_rcu(dev);
    struct flowi4    fl4;
    unsigned    flags = 0;
    u32        itag = 0;
    struct rtable * rth;
    unsigned    hash;
    __be32        spec_dst;
    int        err = -EINVAL;
    struct net    * net = dev_net(dev);
....

    if (!IN_DEV_FORWARD(in_dev))
        goto e_hostunreach;
    if (res.type != RTN_UNICAST)
        goto martian_destination;

    err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);//不是发给本机数据,这里处理路由转发
out:    return err;
......
local_input:
    rth = rt_dst_alloc(net->loopback_dev,
               IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
    if (!rth)
        goto e_nobufs;

    rth->dst.input= ip_local_deliver;//这里表示这个数据包是发给本机的,可以传送到传输层
    rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
    rth->dst.tclassid = itag;
#endif
....
}
/*
 * ip_mkroute_input - build an input route and insert it into the route cache.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a next hop is first selected when the
 * FIB result carries more than one.  The route itself is created by
 * __mkroute_input() and then interned with rt_intern_hash().
 *
 * Returns 0 on success, the error from __mkroute_input(), or the error
 * encoded in the pointer returned by rt_intern_hash().
 */
static int ip_mkroute_input(struct sk_buff *skb,
                struct fib_result *res,
                const struct flowi4 *fl4,
                struct in_device *in_dev,
                __be32 daddr, __be32 saddr, u32 tos)
{
    struct rtable *entry = NULL;
    unsigned bucket;
    int rc;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
    if (res->fi != NULL && res->fi->fib_nhs > 1)
        fib_select_multipath(res);
#endif

    /* Step 1: create the routing cache entry. */
    rc = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &entry);
    if (rc != 0)
        return rc;

    /* Step 2: hash it and put it into the cache. */
    bucket = rt_hash(daddr, saddr, fl4->flowi4_iif,
                     rt_genid(dev_net(entry->dst.dev)));
    entry = rt_intern_hash(bucket, entry, skb, fl4->flowi4_iif);

    return IS_ERR(entry) ? PTR_ERR(entry) : 0;
}
/*
 * __mkroute_input - allocate and fill a forwarding route for an input packet.
 *
 * Validates the source address, derives the route flags, allocates an
 * rtable bound to the output device and initialises its key and handler
 * fields.  On success the new entry is stored in *result.
 *
 * Returns 0 on success; -EINVAL when the output device has no in_device
 * or the proxy-ARP check rejects the route; the negative value from
 * fib_validate_source() on a martian source; -ENOBUFS if allocation fails.
 */
static int __mkroute_input(struct sk_buff *skb,
               const struct fib_result *res,
               struct in_device *in_dev,
               __be32 daddr, __be32 saddr, u32 tos,
               struct rtable **result)
{
    struct in_device *out_dev;
    struct rtable *rt;
    unsigned int rt_flags = 0;
    __be32 spec_dst;
    u32 itag;
    int rc;

    /* get a working reference to the output device */
    out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
    if (!out_dev) {
        if (net_ratelimit())
            pr_crit("Bug in ip_route_input_slow(). Please report.\n");
        return -EINVAL;
    }

    rc = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                             in_dev->dev, &spec_dst, &itag);
    if (rc < 0) {
        /* Report the martian source and propagate the error. */
        ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr);
        return rc;
    }

    /* A positive result from source validation sets RTCF_DIRECTSRC. */
    if (rc) {
        rt_flags |= RTCF_DIRECTSRC;

        if (out_dev == in_dev &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
            rt_flags |= RTCF_DOREDIRECT;
    }

    /* Not IP (i.e. ARP). Do not create route, if it is
     * invalid for proxy arp. DNAT routes are always valid.
     *
     * Proxy arp feature have been extended to allow, ARP
     * replies back to the same interface, to support
     * Private VLAN switch technologies. See arp.c.
     */
    if (skb->protocol != htons(ETH_P_IP) &&
        out_dev == in_dev &&
        IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0)
        return -EINVAL;

    rt = rt_dst_alloc(out_dev->dev,
                      IN_DEV_CONF_GET(in_dev, NOPOLICY),
                      IN_DEV_CONF_GET(out_dev, NOXFRM));
    if (!rt)
        return -ENOBUFS;

    /* Lookup key. */
    rt->rt_key_dst = daddr;
    rt->rt_key_src = saddr;
    rt->rt_genid = rt_genid(dev_net(rt->dst.dev));
    rt->rt_flags = rt_flags;
    rt->rt_type = res->type;
    rt->rt_key_tos = tos;

    /* Addresses and interfaces. */
    rt->rt_dst = daddr;
    rt->rt_src = saddr;
    rt->rt_route_iif = in_dev->dev->ifindex;
    rt->rt_iif = in_dev->dev->ifindex;
    rt->rt_oif = 0;
    rt->rt_mark = skb->mark;
    rt->rt_gateway = daddr;
    rt->rt_spec_dst = spec_dst;

    /* Nothing attached yet. */
    rt->rt_peer_genid = 0;
    rt->peer = NULL;
    rt->fi = NULL;

    /*
     * dst.input is set here; the final "return dst_input(skb);" in
     * ip_rcv_finish() therefore ends up calling ip_forward() for
     * packets that use this route.
     */
    rt->dst.input = ip_forward;
    rt->dst.output = ip_output;

    rt_set_nexthop(rt, NULL, res, res->fi, res->type, itag);

    *result = rt;
    return 0;
}
我们先分析发给本机数据包的情况,即 rth->dst.input= ip_local_deliver
/*
 * ip_local_deliver - deliver an IPv4 packet addressed to this host.
 *
 * Fragments are first handed to ip_defrag(); when that returns non-zero
 * this function returns 0 without going further.  Otherwise the packet
 * traverses the NF_INET_LOCAL_IN netfilter hook with
 * ip_local_deliver_finish() as the continuation.
 */
int ip_local_deliver(struct sk_buff *skb)
{
    /* Reassemble IP fragments. */
    if (ip_is_fragment(ip_hdr(skb)) &&
        ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
        return 0;

    return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
                   ip_local_deliver_finish);
}
static int ip_local_deliver_finish(struct sk_buff *skb)
{
    struct net *net = dev_net(skb->dev);

    __skb_pull(skb, ip_hdrlen(skb)); //(1) 剥离IP头

    /* Point into the IP datagram, just past the header. */
    skb_reset_transport_header(skb);

    rcu_read_lock();
    {
        int protocol = ip_hdr(skb)->protocol;
        int hash, raw;
        const struct net_protocol *ipprot;

    resubmit:
        raw = raw_local_deliver(skb, protocol);
   //inet_protos[]是INETSOCKET层全部协议的接收处理函数集的一个数组,

        //每个协议以自己的协议号(比如icmp协议就以IPPROTO_ICMP,也就是数字

       //1作为下标,把自己添加到该数组,数组中存储了net_protocol结构,例如针

       //对TCP协议,相关的结构定义如下:

       

      /----------------------------------------------------

      static const struct net_protocol tcp_protocol = {
    .handler =    tcp_v4_rcv,
    .err_handler =    tcp_v4_err,
    .gso_send_check = tcp_v4_gso_send_check,
    .gso_segment =    tcp_tso_segment,
    .gro_receive =    tcp4_gro_receive,
    .gro_complete =    tcp4_gro_complete,
    .no_policy =    1,
    .netns_ok =    1,
};

在kernel/net/ipv4/af_inet.c的函数inet_init中已经通过inet_add_protocol将tcp_protocol 协议加入到数组inet_protos中

       -------------------------------------------------------/


        hash = protocol & (MAX_INET_PROTOS - 1);
        ipprot = rcu_dereference(inet_protos[hash]);
        if (ipprot != NULL) {
            int ret;

            if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
                if (net_ratelimit())
                    printk("%s: proto %d isn't netns-ready\n",
                        __func__, protocol);
                kfree_skb(skb);
                goto out;
            }

            if (!ipprot->no_policy) {
                if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    kfree_skb(skb);
                    goto out;
                }
                nf_reset(skb);
            }
             ret = ipprot->handler(skb);//此时转到传输层处理数据,若为TCP协议则调用tcp_v4_rcv
            if (ret < 0) {
                protocol = -ret;
                goto resubmit;
            }
            IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
        } else {
            if (!raw) {
                if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                    IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
                    icmp_send(skb, ICMP_DEST_UNREACH,
                          ICMP_PROT_UNREACH, 0);
                }
            } else
                IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
            kfree_skb(skb);
        }
    }
 out:
    rcu_read_unlock();

    return 0;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值