linux路由内核实现分析(四)---路由缓存机制

路由缓存机制

 

rtable结构

 

// One entry of the IPv4 routing cache. Entries are reached from

// rt_hash_table[hash].chain and linked to each other through u.dst.rt_next.

struct rtable

{

       union

       {

              struct dst_entry      dst; // embedded destination entry; an rtable pointer is cast to struct dst_entry * when stored in skb->dst

       } u;

       struct flowi            fl; // lookup key: ip_route_input compares fl4_dst, fl4_src, iif, oif, mark and fl4_tos against the packet

       struct in_device      *idev; // per the notes below: IP configuration block of the egress device

       unsigned         rt_flags; // structural flags (the notes below give RTF_UP, "route is usable", as an example)

       __u16                   rt_type; // type of the destination address, e.g. RTN_LOCAL, RTN_MULTICAST

 

       __be32                  rt_dst;    // destination IP address

       __be32                  rt_src;    // IP address the route path starts from

       int                  rt_iif; // presumably the input interface index -- TODO confirm

       __be32                  rt_gateway; // gateway information

       __be32                  rt_spec_dst; // NOTE(review): not described in this text; confirm meaning against the kernel source

       struct inet_peer      *peer; // long-living per-peer state (e.g. packet-id tracking, per the notes below)

};

 

unsigned         rt_flags;//一些结构性的标志,例如,RTF_UP表示这条路由可用

__u16            rt_type;//表明了目标地址的类型,例如RTN_LOCAL,RTN_MULTICAST

__be32 rt_dst        用来存放目标的IP地址

__be32 rt_src        路由路径的起点ip地址

__be32                  rt_gateway;//该成员存放了网关信息

struct flowi               fl;//存放的是查找该路由缓存项时使用的键值(源IP,目的地址,TOS,输入/输出接口,mark等);查找缓存时先用源IP,目的地址等计算哈希值定位哈希链,再用fl中的这些字段逐项比较

 

struct in_device              *idev;// 该指针指向egress设备的IP配置块。注意对送往本地的ingress报文的路由,设置的egress设备为loopback设备

 

struct inet_peer             *peer; //用于long-living ip peer,虽然普通的IP报文没有状态,但是内核会记录IP报文的一些信息以提高效率,主要是记录IP报文的packet-id以检查是否收到了重复的报文,还需要检查packet-id的增量。

 

 

rt_cache_bucket结构

 

// One slot of the route-cache hash table (rt_hash_table).

struct rt_hash_bucket {

       struct rtable    *chain; // head of this bucket's list of cached routes; walked via u.dst.rt_next

};

 

// struct rtable           *chain; //是一个struct rtable类型的指针,指向该哈希链上的第一个节点;struct rtable用于描述一条完整的路由缓存项

 

在2.4版内核中,该结构里还有一个rwlock_t类型的lock成员用作哈希链的读写锁,2.6内核专门创建了一个spinlock表来进行替代。

dst_entry结构

 

// Generic destination-cache entry. struct rtable embeds one as u.dst, and

// skb->dst points at it once a route has been chosen for a packet.

struct dst_entry

{

       struct rcu_head             rcu_head;

       struct dst_entry      *child;

       struct net_device       *dev; // network interface this entry refers to

       short                     error; // per the notes below: set (as a positive value) when fib_lookup fails, consumed later by ip_error

       short                     obsolete;

       int                  flags; // NOTE(review): the DST_* values below look like bit flags for this field -- confirm

#define DST_HOST             1

#define DST_NOXFRM        2

#define DST_NOPOLICY            4

#define DST_NOHASH         8

       unsigned long         expires;

 

       unsigned short              header_len;    

       unsigned short              nfheader_len; 

       unsigned short              trailer_len;     

 

       u32                metrics[RTAX_MAX];

       struct dst_entry      *path;

 

       unsigned long         rate_last;

       unsigned long         rate_tokens;

 

       struct neighbour     *neighbour; // next-hop neighbour entry on this route's path

       struct hh_cache            *hh;

       struct xfrm_state    *xfrm;

 

       int                  (*input)(struct sk_buff*); // handler for received packets; the text below notes ip_mkroute_input sets it to ip_forward

       int                  (*output)(struct sk_buff*); // handler for packets being sent

 

#ifdef CONFIG_NET_CLS_ROUTE

       __u32                   tclassid;

#endif

 

       struct  dst_ops             *ops;

 

       unsigned long         lastuse; // set to jiffies by ip_route_input on every cache hit

       atomic_t         __refcnt; // reference counter; ip_route_input calls dst_hold() on a cache hit

       int                  __use; // number of times a cache lookup returned this entry

       union {

              struct dst_entry *next;

              struct rtable    *rt_next; // link used when walking an IPv4 route-cache hash chain

              struct rt6_info   *rt6_next;

              struct dn_route  *dn_next;

       };

       char               info[0];

};

 

struct net_device       *dev; //用于网络接口的指针

int                  __use;        //该表项已经被使用的次数(即缓存查找返回该表项的次数)

 

 

 

short                     error; //当fib_lookup失败时,错误值被保存在error(用一个正值)中,之后ip_error函数使用该值来决定如何处理本次路由查找失败(即决定生成哪一类ICMP消息)。

 

 

struct neighbour     *neighbour; //这个路由路径上下一个neighbour结构的指针。

 

int                  (*input)(struct sk_buff*);

int                  (*output)(struct sk_buff*);

//input 和 output 分别是该路由项对应的报文输入和输出处理函数:input用于处理接收到的报文(例如需要转发时为ip_forward,送往本机时为ip_local_deliver),output用于处理待发送的报文(例如ip_output,报文最终经邻居子系统由dev_queue_xmit发出)

 

 

这几个结构的关系如下:

(下图中 n(max) = rt_hash_mask;每条哈希链上的节点都是一个 struct rtable)

现在我们可以看到路由缓存HASH表的构成了:)

+-------------+  +-----------------+  +--------------+  +--------------+
|rt_hash_table|->|rt_hash_bucket[0]|->|    u.dst     |->|    u.dst     |-> ~~~
+-------------+  +-----------------+  +--------------+  +--------------+
                 |rt_hash_bucket[1]|  ~              ~  ~              ~
                 +-----------------+  +--------------+  +--------------+
                 |rt_hash_bucket[2]|  |     peer     |  |     peer     |
                 +-----------------+  +--------------+  +--------------+
                 ~                 ~
                 ~                 ~   struct rtable
                 +-----------------+  +--------------+
                 |rt_hash_bucket[n]|->|    u.dst     |-> ~~~
                 +-----------------+  +--------------+
                                      ~              ~
                                      +--------------+
                                      |     peer     |
                                      +--------------+

linux路由内核实现分析(四)---路由缓存机制(2)

 

用rt_hash_bucket来管理一张哈希表,存储了rtable结构,rtable结构中内含的dst_entry结构可以找出邻居节点,用以确认next hop

在函数ip_rt_init( )中进行了rt_hash_bucket的初始化,代码如下:

 // Allocate the route-cache hash table, one struct rt_hash_bucket per slot.

 // rt_hash_log and rt_hash_mask are passed by address, presumably to receive

 // the resulting table geometry (the memset below relies on rt_hash_mask).

 rt_hash_table = (struct rt_hash_bucket *)

              alloc_large_system_hash("IP route cache",

                                   sizeof(struct rt_hash_bucket),

                                   rhash_entries,

                                   (num_physpages >= 128 * 1024) ?

                                   15 : 17,

                                   0,

                                   &rt_hash_log,

                                   &rt_hash_mask,

                                   0);

       // Zero all rt_hash_mask + 1 buckets so every chain pointer starts out NULL.

       memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));

 

使用alloc_large_system_hash函数初始化了一个哈希表

在该函数中还创建了slab句柄,用于存储具体的rtable项

// Create the "ip_dst_cache" slab cache with objects of sizeof(struct rtable)

// bytes and keep its handle in ipv4_dst_ops.kmem_cachep.

ipv4_dst_ops.kmem_cachep =

                        kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,

                              SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

 

 

路由缓存查找过程

 

数据包进入网络层之后,第一个调用的函数是ip_rcv函数,通过pskb_may_pull确保IP首部位于skb的线性数据区,并进行一些有效性检查之后,最后调用

// Run the packet through the NF_IP_PRE_ROUTING netfilter hook, passing

// ip_rcv_finish as the final argument (the next step described below).

return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,

               ip_rcv_finish);

 

进入ip_rcv_finish函数,ip_rcv_finish函数开始就调用了ip_route_input以生成路由信息。

ip_route_input函数

 

// Route-cache lookup for a received packet.

//

// skb:   packet being routed; on a cache hit skb->dst is set to the cached entry

// daddr: destination address, saddr: source address

// tos:   type of service, masked with IPTOS_RT_MASK before use

// dev:   receiving device; its ifindex is part of the lookup key

//

// Returns 0 on a cache hit. On a miss, a multicast destination is either

// handed to ip_route_input_mc() or rejected with -EINVAL; everything else

// returns the result of ip_route_input_slow().

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,

                 u8 tos, struct net_device *dev)

{

       struct rtable * rth;

       unsigned  hash;

       int iif = dev->ifindex;

 

       tos &= IPTOS_RT_MASK;

       // Build the hash from the source address, the destination address and

       // the interface index of the receiving NIC; on 2.4.x kernels tos is

       // part of the hash as well.

       hash = rt_hash(daddr, saddr, iif);

       rcu_read_lock();

 

       // Use the hash to pick a bucket, fetch the RCU-protected head of its

       // rtable chain with rcu_dereference, and walk the chain to its end.

       for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;

            rth = rcu_dereference(rth->u.dst.rt_next)) {

              if (rth->fl.fl4_dst == daddr &&

                  rth->fl.fl4_src == saddr &&

                  rth->fl.iif == iif &&

                  rth->fl.oif == 0 &&

                  rth->fl.mark == skb->mark &&

                  rth->fl.fl4_tos == tos) {

                     rth->u.dst.lastuse = jiffies;

                     dst_hold(&rth->u.dst);

                     rth->u.dst.__use++;

                     RT_CACHE_STAT_INC(in_hit);

                     rcu_read_unlock();

                     skb->dst = (struct dst_entry*)rth;

                     return 0;

                     // A matching cached route was found: skb->dst now points

                     // at its dst_entry and the function returns right away.

              }

              RT_CACHE_STAT_INC(in_hlist_search);

       }

       rcu_read_unlock();

 

       // Cache miss with a multicast destination: accept it only if

       // ip_check_mc() reports it as ours or (with CONFIG_IP_MROUTE) the

       // multicast-forwarding condition below holds; otherwise -EINVAL.

       if (MULTICAST(daddr)) {

              struct in_device *in_dev;

 

              rcu_read_lock();

              if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {

                     int our = ip_check_mc(in_dev, daddr, saddr,

                            ip_hdr(skb)->protocol);

                     if (our

#ifdef CONFIG_IP_MROUTE

                         || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))

#endif

                         ) {

                            rcu_read_unlock();

                            return ip_route_input_mc(skb, daddr, saddr,

                                                  tos, dev, our);

                     }

              }

              rcu_read_unlock();

              return -EINVAL;

       }

       // No cached route matched: fall back to this function, which calls

       // fib_lookup to find the corresponding entry in the

       // FIB.

       return ip_route_input_slow(skb, daddr, saddr, tos, dev);

}

 

 

ip_route_input_slow函数

 

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,

                            u8 tos, struct net_device *dev)

{

 

    ………………………………………………………..

    ………………………………………………………..

       //调用fib_lookup查找路由项目,将结果放在res中

      if ((err = fib_lookup(&fl, &res)) != 0) {

              if (!IN_DEV_FORWARD(in_dev))

                     goto e_hostunreach;

              goto no_route;

       }

       free_res = 1;

 

       RT_CACHE_STAT_INC(in_slow_tot);

 

       //如果是广播路由

       if (res.type == RTN_BROADCAST)

              goto brd_input;

 

       //如果是本地路由

       if (res.type == RTN_LOCAL) {

              int result;

              //确定来源是否正确,来源不是广播地址或者本地地址

result = fib_validate_source(saddr, daddr, tos,

                                        loopback_dev.ifindex,

                                        dev, &spec_dst, &itag);

              if (result < 0)

                     goto martian_source;

              if (result)

                     flags |= RTCF_DIRECTSRC;

              spec_dst = daddr;

              goto local_input;

       }

 

       //如果当前系统并不处于FORWARD状态

if (!IN_DEV_FORWARD(in_dev))

              goto e_hostunreach;

       if (res.type != RTN_UNICAST)

              goto martian_destination;

       //这个函数设置skb->dst->input=ip_forward

       err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);

       if (err == -ENOBUFS)

              goto e_nobufs;

       if (err == -EINVAL)

              goto e_inval;

 

done:

       in_dev_put(in_dev);

       if (free_res)

              fib_res_put(&res);

out:  return err;

 

brd_input:

       if (skb->protocol != htons(ETH_P_IP))

              goto e_inval;

 

       if (ZERONET(saddr))

              spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);

       else {

              err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,

                                     &itag);

              if (err < 0)

                     goto martian_source;

              if (err)

                     flags |= RTCF_DIRECTSRC;

       }

       flags |= RTCF_BROADCAST;

       res.type = RTN_BROADCAST;

       RT_CACHE_STAT_INC(in_brd);

 

local_input:

       //生成一条新的rtable信息,用于下次路由缓存

       rth = dst_alloc(&ipv4_dst_ops);

……………………………………………………………………………………..

……………………………………………………………………………………..

}

 

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值