Linux网络协议栈之网络设备管理


    Linux素以优秀的网络管理能力而著称,Linux为何具有这么高的效率?我们从网络设备的管理说起。

     Linux为何要对网络设备单独管理呢?这是因为协议栈很多地方都会涉及到网络设备:小至IP地址的设置,大至IP路由的更新,都离不开高效的网络设备管理。将网络设备单独管理可以提高效率!

     每个网络设备,在Linux中都会对应一个数据结构net_device,我们就从这个结构说起。

Linux 2.6.21中,对net_device定义如下:

struct net_device

{

     //设备的名称,例如常见的“eth0”等

     char          name[IFNAMSIZ];

     //共享内存的起始,结束地址

     unsigned long      mem_end; /* shared mem end */

     unsigned long      mem_start;    /* shared mem start    */

     //网络设备的I/O基地址

     unsigned long      base_addr;    /* device I/O address */

     //被赋予的中断号

     unsigned int       irq;     /* device IRQ number   */

     //在多端口设备上使用哪一个端口

     unsigned char      if_port; /* Selectable AUI, TP,..*/

     //为设备分配的DMA通道

     unsigned char      dma;     /* DMA channel         */

     //设备的状态

     unsigned long      state;

     // 下一个net_device

     struct net_device *next;

     //初始化函数。

     int           (*init)(struct net_device *dev);

     struct net_device *next_sched;

 

     /* Interface index. Unique device identifier   */

     //设备在内核中对应的序号

     int           ifindex;

     int           iflink;

 

     //获得接口状态的函数指针

     struct net_device_stats* (*get_stats)(struct net_device *dev);

     struct iw_statistics* (*get_wireless_stats)(struct net_device *dev);

 

     struct iw_handler_def * wireless_handlers;

     struct ethtool_ops *ethtool_ops;

     //传输状态。检查传输是否被锁住

     unsigned long      trans_start; /* Time (in jiffies) of last Tx */

     //最使使用的时间

     unsigned long      last_rx; /* Time of last Rx */

     //接口标志

     unsigned short         flags;   /* interface flags (a la BSD)    */

     unsigned short         gflags;

        unsigned short          priv_flags; /* Like 'flags' but invisible to userspace. */

        unsigned short          unused_alignment_fixer; /* Because we need priv_flags,

                                                         * and we want to be 32-bit aligned.

                                                         */

 

     unsigned      mtu; /* interface MTU value      */

     unsigned short         type;    /* interface hardware type */

     unsigned short         hard_header_len;   /* hardware hdr length */

     void          *priv;   /* pointer to private data */

 

     struct net_device *master; /* Pointer to master device of a group,

                         * which this device is member of.

                         */

 

     /* Interface address info. */

     unsigned char      broadcast[MAX_ADDR_LEN];    /* hw bcast add    */

     unsigned char      dev_addr[MAX_ADDR_LEN]; /* hw address */

     unsigned char      addr_len; /* hardware address length */

 

     struct dev_mc_list *mc_list; /* Multicast mac addresses */

     int           mc_count; /* Number of installed mcasts    */

     int           promiscuity;

     int           allmulti;

 

     int           watchdog_timeo;

     struct timer_list watchdog_timer;

 

     /* Protocol specific pointers */

    

     void               *atalk_ptr;   /* AppleTalk link */

     void          *ip_ptr; /* IPv4 specific data */

     void                    *dn_ptr;        /* DECnet specific data */

     void                    *ip6_ptr;       /* IPv6 specific data */

     void          *ec_ptr; /* Econet specific data */

     void          *ax25_ptr;    /* AX.25 specific data */

 

     struct list_head   poll_list;    /* Link to poll list   */

     int           quota;

     int           weight;

 

     struct Qdisc       *qdisc;

     struct Qdisc       *qdisc_sleeping;

     struct Qdisc       *qdisc_ingress;

     struct list_head   qdisc_list;

     unsigned long      tx_queue_len; /* Max frames per queue allowed */

 

     /* ingress path synchronizer */

     spinlock_t         ingress_lock;

     /* hard_start_xmit synchronizer */

     spinlock_t         xmit_lock;

     /* cpu id of processor entered to hard_start_xmit or -1,

        if nobody entered there.

      */

     int           xmit_lock_owner;

     /* device queue lock */

     spinlock_t         queue_lock;

     /* Number of references to this device */

     atomic_t      refcnt;

     /* delayed register/unregister */

     struct list_head   todo_list;

     /* device name hash chain */

     struct hlist_node name_hlist;

     /* device index hash chain */

     struct hlist_node index_hlist;

 

     /* register/unregister state machine */

     enum { NETREG_UNINITIALIZED=0,

            NETREG_REGISTERING, /* called register_netdevice */

            NETREG_REGISTERED,   /* completed register todo */

            NETREG_UNREGISTERING,     /* called unregister_netdevice */

            NETREG_UNREGISTERED, /* completed unregister todo */

            NETREG_RELEASED,     /* called free_netdev */

     } reg_state;

 

     /* Net device features */

     int           features;

#define NETIF_F_SG     1    /* Scatter/gather IO. */

#define NETIF_F_IP_CSUM     2    /* Can checksum only TCP/UDP over IPv4. */

#define NETIF_F_NO_CSUM     4    /* Does not require checksum. F.e. loopack. */

#define NETIF_F_HW_CSUM     8    /* Can checksum all the packets. */

#define NETIF_F_HIGHDMA     32   /* Can DMA to high memory. */

#define NETIF_F_FRAGLIST    64   /* Scatter/gather IO. */

#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */

#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */

#define NETIF_F_HW_VLAN_FILTER   512 /* Receive filtering on VLAN */

#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */

#define NETIF_F_TSO         2048 /* Can offload TCP/IP segmentation */

#define NETIF_F_LLTX        4096 /* LockLess TX */

 

     /* Called after device is detached from network. */

     void          (*uninit)(struct net_device *dev);

     /* Called after last user reference disappears. */

     void          (*destructor)(struct net_device *dev);

 

     /* Pointers to interface service routines.     */

     //打开函数指针

     int           (*open)(struct net_device *dev);

     //设备停用时调用此函数

     int           (*stop)(struct net_device *dev);

     //初始化数据包的传输

     int           (*hard_start_xmit) (struct sk_buff *skb,

                                struct net_device *dev);

#define HAVE_NETDEV_POLL

     //轮询函数

     int           (*poll) (struct net_device *dev, int *quota);

     //建立硬件头信息

     int           (*hard_header) (struct sk_buff *skb,

                            struct net_device *dev,

                            unsigned short type,

                            void *daddr,

                            void *saddr,

                            unsigned len);

     //ARP解析之后,重构头部

     int           (*rebuild_header)(struct sk_buff *skb);

#define HAVE_MULTICAST     

     //多播支持函数    

     void          (*set_multicast_list)(struct net_device *dev);

#define HAVE_SET_MAC_ADDR        

     int           (*set_mac_address)(struct net_device *dev,

                               void *addr);

#define HAVE_PRIVATE_IOCTL

     int           (*do_ioctl)(struct net_device *dev,

                           struct ifreq *ifr, int cmd);

#define HAVE_SET_CONFIG

     int           (*set_config)(struct net_device *dev,

                             struct ifmap *map);

#define HAVE_HEADER_CACHE

     int           (*hard_header_cache)(struct neighbour *neigh,

                                 struct hh_cache *hh);

     void          (*header_cache_update)(struct hh_cache *hh,

                                   struct net_device *dev,

                                   unsigned char * haddr);

#define HAVE_CHANGE_MTU

     int           (*change_mtu)(struct net_device *dev, int new_mtu);

 

#define HAVE_TX_TIMEOUT

     void          (*tx_timeout) (struct net_device *dev);

 

     void          (*vlan_rx_register)(struct net_device *dev,

                                struct vlan_group *grp);

     void          (*vlan_rx_add_vid)(struct net_device *dev,

                               unsigned short vid);

     void          (*vlan_rx_kill_vid)(struct net_device *dev,

                                unsigned short vid);

 

     int           (*hard_header_parse)(struct sk_buff *skb,

                                 unsigned char *haddr);

     int           (*neigh_setup)(struct net_device *dev, struct neigh_parms *);

     int           (*accept_fastpath)(struct net_device *, struct dst_entry*);

#ifdef CONFIG_NETPOLL

     int           netpoll_rx;

#endif

#ifdef CONFIG_NET_POLL_CONTROLLER

     void                    (*poll_controller)(struct net_device *dev);

#endif

 

     /* bridge stuff */

     //对应的网桥端口(以后分析)

     struct net_bridge_port *br_port;

 

#ifdef CONFIG_NET_DIVERT

     /* this will get initialized at each interface type init routine */

     struct divert_blk *divert;

#endif /* CONFIG_NET_DIVERT */

 

     /* class/net/name entry */

     struct class_device    class_dev;

     /* how much padding had been added by alloc_netdev() */

     int padded;

}

晕,太多的成员。太庞大了。不要紧,等到要使用到相应成员的时候再来解释好了。

 

注意到这么庞大的结构中,有个成员叫: struct net_device *next,呵呵,很熟悉吧,就是用它来建立网络设备的链表。

每一个网络设备启动的时候,都会调用register_netdev() (drivers/net/net_init.c)

跟踪这个函数:

/*
 * register_netdev - register a network device while holding the RTNL lock.
 * @dev: device to register
 *
 * Resolves the device name first (if needed) and then hands the device to
 * register_netdevice().  Returns the negative value reported by
 * dev_alloc_name() when name allocation fails, otherwise whatever
 * register_netdevice() returns.
 */
int register_netdev(struct net_device *dev)
{
    int status = 0;

    rtnl_lock();

    /*
     * A '%' in the name means it is a format string and the caller
     * wants us to allocate a concrete name from it.
     */
    if (strchr(dev->name, '%') != NULL)
        status = dev_alloc_name(dev, dev->name);

    /*
     * Back compatibility hook (slated for removal in 2.5): an empty
     * name, or one starting with a blank, defaults to "eth%d".
     */
    if (status >= 0 && (dev->name[0] == '\0' || dev->name[0] == ' '))
        status = dev_alloc_name(dev, "eth%d");

    /* Only register once every name allocation step has succeeded. */
    if (status >= 0)
        status = register_netdevice(dev);

    rtnl_unlock();
    return status;
}

 

跟踪至: register_netdevice(struct net_device *dev) (net/core/dev.c)

int register_netdevice(struct net_device *dev)

{

     struct hlist_head *head;

     struct hlist_node *p;

     int ret;

 

     BUG_ON(dev_boot_phase);

     ASSERT_RTNL();

 

     /* When net_device's are persistent, this will be fatal. */

     BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);

 

     spin_lock_init(&dev->queue_lock);

     spin_lock_init(&dev->xmit_lock);

     dev->xmit_lock_owner = -1;

#ifdef CONFIG_NET_CLS_ACT

     spin_lock_init(&dev->ingress_lock);

#endif

 

     ret = alloc_divert_blk(dev);

     if (ret)

         goto out;

 

     dev->iflink = -1;

 

     /* Init, if this function is available */

     //如果dev -> init 被赋值,那么调用此函数

     if (dev->init) {

         ret = dev->init(dev);

         if (ret) {

              if (ret > 0)

                   ret = -EIO;

              goto out_err;

         }

     }

 

     //判断name 是否合法

     if (!dev_valid_name(dev->name)) {

         ret = -EINVAL;

         goto out_err;

     }

     //为此设备分配一个index

     dev->ifindex = dev_new_index();

     if (dev->iflink == -1)

         dev->iflink = dev->ifindex;

 

     /* Check for existence of name */

 

     //所有网络设备,以名字作为哈希主键存在dev_name_head中,该变量是一个哈希数组

     //找到该名字对应的链表

     //如果内核中已经含有此名字的网络设备,出错退出

     head = dev_name_hash(dev->name);

     hlist_for_each(p, head) {

         struct net_device *d

              = hlist_entry(p, struct net_device, name_hlist);

         if (!strncmp(d->name, dev->name, IFNAMSIZ)) {

              ret = -EEXIST;

              goto out_err;

         }

     }

 

     /* Fix illegal SG+CSUM combinations. */

     if ((dev->features & NETIF_F_SG) &&

         !(dev->features & (NETIF_F_IP_CSUM |

                     NETIF_F_NO_CSUM |

                     NETIF_F_HW_CSUM))) {

         printk("%s: Dropping NETIF_F_SG since no checksum feature./n",

                dev->name);

         dev->features &= ~NETIF_F_SG;

     }

 

     /*

      *   nil rebuild_header routine,

      *   that should be never called and used as just bug trap.

      */

 

     //为rebuild_header赋默认值

     if (!dev->rebuild_header)

         dev->rebuild_header = default_rebuild_header;

 

     /*

      *   Default initial state at registry is that the

      *   device is present.

      */

 

     set_bit(__LINK_STATE_PRESENT, &dev->state);

 

     dev->next = NULL;

     dev_init_scheduler(dev);

     write_lock_bh(&dev_base_lock);

     //初始化的时候,有struct net_device **dev_tail = &dev_base;

     //这段代码的意思实际就是:把dev加入dev_base为首结点队链表的尾部

     *dev_tail = dev;

     dev_tail = &dev->next;

     //把此结点加入到以名字为哈希主键的链表数组dev_name_head中

     hlist_add_head(&dev->name_hlist, head);

     //把此结点加到以序号为主键的链表数组dev_index_head中

     hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));

     dev_hold(dev);

     dev->reg_state = NETREG_REGISTERING;

     write_unlock_bh(&dev_base_lock);

 

     /* Notify protocols, that a new device appeared. */

     //在通知链表上发送事件

     notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);

 

     /* Finish registration after unlock */

     net_set_todo(dev);

     ret = 0;

 

out:

     return ret;

out_err:

     free_divert_blk(dev);

     goto out;

}

 

从此可以看出,新加入一个设备时,会插入三个位置:以名字为哈希主键的哈希数组dev_name_head,以序号为哈希主键的哈希数组dev_index_head,还有链表dev_base。它们为快速查找网络设备提供了基础。事实上,在内核中经常要根据index找到dev,或者根据name找到dev,我们遇到的时候再分析。

 

到现在,我们可以在内核中顺藤摸瓜的找到每一个网络设备了。

还有很重要的一点:设备更改了配置,要怎么通知跟它相关的子系统呢?例如,网卡更新了IP,如何使路由得到更新?

接着往下看:

注意到上面注册代码中所调用的一个函数notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev).

该函数的作用是,在通知链表netdev_chain上发送NETDEV_REGISTER消息,所有与该通知链表关联的子系统都可以收到此消息。以此,可以快速地更新整个系统的配置信息。

以路由子系统为例,来讲述该过程:

在IPv4子系统加载的时候,会调用ip_init(),接着调用fib_init(),然后再调用ip_fib_init()

跟踪一下此函数:

/*
 * ip_fib_init - set up the IPv4 FIB and subscribe it to device events.
 *
 * With CONFIG_IP_MULTIPLE_TABLES the rule engine is initialised;
 * otherwise the local and main tables are created directly.  In both
 * configurations the FIB then registers its netdevice notifier and its
 * inet-address notifier.
 */
void __init ip_fib_init(void)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
    fib_rules_init();
#else
    ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
    ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN);
#endif

    /* Hook into device events and address events. */
    register_netdevice_notifier(&fib_netdev_notifier);
    register_inetaddr_notifier(&fib_inetaddr_notifier);
}

register_netdevice_notifier是做什么的呢?往下跟踪:

int register_netdevice_notifier(struct notifier_block *nb)

{

     struct net_device *dev;

     int err;

 

     rtnl_lock();

     //注册通知链

     err = notifier_chain_register(&netdev_chain, nb);

     if (!err) {

         for (dev = dev_base; dev; dev = dev->next) {

              nb->notifier_call(nb, NETDEV_REGISTER, dev);

 

              if (dev->flags & IFF_UP)

                   nb->notifier_call(nb, NETDEV_UP, dev);

         }

     }

     rtnl_unlock();

     return err;

}

呵呵,它在netdev_chain上注册了通知块,并且立即为已存在的每个设备向新注册的通知块补发NETDEV_REGISTER(以及设备处于UP状态时的NETDEV_UP)事件。此后当此链上有事件发生时,会调用fib_netdev_notifier中注册的处理函数,看一下fib_netdev_notifier的信息:

 

/*
 * Notifier block that ip_fib_init() registers on netdev_chain through
 * register_netdevice_notifier(); events delivered to this block are
 * handled by fib_netdev_event().
 */
struct notifier_block fib_netdev_notifier = {

     .notifier_call =fib_netdev_event,

};

 

OK,现在越来越具体了,如果netdev_chain有事件,会调用fib_netdev_event处理。继续跟踪:

static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)

{

     struct net_device *dev = ptr;

     struct in_device *in_dev = __in_dev_get(dev);

     //设备注销

     if (event == NETDEV_UNREGISTER) {

         fib_disable_ip(dev, 2);

         return NOTIFY_DONE;

     }

 

     if (!in_dev)

         return NOTIFY_DONE;

 

     switch (event) {

     //设备UP

     case NETDEV_UP:

         for_ifa(in_dev) {

              fib_add_ifaddr(ifa);

         } endfor_ifa(in_dev);

#ifdef CONFIG_IP_ROUTE_MULTIPATH

         fib_sync_up(dev);

#endif

         rt_cache_flush(-1);

         break;

     //设备DOWN

     case NETDEV_DOWN:

         fib_disable_ip(dev, 0);

         break;

     //设备参数改变

     case NETDEV_CHANGEMTU:

     case NETDEV_CHANGE:

         rt_cache_flush(0);

         break;

     }

     return NOTIFY_DONE;

}



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值