http://blog.csdn.net/myarrow/article/details/9251635
1. ISO参考模型与TCP/IP参考模型
2. 在kernel中如何管理网络包(Network Packets)
2.1 定义Socket Buffers
Socket Buffers 由以下两部分组成:
1) Packet data: 它是在网络上传输的数据,其存储位置对应于PDU (Protocol Data Unit)
2) Management data: 当包在kernel中进行处理时,kernel需要另外一些数据,如pointer, timers等,它们是协议实体间交换信息的ICI(Interface Control Information)
Socket Buffer构成如下图所示:
在Kernel处理过程中,网络数据以Socket Buffer的形式存在。
当app通过socket发送数据时,socket将创建一个对应的socket buffer,并把需要发送的数据(payload)放于其中。当它通过各个协议层时,每一层的包头将被插入到payload的前面;由于在创建socket buffer时已为包头预留了足够空间,插入包头时无需移动或重新拷贝payload。按此方案,payload只被copy两次:
1) 从用户空间copy到kernel空间
2) 发送数据到network adapter
在协议层间传递时,其数据变化如下图所示:
Socket Buffer数据结构如下所示:
- struct sk_buff {
- /* These two members must be first. */
- struct sk_buff *next;
- struct sk_buff *prev;
- ktime_t tstamp;
- struct sock *sk;
- struct net_device *dev;
- /*
- * This is the control buffer. It is free to use for every
- * layer. Please put your private variables there. If you
- * want to keep them across layers you have to do a skb_clone()
- * first. This is owned by whoever has the skb queued ATM.
- */
- char cb[48] __aligned(8);
- unsigned long _skb_refdst;
- #ifdef CONFIG_XFRM
- struct sec_path *sp;
- #endif
- unsigned int len,
- data_len;
- __u16 mac_len,
- hdr_len;
- union {
- __wsum csum;
- struct {
- __u16 csum_start;
- __u16 csum_offset;
- };
- };
- __u32 priority;
- kmemcheck_bitfield_begin(flags1);
- __u8 local_df:1,
- cloned:1,
- ip_summed:2,
- nohdr:1,
- nfctinfo:3;
- __u8 pkt_type:3,
- fclone:2,
- ipvs_property:1,
- peeked:1,
- nf_trace:1;
- kmemcheck_bitfield_end(flags1);
- __be16 protocol;
- void (*destructor)(struct sk_buff *skb);
- #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
- struct nf_conntrack *nfct;
- #endif
- #ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
- struct sk_buff *nfct_reasm;
- #endif
- #ifdef CONFIG_BRIDGE_NETFILTER
- struct nf_bridge_info *nf_bridge;
- #endif
- int skb_iif;
- #ifdef CONFIG_NET_SCHED
- __u16 tc_index; /* traffic control index */
- #ifdef CONFIG_NET_CLS_ACT
- __u16 tc_verd; /* traffic control verdict */
- #endif
- #endif
- __u32 rxhash;
- __u16 queue_mapping;
- kmemcheck_bitfield_begin(flags2);
- #ifdef CONFIG_IPV6_NDISC_NODETYPE
- __u8 ndisc_nodetype:2;
- #endif
- __u8 ooo_okay:1;
- kmemcheck_bitfield_end(flags2);
- /* 0/13 bit hole */
- #ifdef CONFIG_NET_DMA
- dma_cookie_t dma_cookie;
- #endif
- #ifdef CONFIG_NETWORK_SECMARK
- __u32 secmark;
- #endif
- union {
- __u32 mark;
- __u32 dropcount;
- };
- __u16 vlan_tci;
- sk_buff_data_t transport_header; //传输层头
- sk_buff_data_t network_header; //网络层头
- sk_buff_data_t mac_header; //链路层头
- /* These elements must be at the end, see alloc_skb() for details. */
- sk_buff_data_t tail;
- sk_buff_data_t end;
- unsigned char *head,
- *data;
- unsigned int truesize;
- atomic_t users;
- }
对Socket Buffers的操作分为以下三类:
1) 创建、释放和复制socket buffers
2) 操作sk_buff结构中的参数和指针,主要是改变包数据空间的操作
3) 管理socket buffer队列
2.1.1 创建、释放和复制socket buffers
其相关函数如下所示:
alloc_skb() include/linux/skbuff.h
dev_alloc_skb() net/core/skbuff.c
skb_copy() net/core/skbuff.c
skb_copy_expand() net/core/skbuff.c
skb_clone() net/core/skbuff.c
kfree_skb() net/core/skbuff.c
dev_kfree_skb() include/linux/skbuff.h
kfree_skbmem() net/core/skbuff.c
2.1.2 操作包数据空间
include/linux/skbuff.h
skb_get()
skb_unshare()
skb_put()
skb_push()
skb_pull()
skb_tailroom()
skb_headroom()
skb_realloc_headroom()
skb_reserve()
skb_trim()
skb_cow()
2.2 定义Socket-Buffer Queues
如果Socket Buffer当前没有被处理,则它由sk_buff_head管理;sk_buff_head通过双向链表把这些Socket Buffer组织成队列,如下图所示:
- struct sk_buff_head {
- /* These two members must be first. */
- struct sk_buff *next;
- struct sk_buff *prev;
- __u32 qlen;
- spinlock_t lock;
- };
2.2.1 队列操作
include/linux/skbuff.h
skb_queue_head_init()
skb_queue_empty()
skb_queue_len()
2.2.2 队列中socket Buffer操作
include/linux/skbuff.h
skb_queue_head()
skb_queue_tail()
skb_dequeue()
skb_dequeue_tail()
skb_queue_purge()
skb_insert()
skb_append()
skb_unlink()
skb_peek()
skb_peek_tail()
3. Network Devices
在Linux系统中的网络架构,基于软件的协议(software-based protocol)与网络适配器(network adapters)间的接口通过network devices来实现。一个network-device接口需要满足以下要求:
1) 是network adapter的技术抽象
2) 提供统一的接口供协议实体访问
3.1 net_device定义
【网络设备】不同于【字符设备】和【块设备】,其主要区别如下:
1) 网络设备在/dev下不存在对应的设备名,即不可通过read和write进行读写操作
2) 网络设备基于包进行处理,且必须经过复杂协议的处理(如TCP和UDP)
net_device定义如下:
- struct net_device {
- /*
- * This is the first field of the "visible" part of this structure
- * (i.e. as seen by users in the "Space.c" file). It is the name
- * of the interface.
- */
- char name[IFNAMSIZ];
- struct pm_qos_request_list pm_qos_req;
- /* device name hash chain */
- struct hlist_node name_hlist;
- /* snmp alias */
- char *ifalias;
- // 硬件相关的信息
- /*
- * I/O specific fields
- * FIXME: Merge these and struct ifmap into one
- */
- unsigned long mem_end; /* shared mem end */
- unsigned long mem_start; /* shared mem start */
- unsigned long base_addr; /* device I/O address */
- unsigned int irq; /* device IRQ number */
- /*
- * Some hardware also needs these fields, but they are not
- * part of the usual set specified in Space.c.
- */
- unsigned long state;
- struct list_head dev_list;
- struct list_head napi_list;
- struct list_head unreg_list;
- /* currently active device features */
- u32 features;
- /* user-changeable features */
- u32 hw_features;
- /* user-requested features */
- u32 wanted_features;
- /* mask of features inheritable by VLAN devices */
- u32 vlan_features;
- /* Interface index. Unique device identifier */
- int ifindex;
- int iflink;
- struct net_device_stats stats;
- atomic_long_t rx_dropped; /* dropped packets by core network
- * Do not use this in drivers.
- */
- // 管理操作
- /* Management operations */
- const struct net_device_ops *netdev_ops; // 最终调用网络设备驱动方法
- const struct ethtool_ops *ethtool_ops;
- // 硬件头描述
- /* Hardware header description */
- const struct header_ops *header_ops;
- unsigned int flags; /* interface flags (a la BSD) */
- unsigned int priv_flags; /* Like 'flags' but invisible to userspace. */
- unsigned short gflags;
- unsigned short padded; /* How much padding added by alloc_netdev() */
- unsigned char operstate; /* RFC2863 operstate */
- unsigned char link_mode; /* mapping policy to operstate */
- unsigned char if_port; /* Selectable AUI, TP,..*/
- unsigned char dma; /* DMA channel */
- unsigned int mtu; /* interface MTU value */
- unsigned short type; /* interface hardware type */
- unsigned short hard_header_len; /* hardware hdr length */
- /* extra head- and tailroom the hardware may need, but not in all cases
- * can this be guaranteed, especially tailroom. Some cases also use
- * LL_MAX_HEADER instead to allocate the skb.
- */
- unsigned short needed_headroom;
- unsigned short needed_tailroom;
- // 接口地址信息
- /* Interface address info. */
- unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
- unsigned char addr_assign_type; /* hw address assignment type */
- unsigned char addr_len; /* hardware address length */
- unsigned short dev_id; /* for shared network cards */
- spinlock_t addr_list_lock;
- struct netdev_hw_addr_list uc; /* Unicast mac addresses */
- struct netdev_hw_addr_list mc; /* Multicast mac addresses */
- int uc_promisc;
- unsigned int promiscuity;
- unsigned int allmulti;
- // 协议相关的指针
- /* Protocol specific pointers */
- void *atalk_ptr; /* AppleTalk link */
- struct in_device __rcu *ip_ptr; /* IPv4 specific data */
- struct dn_dev __rcu *dn_ptr; /* DECnet specific data */
- struct inet6_dev __rcu *ip6_ptr; /* IPv6 specific data */
- void *ec_ptr; /* Econet specific data */
- void *ax25_ptr; /* AX.25 specific data */
- struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
- assign before registering */
- // 在接收通道中需要缓存的数据
- /*
- * Cache lines mostly used on receive path (including eth_type_trans())
- */
- unsigned long last_rx; /* Time of last Rx
- * This should not be set in
- * drivers, unless really needed,
- * because network stack (bonding)
- * use it if/when necessary, to
- * avoid dirtying this cache line.
- */
- struct net_device *master; /* Pointer to master device of a group,
- * which this device is member of.
- */
- /* Interface address info used in eth_type_trans() */
- unsigned char *dev_addr; /* hw address, (before bcast
- because most packets are
- unicast) */
- struct netdev_hw_addr_list dev_addrs; /* list of device
- hw addresses */
- unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
- #ifdef CONFIG_RPS
- struct kset *queues_kset;
- struct netdev_rx_queue *_rx;
- /* Number of RX queues allocated at register_netdev() time */
- unsigned int num_rx_queues;
- /* Number of RX queues currently active in device */
- unsigned int real_num_rx_queues;
- #ifdef CONFIG_RFS_ACCEL
- /* CPU reverse-mapping for RX completion interrupts, indexed
- * by RX queue number. Assigned by driver. This must only be
- * set if the ndo_rx_flow_steer operation is defined. */
- struct cpu_rmap *rx_cpu_rmap;
- #endif
- #endif
- rx_handler_func_t __rcu *rx_handler;
- void __rcu *rx_handler_data;
- struct netdev_queue __rcu *ingress_queue;
- // 在发送通道中需要缓存的数据
- /*
- * Cache lines mostly used on transmit path
- */
- struct netdev_queue *_tx ____cacheline_aligned_in_smp;
- /* Number of TX queues allocated at alloc_netdev_mq() time */
- unsigned int num_tx_queues;
- /* Number of TX queues currently active in device */
- unsigned int real_num_tx_queues;
- /* root qdisc from userspace point of view */
- struct Qdisc *qdisc;
- unsigned long tx_queue_len; /* Max frames per queue allowed */
- spinlock_t tx_global_lock;
- #ifdef CONFIG_XPS
- struct xps_dev_maps __rcu *xps_maps;
- #endif
- /* These may be needed for future network-power-down code. */
- /*
- * trans_start here is expensive for high speed devices on SMP,
- * please use netdev_queue->trans_start instead.
- */
- unsigned long trans_start; /* Time (in jiffies) of last Tx */
- int watchdog_timeo; /* used by dev_watchdog() */
- struct timer_list watchdog_timer;
- /* Number of references to this device */
- int __percpu *pcpu_refcnt;
- /* delayed register/unregister */
- struct list_head todo_list;
- /* device index hash chain */
- struct hlist_node index_hlist;
- struct list_head link_watch_list;
- /* register/unregister state machine */
- enum { NETREG_UNINITIALIZED=0,
- NETREG_REGISTERED, /* completed register_netdevice */
- NETREG_UNREGISTERING, /* called unregister_netdevice */
- NETREG_UNREGISTERED, /* completed unregister todo */
- NETREG_RELEASED, /* called free_netdev */
- NETREG_DUMMY, /* dummy device for NAPI poll */
- } reg_state:8;
- bool dismantle; /* device is going do be freed */
- enum {
- RTNL_LINK_INITIALIZED,
- RTNL_LINK_INITIALIZING,
- } rtnl_link_state:16;
- /* Called from unregister, can be used to call free_netdev */
- void (*destructor)(struct net_device *dev);
- #ifdef CONFIG_NETPOLL
- struct netpoll_info *npinfo;
- #endif
- #ifdef CONFIG_NET_NS
- /* Network namespace this network device is inside */
- struct net *nd_net;
- #endif
- /* mid-layer private */
- union {
- void *ml_priv;
- struct pcpu_lstats __percpu *lstats; /* loopback stats */
- struct pcpu_tstats __percpu *tstats; /* tunnel stats */
- struct pcpu_dstats __percpu *dstats; /* dummy stats */
- };
- /* GARP */
- struct garp_port __rcu *garp_port;
- /* class/net/name entry */
- struct device dev;
- /* space for optional device, statistics, and wireless sysfs groups */
- const struct attribute_group *sysfs_groups[4];
- /* rtnetlink link ops */
- const struct rtnl_link_ops *rtnl_link_ops;
- /* for setting kernel sock attribute on TCP connection setup */
- #define GSO_MAX_SIZE 65536
- unsigned int gso_max_size;
- u8 num_tc;
- struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
- u8 prio_tc_map[TC_BITMASK + 1];
- /* n-tuple filter list attached to this device */
- struct ethtool_rx_ntuple_list ethtool_ntuple_list;
- /* phy device may attach itself for hardware timestamping */
- struct phy_device *phydev;
- /* group the device belongs to */
- int group;
- }
net_device是每个网络设备的基础,它不仅包含network adapter硬件信息(如:interrupt, ports, driver functions等),也包含高层网络协议的配置数据(如:IP address, subnet mask等).
在/sys/class/net下列出了所有网络设备的名字,如我的为:
shell@android:/sys/class/net # ll
lrwxrwxrwx root root 2013-07-05 17:08 ip6tnl0
lrwxrwxrwx root root 2013-07-05 17:08 lo (loopback设备)
lrwxrwxrwx root root 2013-07-05 17:08 sit0
lrwxrwxrwx root root 2000-01-01 08:00 wlan0 (Wifi设备)
3.2 管理net_device
本节从上层协议实例的角度来看net_device的管理接口。
3.2.1 注册和注销网络设备(net_device)
位于文件:kernel/net/core/dev.c
int register_netdev(struct net_device *dev)
void unregister_netdev(struct net_device *dev)
网络设备(net_device)与一个已经存在的network adapter一一对应。
3.2.2 打开和关闭网络设备(net_device)
位于文件:kernel/net/core/dev.c
int dev_open(struct net_device *dev)
int dev_close(struct net_device *dev)
3.2.3 创建、释放和查找网络设备(net_device)
位于文件:kernel/net/core/dev.c
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
void free_netdev(struct net_device *dev)
int dev_alloc_name(struct net_device *dev, const char *name)
struct net_device *dev_get_by_index(struct net *net, int ifindex)
struct net_device *dev_get_by_name(struct net *net, const char *name)
void dev_load(struct net *net, const char *name)
3.2.4 网络设备通知上层协议状态变化
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
网络状态有如下值:
NETDEV_UP: 激活一个网络设备 (dev_open)
NETDEV_DOWN: 禁止一个网络设备 (dev_close)
NETDEV_CHANGE: 通知网络设备状态变化
NETDEV_REGISTER: 网络设备已经被注册,但是没有打开实例
NETDEV_UNREGISTER: 网络设备已经被删除
NETDEV_CHANGEMTU: 网络设备MTU被修改
NETDEV_CHANGEADDR: 网络设备硬件地址被修改
NETDEV_CHANGENAME: 网络设备名字被修改
3.2.5 通过net_device发送数据
int dev_queue_xmit(struct sk_buff *skb) // kernel/net/core/dev.c
它由高层的协议实例调用,以通过一个net_device(skb->dev)发送一个socket buffer.
3.3 net_device_ops
- struct net_device_ops {
- int (*ndo_init)(struct net_device *dev);
- void (*ndo_uninit)(struct net_device *dev);
- int (*ndo_open)(struct net_device *dev);
- int (*ndo_stop)(struct net_device *dev);
- netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb,
- struct net_device *dev);
- u16 (*ndo_select_queue)(struct net_device *dev,
- struct sk_buff *skb);
- void (*ndo_change_rx_flags)(struct net_device *dev,
- int flags);
- void (*ndo_set_rx_mode)(struct net_device *dev);
- void (*ndo_set_multicast_list)(struct net_device *dev);
- int (*ndo_set_mac_address)(struct net_device *dev,
- void *addr);
- int (*ndo_validate_addr)(struct net_device *dev);
- int (*ndo_do_ioctl)(struct net_device *dev,
- struct ifreq *ifr, int cmd);
- int (*ndo_set_config)(struct net_device *dev,
- struct ifmap *map);
- int (*ndo_change_mtu)(struct net_device *dev,
- int new_mtu);
- int (*ndo_neigh_setup)(struct net_device *dev,
- struct neigh_parms *);
- void (*ndo_tx_timeout) (struct net_device *dev);
- struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
- struct rtnl_link_stats64 *storage);
- struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
- void (*ndo_vlan_rx_register)(struct net_device *dev,
- struct vlan_group *grp);
- void (*ndo_vlan_rx_add_vid)(struct net_device *dev,
- unsigned short vid);
- void (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
- unsigned short vid);
- #ifdef CONFIG_NET_POLL_CONTROLLER
- void (*ndo_poll_controller)(struct net_device *dev);
- int (*ndo_netpoll_setup)(struct net_device *dev,
- struct netpoll_info *info);
- void (*ndo_netpoll_cleanup)(struct net_device *dev);
- #endif
- int (*ndo_set_vf_mac)(struct net_device *dev,
- int queue, u8 *mac);
- int (*ndo_set_vf_vlan)(struct net_device *dev,
- int queue, u16 vlan, u8 qos);
- int (*ndo_set_vf_tx_rate)(struct net_device *dev,
- int vf, int rate);
- int (*ndo_get_vf_config)(struct net_device *dev,
- int vf,
- struct ifla_vf_info *ivf);
- int (*ndo_set_vf_port)(struct net_device *dev,
- int vf,
- struct nlattr *port[]);
- int (*ndo_get_vf_port)(struct net_device *dev,
- int vf, struct sk_buff *skb);
- int (*ndo_setup_tc)(struct net_device *dev, u8 tc);
- #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
- int (*ndo_fcoe_enable)(struct net_device *dev);
- int (*ndo_fcoe_disable)(struct net_device *dev);
- int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
- u16 xid,
- struct scatterlist *sgl,
- unsigned int sgc);
- int (*ndo_fcoe_ddp_done)(struct net_device *dev,
- u16 xid);
- int (*ndo_fcoe_ddp_target)(struct net_device *dev,
- u16 xid,
- struct scatterlist *sgl,
- unsigned int sgc);
- #define NETDEV_FCOE_WWNN 0
- #define NETDEV_FCOE_WWPN 1
- int (*ndo_fcoe_get_wwn)(struct net_device *dev,
- u64 *wwn, int type);
- #endif
- #ifdef CONFIG_RFS_ACCEL
- int (*ndo_rx_flow_steer)(struct net_device *dev,
- const struct sk_buff *skb,
- u16 rxq_index,
- u32 flow_id);
- #endif
- int (*ndo_add_slave)(struct net_device *dev,
- struct net_device *slave_dev);
- int (*ndo_del_slave)(struct net_device *dev,
- struct net_device *slave_dev);
- u32 (*ndo_fix_features)(struct net_device *dev,
- u32 features);
- int (*ndo_set_features)(struct net_device *dev,
- u32 features);
- }
4. Network Drivers
4.1 初始化网络适配器(Network Adapter)
在net_device被激活之前,我们必须找到一个匹配的network adapter。网络驱动(network driver)的初始化函数(init/probe)负责找到一个匹配的network adapter并且使用对应的信息初始化net_device。在驱动的probe函数中,主要完成以下任务:(参考:kernel/drivers/net/pci-skeleton.c)
1) 创建net_device
2) 填充相关的硬件信息
3) 调用register_netdev进行注册
4) 设置net_device->netdev_ops (netdev_ops由驱动实现;注意:由于register_netdev在注册过程中会访问netdev_ops,实际编码时此步必须在调用register_netdev之前完成)
4.2 打开、关闭网络适配器
1) 打开: ifconfig wlan0 up->ioctl->dev_open->net_device.netdev_ops.ndo_open
2) 关闭: ifconfig wlan0 down->ioctl->dev_close->net_device.netdev_ops.ndo_stop
4.3 发送数据
在驱动中实现与ndo_start_xmit对应的函数
4.4 接收数据
其流程如下图所示:
在网络驱动的中断处理函数中,首先调用dev_alloc_skb创建socket buffer,然后把接收到的数据copy到其中,最后调用netif_rx把socket buffer放入队列中,供协议层处理。
int netif_rx(struct sk_buff *skb) // kernel/net/core/dev.c
5. Data-Link Layer
1) 逻辑链路控制(LLC)层由Linux内核实现,网络适配器通过net_device连接到操作系统内核。
2) 数据链路层(Data-Link Layer)由LLC和MAC组成。LLC隐藏了所有不同传输介质的差异,从而为上层协议提供统一的接口;而MAC层则反映了不同传输技术(传输协议,如802.3与802.11不同)的差异。
逻辑链路控制 LLC (Logical Link Control)子层
媒体接入控制 MAC (Medium Access Control)子层
与接入到传输媒体有关的内容都放在 MAC子层,而LLC 子层则与传输媒体无关,不管采用何种协议的局域网对 LLC 子层来说都是透明的。
5.1 接收网络包流程
5.2 发送网络包流程