1 内核中分层模型的实现
由于内核网络子系统要处理大量特定于协议的细节和微妙之处,只有采用分层[每层功能单一]、并通过层与层之间大量标准化的函数指针衔接这种架构方式实现,才能做到尽可能简洁高效,但这也使得代码路径变得不够清晰
自顶向下分层:
用户空间 应用程序+c标准库
内核应用层 struct socket[面向用户] + struct sock[面向内核协议栈]
内核传输层 struct proto
内核网络层 struct packet_type |特定于协议
内核主机到网络层 dev.c|struct net_device|drive.c <--> 物理传输
2 数据传输方式的封装
一个以太网帧结构:
MAC首部|ip首部|tcp首部|http首部|html数据
<-以太网帧净荷
<-ip净荷
<-tcp净荷
3 网络命名空间
命名空间类似c++语言的namespace机制,利用命名空间,建立系统的多个虚拟视图,彼此隔离,使每个实例看起来像一台运行的linux服务器。大多数计算机目前只需要一个命名空间,由全局变量init_net描述。在linux2.6.32版本中,命名空间结构如下:
struct net {
atomic_t count; /* To decided when the network namespace should be freed.*/
#ifdef NETNS_REFCNT_DEBUG
atomic_t use_count; /* To track references we
* destroy on demand
*/
#endif
struct list_head list; /* list of network namespaces */ 头部为net_namespace_list
struct list_head cleanup_list; /* namespaces on death row */
struct list_head exit_list; /* Use only net_mutex */
struct proc_dir_entry *proc_net; //该namespace /proc/net项结构
struct proc_dir_entry *proc_net_stat;
#ifdef CONFIG_SYSCTL
struct ctl_table_set sysctls;
#endif
struct net_device *loopback_dev; /* The loopback */
struct list_head dev_base_head; //挂在该命名空间中的所有的网络设备net_device
struct hlist_head *dev_name_head;
struct hlist_head *dev_index_head;
/* core fib_rules */
struct list_head rules_ops;
spinlock_t rules_mod_lock;
struct sock *rtnl; /* rtnetlink socket */
struct sock *genl_sock;
struct netns_core core; //特定于协议的结构
struct netns_mib mib;
struct netns_packet packet;
struct netns_unix unx;
struct netns_ipv4 ipv4;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct netns_ipv6 ipv6;
#endif
#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
struct netns_dccp dccp;
#endif
#ifdef CONFIG_NETFILTER
struct netns_xt xt;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
#endif
struct sock *nfnl;
struct sock *nfnl_stash;
#endif
#ifdef CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
#ifdef CONFIG_WEXT_CORE
struct sk_buff_head wext_nlevents;
#endif
struct net_generic *gen;
};
4 套接字缓冲区
每次发送或接收数据时,该数据对应一个套接字缓冲区(struct sk_buff),网络子系统的各层对其进行处理时无需来回复制数据,从而大幅提高性能。套接字缓冲区的基本思想:通过移动指针来增删协议首部
/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
* @prev: Previous buffer in list
* @sk: Socket we are owned by
* @tstamp: Time we arrived
* @dev: Device we arrived on/are leaving by
* @transport_header: Transport layer header
* @network_header: Network layer header
* @mac_header: Link layer header
* @_skb_refdst: destination entry (with norefcount bit)
* @sp: the security path, used for xfrm
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @len: Length of actual data
* @data_len: Data length
* @mac_len: Length of link layer header
* @hdr_len: writable header length of cloned skb
* @csum: Checksum (must include start/offset pair)
* @csum_start: Offset from skb->head where checksumming should start
* @csum_offset: Offset from csum_start where checksum should be stored
* @local_df: allow local fragmentation
* @cloned: Head may be cloned (check refcnt to be sure)
* @nohdr: Payload reference only, must not modify header
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ip_summed: Driver fed us an IP checksum
* @priority: Packet queueing priority
* @users: User count - see {datagram,tcp}.c
* @protocol: Packet protocol from driver
* @truesize: Buffer size
* @head: Head of buffer
* @data: Data head pointer
* @tail: Tail pointer
* @end: End pointer
* @destructor: Destruct function
* @mark: Generic packet mark
* @nfct: Associated connection, if any
* @ipvs_property: skbuff is owned by ipvs
* @peeked: this packet has been seen already, so stats have been
* done for it, don't do them again
* @nf_trace: netfilter packet trace flag
* @nfctinfo: Relationship of this skb to the connection
* @nfct_reasm: netfilter conntrack re-assembly pointer
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
* @rxhash: the packet hash computed on receive
* @queue_mapping: Queue mapping for multiqueue devices
* @tc_index: Traffic control index
* @tc_verd: traffic control verdict
* @ndisc_nodetype: router type (from link layer)
* @dma_cookie: a cookie to one of several possible DMA operations
* done by skb DMA functions
* @secmark: security marking
* @vlan_tci: vlan tag control information
*/
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next; // links this buffer into a doubly-linked sk_buff_head list
struct sk_buff *prev;
ktime_t tstamp; // time the packet arrived
struct sock *sk; // owning socket (valid on the transmit path)
struct net_device *dev; // device we arrived on / are leaving by
/*
 * This is the control buffer. It is free to use for every
 * layer. Please put your private variables there. If you
 * want to keep them across layers you have to do a skb_clone()
 * first. This is owned by whoever has the skb queued ATM.
 */
char cb[48] __aligned(8); // per-layer scratch space for passing private data between stack layers
unsigned long _skb_refdst;
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
unsigned int len, // total data length: linear buffer plus all fragments
data_len; // length of the fragment data only
__u16 mac_len, // length of the link-layer (MAC) header
hdr_len;
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
__u32 priority;
kmemcheck_bitfield_begin(flags1);
__u8 local_df:1,
cloned:1,
ip_summed:2,
nohdr:1,
nfctinfo:3;
__u8 pkt_type:3,// packet class: for this host, other host, multicast, broadcast, ...
fclone:2,
ipvs_property:1,
peeked:1,
nf_trace:1;
kmemcheck_bitfield_end(flags1);
__be16 protocol; // frame type from the driver, e.g. ETH_P_IP (IP) or ETH_P_ARP (ARP)
void (*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge;
#endif
int skb_iif;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
__u32 rxhash;
kmemcheck_bitfield_begin(flags2);
__u16 queue_mapping:16;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2,
deliver_no_wcard:1;
#else
__u8 deliver_no_wcard:1;
#endif
kmemcheck_bitfield_end(flags2);
/* 0/14 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
};
__u16 vlan_tci;
/* sk_buff_data_t is an offset into the buffer (typedef unsigned int) when
 * NET_SKBUFF_DATA_USES_OFFSET is set; otherwise a pointer. */
sk_buff_data_t transport_header; // transport-layer header (offset/position within the skb data area)
sk_buff_data_t network_header; // network-layer header
sk_buff_data_t mac_header; // link-layer header
/* These elements must be at the end, see alloc_skb() for details. */
// head/end bound the allocated data area; data/tail bound the *current* protocol
// payload (MAC+IP+TCP+content) and move as the skb travels through the layers
sk_buff_data_t tail;
unsigned char*data;
sk_buff_data_t end;
unsigned char *head; // start of the allocated data area (larger than strictly needed)
unsigned int truesize;
atomic_t users;
};
4-1 从该套接字缓冲区中获取协议头部结构
/*
 * Return the TCP header of @skb.
 * NOTE(review): as written this casts transport_header directly; when
 * NET_SKBUFF_DATA_USES_OFFSET is set, transport_header is an offset and the
 * real kernel computes skb->head + skb->transport_header
 * (via skb_transport_header()) — confirm against the kernel version in use.
 */
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
	return (struct tcphdr *)skb->transport_header; /* fixed: missing ';' */
}
类似的还有:udp_hdr、ip_hdr等
4-2 操作sk_buff的标准函数:
分配一个sk_buff结构然后分配一个size大小的数据缓冲区
/*
 * Allocate an sk_buff together with a data buffer of @size bytes.
 * Thin convenience wrapper around __alloc_skb().
 */
static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)
{
	/* fclone = 0: allocate from the regular head cache (no fast clone);
	 * node = -1: no NUMA node preference. */
	struct sk_buff *skb = __alloc_skb(size, priority, 0, -1);

	return skb;
}
/**
* __alloc_skb - allocate a network buffer
* @size: size to allocate
* @gfp_mask: allocation mask
* @fclone: allocate from fclone cache instead of head cache
* and allocate a cloned (child) skb
* @node: numa node to allocate memory on
*
* Allocate a new &sk_buff. The returned buffer has no headroom and a
* tail room of size bytes. The object has a reference count of one.
* The return is the buffer. On a failure the return is %NULL.
*
* Buffers may only be allocated from interrupts using a @gfp_mask of
* %GFP_ATOMIC.
*/
/*
 * Allocate an sk_buff head plus a data area of @size bytes (see the
 * kernel-doc block above). The skb_shared_info block lives immediately
 * after the aligned data area.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int fclone , int node)
{
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
/* Get the HEAD: the sk_buff struct itself normally comes from the
 * skbuff_head_cache slab cache (skbuff_fclone_cache for fast clones). */
skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
if (!skb)
goto out;
prefetchw(skb);
size = SKB_DATA_ALIGN(size); // align the data area; skb_shared_info sits right after skb->end
data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
gfp_mask, node);
if (!data)
goto nodata;
prefetchw(data + size);
/*
 * Only clear those fields we need to clear, not those that we will
 * actually initialise below. Hence, don't put any more fields after
 * the tail pointer in struct sk_buff!
 */
memset(skb, 0, offsetof(struct sk_buff, tail)); // zero everything before the tail member
skb->truesize = size + sizeof(struct sk_buff); // total footprint = data area + sk_buff struct
atomic_set(&skb->users, 1);
skb->head = data; // start of the data area
skb->data = data;
skb_reset_tail_pointer(skb); // no data yet, so tail starts equal to data
skb->end = skb->tail + size; // end of the (aligned) data area
kmemcheck_annotate_bitfield(skb, flags1);
kmemcheck_annotate_bitfield(skb, flags2);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
skb->mac_header = ~0U;
#endif
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb); // skb_shared_info immediately follows the data area
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
if (fclone) {
struct sk_buff *child = skb + 1;
atomic_t *fclone_ref = (atomic_t *) (child + 1);
kmemcheck_annotate_bitfield(child, flags1);
kmemcheck_annotate_bitfield(child, flags2);
skb->fclone = SKB_FCLONE_ORIG;
atomic_set(fclone_ref, 1);
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
out:
return skb;
nodata:
kmem_cache_free(cache, skb);
skb = NULL;
goto out;
}
struct sk_buff* skb_copy(const struct sk_buff* skb,gfp_t mask):新建一个sk_buff结构,复制数据缓冲内容
struct sk_buff* skb_clone(struct sk_buff* skb,gfp_t mask):新建一个sk_buff结构,共享数据缓冲内容
skb_tailroom() return int (skb->end-skb->tail)
skb_headroom() return int (skb->data-skb->head)
skb_realloc_headroom() 在数据起始处创建更多空闲空间,现存数据不变