SKB(socket buffer)是Linux网络代码中最根本的数据结构,收发数据包都是通过skb。
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
//首要的这两个字段是为了实现链表操作,最近的版本没有了struct sk_buff_head *list;是否意味着packet都在同一个队列上??
ktime_t tstamp;
//记录时间戳,计算这个字段代价很大,所以必要的时候才设置;
struct sock *sk;
struct sock *sk;
//记录这个SKB关联的套接字,当在某一个套接字上收发一个packet时,与之相关的内存会得到分配
//(
socket packet buffer memory accounting??)
struct net_device *dev;
//具体的网络设备;
/* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
/* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
//SKB控制块,该透明存储区用来存每个packet的私有信息,如TCP可以用来放序列号和帧的重传状态;
unsigned long _skb_refdst;
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
unsigned int len,
data_len;
unsigned long _skb_refdst;
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
unsigned int len,
data_len;
//SKB是由一个线性缓冲区和可选的页缓存构成,len是packet的总长度,data_len是页缓存中字节长度;
//所以线性缓存中数据长度: skb->len - skb->data_len ,有函数 skb_headlen(skb)实现;
__u16 mac_len,
hdr_len;
//mac_len通常没有必要维护,除非为了实现IP tunnel的IPSEC解分装过程;这个字段初次初始化是在函数
//'netif_receive_skb()'中通过 skb->nh.raw - skb->mac.raw 得到的。
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
//校验和字段,如果网络设备具备计算csum功能的话,我们可以忽略;
__u32 priority; //QoS优先级;
kmemcheck_bitfield_begin(flags1);
__u8 local_df:1,
cloned:1,
ip_summed:2,
nohdr:1,
nfctinfo:3;
//local_df如果设置的话意味着我们可以在本地对已经分片的帧再次分片,这种情况在IPSEC中可能发生;
//在SKB克隆的时候,struct sk_buff会重新分配,但是数据区是共享的,cloned字段会在原SKB和克隆之后的结构中设置;
//ip_summed表征的是该设备提供的计算校验和的机制,这2代表的就是CHECKSUM_UNNECESSARY,这三个宏定义在skbuff.h中;
//NONE:设备为这个packet计算校验和失败skb_csum未定义;HW:这是最常见的方式,对所有的packet设备会提供校验和,在
//netif_rx中会设置到skb_csum中(注意即使设备仅支持有限的协议也必须使用HW而不是UNNECESSARY);
//UNNECESSARY通常发生在是环回接口的时候或者仅仅是一种通告而没有计算;
//nohdr字段是为了支持TSO,利用这种方式而不是改变TCP/IP头中的字段来表示硬件能够完成分段的功能,可以不为packet sniffers所见;
__u8 pkt_type:3,
fclone:2,
ipvs_property:1,
peeked:1,
nf_trace:1;
kmemcheck_bitfield_end(flags1);
//pkt_type包类型表示这个包发给谁,有PACKET_HOST,PACKET_BROADCAST,PACKET_MULTICAST,PACKET_OTHERHOST,
//PACKET_OUTGOING(这里的3代表发往的是其他主机)
__be16 protocol;
//表征即将进入的下一层的协议类型,该字段最初由例程 eth_type_trans()设置;
void (*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge;
#endif
int skb_iif;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
__u32 rxhash;
__u16 queue_mapping;
kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ooo_okay:1;
__u8 l4_rxhash:1;
kmemcheck_bitfield_end(flags2);
/* 0/13 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
};
__u16 vlan_tci; //vlan tag control information
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge;
#endif
int skb_iif;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
__u32 rxhash;
__u16 queue_mapping;
kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ooo_okay:1;
__u8 l4_rxhash:1;
kmemcheck_bitfield_end(flags2);
/* 0/13 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
};
__u16 vlan_tci;
sk_buff_data_t transport_header;
sk_buff_data_t network_header;
sk_buff_data_t mac_header;
void (*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge;
#endif
int skb_iif;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
__u32 rxhash;
__u16 queue_mapping;
kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ooo_okay:1;
__u8 l4_rxhash:1;
kmemcheck_bitfield_end(flags2);
/* 0/13 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
};
__u16 vlan_tci; //vlan tag control information
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
struct sk_buff *nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
struct nf_bridge_info *nf_bridge;
#endif
int skb_iif;
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
__u32 rxhash;
__u16 queue_mapping;
kmemcheck_bitfield_begin(flags2);
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ooo_okay:1;
__u8 l4_rxhash:1;
kmemcheck_bitfield_end(flags2);
/* 0/13 bit hole */
#ifdef CONFIG_NET_DMA
dma_cookie_t dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
};
__u16 vlan_tci;
sk_buff_data_t transport_header;
sk_buff_data_t network_header;
sk_buff_data_t mac_header;
//传输层,网络层,链路层协议头,这三个字段在逐层解析packet的时候会设置; sk_buff_data_t依赖具体硬件
//如果sizeof(long)>4,那么定义为unsigned int 否则为 unsigned char *;
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head, *data;
//这四个域的存在是为了管理SKB的线性数据区;
unsigned int truesize;
//truesize标示这个packet所消耗的系统内存大小,包括skb结构体+数据缓冲区;
atomic_t users;
//引用计数,调用 skb_get()可以增加引用,反之 kfree_skb();
};
};
在skb->end所指向的位置(即线性数据区的末尾)存放的是SKB的共享信息结构 struct skb_shared_info,其定义如下:
* the end of the header data, ie. at skb->end.
*/
struct skb_shared_info {
unsigned short nr_frags;
//分段的数目,即是数据frags的大小;
unsigned short gso_size; //标示是否支持generic segmentation offload;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
unsigned short gso_type;
__be32 ip6_frag_id;
__u8 tx_flags;
struct sk_buff *frag_list; //IP 分段列表,报文分段需要;
struct skb_shared_hwtstamps hwtstamps;
/*
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref; //引用计数;
/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS]; // 实际的数据页面;
};
unsigned short gso_size; //标示是否支持generic segmentation offload;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
unsigned short gso_type;
__be32 ip6_frag_id;
__u8 tx_flags;
struct sk_buff *frag_list; //IP 分段列表,报文分段需要;
struct skb_shared_hwtstamps hwtstamps;
/*
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref; //引用计数;
/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS]; // 实际的数据页面;
};
typedef struct skb_frag_struct skb_frag_t;

/*
 * One page fragment: a page pointer together with an offset and a size.
 * The offset/size fields are 32 bits wide when BITS_PER_LONG > 32 or
 * PAGE_SIZE >= 65536, and 16 bits wide otherwise.
 */
struct skb_frag_struct {
	struct {
		struct page *p;
	} page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
	__u32 page_offset;
	__u32 size;
#else
	__u16 page_offset;
	__u16 size;
#endif
};