struct sk_buff
一个封包就存储在这里,所有网络分层都会使用这个结构来储存其报头、有关用户数据(有效载荷)的信息,以及用来协调其工作的其他内部信息。
这个结构体是Linux网络代码中最重要的数据结构,代表已接收或者正要传输的数据的报头,此结构定义在include/linux/skbuff.h头文件中,由一大堆(heap)变量组成,试图满足所有人的需求。
在内核的演化历程中,这个结构历经多次变动,不但新增了选项,同时也重组了现存的字段,使得布局更为清晰,其字段可粗略划分为下列几个类型:
- 布局(Layout)
- 通用(General)
- 功能专用(Feature-specific)
- 管理函数(Management functions)
多个不同的网络分层(MAC或者L2分层上的另一个链路层协议,L3的IP以及L4的TCP或者UDP)都会使用这个结构,而且当该结构从一个分层传到另一个分层时,其不同的字段会随之发生变化,L4在传给L3之前会附加一个报头,而L3在传给L2之前又会加上其自己的报头,附加报头比起把数据从一个分层拷贝到另一个分层更有效率,由于要在一个缓冲区开端新增空间——也就是要改变指向该缓冲区的变量——是一种复杂的运算,内核提供skb_reserve函数来执行这一操作,所以,当缓冲区往下传经过每个分层,每层的协议首先要做的就是调用skb_reserve函数,为该协议的报头预留空间。
当缓冲区往上经过各个网络分层时,每个源自于旧分层的报头就不再有用处,例如,L2报头只由处理L2协议的设备驱动程序使用,所以对L3而言并无用处,不过,并没有把L2的报头从缓冲区删除,而是把指向有效载荷开端的指针向前移动到L3报头的开端,这样只需要很少的CPU周期。
//include/linux/skbuff.h
/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
* @prev: Previous buffer in list
* @tstamp: Time we arrived/left //收发包的时间戳
* @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
* for retransmit timer
* @rbnode: RB tree node, alternative to next/prev for netem/tcp
* @list: queue head
* @sk: Socket we are owned by
* @ip_defrag_offset: (aka @sk) alternate use of @sk, used in
* fragmentation management
* @dev: Device we arrived on/are leaving by
* @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @_skb_refdst: destination entry (with norefcount bit)
* @sp: the security path, used for xfrm
* @len: Length of actual data
* @data_len: Data length
* @mac_len: Length of link layer header
* @hdr_len: writable header length of cloned skb
* @csum: Checksum (must include start/offset pair)
* @csum_start: Offset from skb->head where checksumming should start
* @csum_offset: Offset from csum_start where checksum should be stored
* @priority: Packet queueing priority
* @ignore_df: allow local fragmentation
* @cloned: Head may be cloned (check refcnt to be sure)
* @ip_summed: Driver fed us an IP checksum
* @nohdr: Payload reference only, must not modify header
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ipvs_property: skbuff is owned by ipvs
* @inner_protocol_type: whether the inner protocol is
* ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
* @remcsum_offload: remote checksum offload is enabled
* @offload_fwd_mark: Packet was L2-forwarded in hardware
* @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
* @tc_skip_classify: do not classify packet. set by IFB device
* @tc_at_ingress: used within tc_classify to distinguish in/egress
* @redirected: packet was redirected by packet classifier
* @from_ingress: packet was redirected from the ingress path
* @peeked: this packet has been seen already, so stats have been
* done for it, don't do them again
* @nf_trace: netfilter packet trace flag
* @protocol: Packet protocol from driver
* @destructor: Destruct function
* @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
* @_nfct: Associated connection, if any (with nfctinfo bits)
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
* @tc_index: Traffic control index
* @hash: the packet hash
* @queue_mapping: Queue mapping for multiqueue devices
* @head_frag: skb was allocated from page fragments,
* not allocated by kmalloc() or vmalloc().
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
* @active_extensions: active extensions (skb_ext_id types)
* @ndisc_nodetype: router type (from link layer)
* @ooo_okay: allow the mapping of a socket to a queue to be changed
* @l4_hash: indicate hash is a canonical 4-tuple hash over transport
* ports.
* @sw_hash: indicates hash was computed in software stack
* @wifi_acked_valid: wifi_acked was set
* @wifi_acked: whether frame was acked on wifi or not
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
* @encapsulation: indicates the inner headers in the skbuff are valid
* @encap_hdr_csum: software checksum is needed
* @csum_valid: checksum is already valid
* @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
* @csum_complete_sw: checksum was completed by software
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS
* @secmark: security marking
* @mark: Generic packet mark
* @reserved_tailroom: (aka @mark) number of bytes of free space available
* at the tail of an sk_buff
* @vlan_present: VLAN tag is present
* @vlan_proto: vlan encapsulation protocol
* @vlan_tci: vlan tag control information
* @inner_protocol: Protocol (encapsulation)
* @inner_ipproto: (aka @inner_protocol) stores ipproto when
* skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
* @inner_transport_header: Inner transport layer header (encapsulation)
* @inner_network_header: Network layer header (encapsulation)
* @inner_mac_header: Link layer header (encapsulation)
* @transport_header: Transport layer header
* @network_header: Network layer header
* @mac_header: Link layer header
* @kcov_handle: KCOV remote handle for remote coverage collection
* @tail: Tail pointer
* @end: End pointer
* @head: Head of buffer
* @data: Data head pointer
* @truesize: Buffer size
* @users: User count - see {datagram,tcp}.c
* @extensions: allocated extensions, valid if active_extensions is nonzero
*/
/*
 * Socket buffer descriptor. One instance describes one packet; the
 * per-field summary lives in the kernel-doc block preceding this struct.
 */
struct sk_buff {
union {
struct {
/* These two members must be first. */
struct sk_buff *next; /* next socket buffer in the sk_buff list */
struct sk_buff *prev; /* previous socket buffer in the sk_buff list */
union {
struct net_device *dev; /* network device this packet arrived on / is leaving by */
/* Some protocols might use this space to store information,
* while device pointer would be NULL.
* UDP receive path is one user.
*/
unsigned long dev_scratch;
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag (fragment reassembly), and tcp stack */
struct list_head list; /* kernel list node; lets code locate the queue head (sk_buff_head) quickly */
};
union {
struct sock *sk; /* socket that owns this buffer; per the notes in this file it is NULL when the buffer is merely being forwarded */
int ip_defrag_offset; /* alternate use of the same storage during fragment management */
};
union {
ktime_t tstamp; /* time the packet arrived/left */
u64 skb_mstamp_ns; /* earliest departure time */
};
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8); /* control buffer: private per-layer scratch space, meaningful only inside the layer that wrote it */
union {
struct {
unsigned long _skb_refdst; /* destination (route cache) entry; both input and output packets need one to decide where they go */
void (*destructor)(struct sk_buff *skb);
};
struct list_head tcp_tsorted_anchor;
};
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
unsigned long _nfct;
#endif
unsigned int len, /* total data length: main (linear) buffer plus fragments, protocol headers included */
data_len; /* length of the data held in fragments only */
__u16 mac_len, /* length of the link layer (MAC) header */
hdr_len; /* writable header length of a cloned skb */
/* Following fields are _not_ copied in __copy_skb_header()
* Note that queue_mapping is here mostly to fill a hole.
*/
__u16 queue_mapping;
/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK (1 << 7)
#else
#define CLONED_MASK 1
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)
/* private: */
__u8 __cloned_offset[0];
/* public: */
__u8 cloned:1, /* head may be cloned (check refcnt to be sure) */
nohdr:1, /* payload reference only: the headers must not be modified (the original note adds: nor reached through skb->data) */
fclone:2, /* current clone status */
peeked:1,
head_frag:1,
pfmemalloc:1;
#ifdef CONFIG_SKB_EXTENSIONS
__u8 active_extensions;
#endif
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
__u32 headers_start[0];
/* public: */
/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
/* private: */
__u8 __pkt_type_offset[0];
/* public: */
__u8 pkt_type:3; /* packet class */
__u8 ignore_df:1;
__u8 nf_trace:1;
__u8 ip_summed:2;
__u8 ooo_okay:1;
__u8 l4_hash:1;
__u8 sw_hash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_VLAN_PRESENT_BIT 7
#else
#define PKT_VLAN_PRESENT_BIT 0
#endif
#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset)
/* private: */
__u8 __pkt_vlan_present_offset[0];
/* public: */
__u8 vlan_present:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_not_inet:1;
__u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ipvs_property:1;
__u8 inner_protocol_type:1;
__u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
__u8 offload_fwd_mark:1;
__u8 offload_l3_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
__u8 tc_skip_classify:1;
__u8 tc_at_ingress:1;
#endif
#ifdef CONFIG_NET_REDIRECT
__u8 redirected:1;
__u8 from_ingress:1;
#endif
#ifdef CONFIG_TLS_DEVICE
__u8 decrypted:1;
#endif
/*
* Author's note: reached via make menuconfig -> Networking support ->
* Networking options -> QoS and/or fair queueing -> ... Selecting the
* option adds the field below to sk_buff. As a rule, an option that
* changes a kernel data structure is not suited to being built as a
* module; look the option up in the Kconfig files.
*/
/* "packet classifier" is the relevant menu entry */
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#endif
union { /* checksum */
__wsum csum;
struct {
__u16 csum_start; /* offset from skb->head where checksumming should start */
__u16 csum_offset; /* offset from csum_start where the checksum should be stored */
};
};
__u32 priority; /* packet queueing priority (author's note: works together with TOS) */
int skb_iif;
__u32 hash;
__be16 vlan_proto;
__u16 vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
union {
unsigned int napi_id;
unsigned int sender_cpu;
};
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 reserved_tailroom;
};
/* encapsulated (inner) protocol */
union {
__be16 inner_protocol;
__u8 inner_ipproto;
};
__u16 inner_transport_header; /* inner transport layer header (encapsulation) */
__u16 inner_network_header;
__u16 inner_mac_header;
__be16 protocol; /* packet protocol from driver */
__u16 transport_header; /* transport layer header */
__u16 network_header; /* network layer header */
__u16 mac_header; /* link layer header */
#ifdef CONFIG_KCOV
u64 kcov_handle;
#endif
/* private: */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail; /* tail pointer: where the actual data ends */
sk_buff_data_t end; /* end pointer: where the allocated linear buffer ends */
unsigned char *head, /* head of the allocated linear buffer */
*data; /* data head pointer: where the actual data begins (the original note duplicated head's description) */
unsigned int truesize; /* total memory accounted to this buffer; per the original note, sizeof(struct sk_buff) plus the data size */
refcount_t users; /* reference count */
#ifdef CONFIG_SKB_EXTENSIONS
/* only useable after checking ->active_extensions != 0 */
struct skb_ext *extensions;
#endif
};
布局字段
sk_buff_head(布局字段)
sk_buff有些字段只是为了方便搜寻以及组织数据结构本身。内核在一个双向链表中维护所有的sk_buff结构,但是该表的组织比传统的双向链表更为复杂。
像任何双向链表一样,表中的元素通过每个sk_buff结构中的next和prev字段联系在一起,next字段指向前,而prev指向后,但是这个表还有另一项需求:每个sk_buff结构必须能够迅速找出整个表的头。为了满足这项需求,在表的开端额外增加一个sk_buff_head结构作为一个哑元素。
/*
 * Head (dummy element) of a list of sk_buff structures. It begins with the
 * same next/prev pair as struct sk_buff.
 */
struct sk_buff_head {
/* These two members must be first. */
/* next and prev mirror the first two members of struct sk_buff */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen; /* number of elements in the list */
spinlock_t lock; /* guards the list against concurrent access */
};
sk_buff_head与sk_buff相比实在太小,但是还是允许两个结构共同存在于同一个表中,另外,同样的函数也可用于操作sk_buff_head和sk_buff两者。
struct sock *sk
这是一个指针,指向拥有此缓冲区的套接字的sock数据结构。当数据在本地产生或者正由本地进程接收时,就需要这个指针,因为该数据以及套接字相关的信息会由L4(TCP或UDP)以及用户程序使用,当缓存区只是被转发时(也就是说本地机器不是来源地也不是目的地),该指针就是NULL。
unsigned int len
这是指缓冲区中数据区块的大小,这个长度包括主要缓冲区(由head所指)的数据以及一些片段的数据。当缓冲区从一个网络分层移往下一个网络分层时,其值就会变化:在协议栈中往上移动时报头会被丢弃,往下移动时则会添加新的报头,因为len也会把协议报头算在内。
unsigned int data_len
与len不同,它只计算片段中的数据大小
__u16 mac_len
MAC报头的大小
refcount_t users;
这是引用计数,即使用这个sk_buff缓冲区的实例的数目。这个参数的主要用途是避免在某人依然使用此sk_buff结构时,把这个结构释放掉,因此,此缓冲区的每个用户在必要时都要递增和递减此字段。此计数器只计算sk_buff数据结构的用户,此缓冲区所包含的实际数据的用户则由另一个相似的字段(dataref)来统计。
一般通过调用skb_get和kfree_skb来递增和递减该计数。
unsigned int truesize
此字段代表此缓冲区总的大小,包括sk_buff结构本身。当为缓冲区申请len个字节的数据空间时,alloc_skb函数会把此字段初始化为len+sizeof(sk_buff)。
skb->truesize = size + sizeof(struct sk_buff);
当skb->len的值增加时,此字段就会得到更新。
sk_buff_data_t tail; sk_buff_data_t end;unsigned char *head,*data;
这些字段代表缓冲区的边界以及其中的数据。当每一个分层为其工作而准备缓冲区时,可能会为一个报头或者更多的数据分配更多的空间。head和end指向已分配缓冲区空间的开端和尾端,而data和tail则指向实际数据的开端和尾端,该分层可以在head和data之间的空隙填上一个协议报头,或者以新数据填入tail和end之间的空隙。
void (*destructor)(struct sk_buff *skb)
此函数指针可以被初始化为一个函数,当此缓冲区被删除时,可完成某些工作,当此缓冲区不属于一个套接字时,destructor通常不会被初始化,当缓冲区属于一个套接字,通常设置成sock_rfree或者sock_wfree(分别由skb_set_owner_r和skb_set_owner_w初始化函数设置),这两个sock_xxxx函数可用于更新套接字队列中所持有的内存。
通用字段
ktime_t tstamp
struct net_device *dev
功能专用字段
unsigned long _nfct
管理函数
skb_reserve
skb_reserve只能操作空skb,即在分配了空间,尚未填充数据时调用
/**
 *	skb_reserve - open up headroom in an empty buffer
 *	@skb: buffer to alter
 *	@len: number of bytes to shift by
 *
 *	Moves both the data and the tail position forward by @len, which
 *	grows the headroom at the expense of the tailroom. Only valid on an
 *	&sk_buff that does not hold any data yet.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
	/* Both positions move together so the (empty) data area stays empty. */
	skb->tail += len;
	skb->data += len;
}
skb_put
/**
 *	skb_put - append room for data at the end of a buffer
 *	@skb: buffer to use
 *	@len: number of bytes to add
 *
 *	Grows the used data area of a linear buffer by @len bytes at the
 *	tail. Running past the end of the buffer triggers skb_over_panic().
 *	Returns a pointer to the first byte of the newly added area.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
	void *old_tail;

	/* The current tail is where the caller will write the new bytes. */
	old_tail = skb_tail_pointer(skb);

	/* Only linear buffers may be extended this way. */
	SKB_LINEAR_ASSERT(skb);

	/* Account for the extra bytes in both the length and the tail. */
	skb->len += len;
	skb->tail += len;

	/* The tail must never move beyond the end of the buffer. */
	if (unlikely(skb->tail > skb->end))
		skb_over_panic(skb, len, __builtin_return_address(0));

	return old_tail;
}
skb_push
/**
 *	skb_push - prepend room for data at the start of a buffer
 *	@skb: buffer to use
 *	@len: number of bytes to add
 *
 *	Grows the used data area by @len bytes at the front by moving the
 *	data pointer back into the headroom. Moving in front of the buffer
 *	head triggers skb_under_panic(). Returns the new data pointer,
 *	i.e. the first byte of the newly added area.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
	/* Account for the extra bytes, then move the data start backwards. */
	skb->len += len;
	skb->data -= len;

	/* The data start must never fall in front of the buffer head. */
	if (unlikely(skb->head > skb->data))
		skb_under_panic(skb, len, __builtin_return_address(0));

	return skb->data;
}
/**
 * skb_pull - remove data from the start of a buffer
 * @skb: buffer to use
 * @len: amount of data to remove
 *
 * This function removes data from the start of a buffer, returning
 * the memory to the headroom. A pointer to the next data in the buffer
 * is returned. Once the data has been pulled future pushes will overwrite
 * the old data.
 */
/* Remove data from the front of the data area; delegates to skb_pull_inline(). */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
return skb_pull_inline(skb, len);
}
/*
 * Length-checked front end for __skb_pull(): asking to remove more than
 * skb->len bytes yields NULL, any other request is passed on.
 */
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
	if (unlikely(len > skb->len))
		return NULL;

	return __skb_pull(skb, len);
}
/*
 * Remove @len bytes from the front of the data area without checking
 * @len against skb->len. Returns the data pointer after the move.
 */
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
	/* Shrink the total length first ... */
	skb->len -= len;

	/* ... and make sure it has not dropped below the fragment data length. */
	BUG_ON(skb->len < skb->data_len);

	/* Advance the data start past the removed bytes. */
	skb->data += len;
	return skb->data;
}
SKB的操作函数
dev_kfree_skb
释放skb:kfree_skb和dev_kfree_skb
两个函数会释放一个缓冲区,使其返回缓冲池(缓存)。kfree_skb是直接由dev_kfree_skb调用并启动的。只有当skb->users计数器为1时(该缓冲区已无任何用户时),这个基本函数才会释放一个缓冲区。否则,只是递减该计数器。在sk_buff底端的skb_shared_info数据结构可以持有一些指向其他内存片段的指针。kfree_skb也会释放这些片段所持有的内存。
/* dev_kfree_skb() is simply another name for consume_skb(). */
#define dev_kfree_skb(a) consume_skb(a)
/*
 * consume_skb - release a buffer
 * @skb: buffer to release
 *
 * In this variant the function only forwards to kfree_skb().
 * The call is a plain statement: a return statement with an expression
 * is not permitted in a function returning void (C11 6.8.6.4).
 */
static inline void consume_skb(struct sk_buff *skb)
{
	kfree_skb(skb);
}
/**
 *	kfree_skb - drop a reference to an sk_buff
 *	@skb: buffer to free
 *
 *	Gives up one reference to the buffer. The buffer is actually freed
 *	only when skb_unref() reports that it should be, i.e. once the usage
 *	count has hit zero.
 */
void kfree_skb(struct sk_buff *skb)
{
	if (skb_unref(skb)) {
		/* Static instrumentation point (tracepoint), fired before the free. */
		trace_kfree_skb(skb, __builtin_return_address(0));
		__kfree_skb(skb);
	}
}
/**
 * __kfree_skb - private function
 * @skb: buffer
 *
 * Free an sk_buff. Release anything attached to the buffer.
 * Clean the state. This is an internal helper function. Users should
 * always call kfree_skb
 */
/* Author's note: the release path is involved and is covered in a separate section. */
void __kfree_skb(struct sk_buff *skb)
{
/* First release whatever is attached to the buffer, then the skb memory itself. */
skb_release_all(skb);
kfree_skbmem(skb);
}
skb_trim
skb_trim把缓冲区裁剪到长度len(len是裁剪后的新长度,而不是要移走的字节数),即从尾部移走skb->len-len个字节,并把skb->tail相应地向前移动;如果缓冲区的长度本来就不超过len,则不做任何修改。
/**
 *	skb_trim - cut a buffer down to a given length
 *	@skb: buffer to alter
 *	@len: new length
 *
 *	Shortens the buffer to @len bytes by dropping data from the tail.
 *	A buffer that is already no longer than @len is left untouched.
 *	The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
	/* Nothing to cut when the buffer already fits. */
	if (len >= skb->len)
		return;

	__skb_trim(skb, len);
}
/* Unconditional trim: sets the buffer length to @len via __skb_set_length(). */
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
__skb_set_length(skb, len);
}
/*
 * Set the length of a linear buffer to @len and reposition the tail to
 * match. A non-linear buffer triggers a warning and is left unchanged.
 */
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
	if (!WARN_ON(skb_is_nonlinear(skb))) {
		skb->len = len;
		skb_set_tail_pointer(skb, len);
	}
}
skb_shared_info
数据缓冲区尾端有个名为skb_shared_info的数据结构,用以保持此数据区块的附加信息。此数据结构紧接在标记数据尾端的end指针之后。
/* This data is invariant across clones and lives at
* the end of the header data, ie. at skb->end.
*/
struct skb_shared_info {
__u8 __unused;
__u8 meta_len;
__u8 nr_frags; /* used when handling IP fragments */
__u8 tx_flags; /* NOTE(review): the original note also tags this as fragment handling; the name suggests transmit flags - confirm */
unsigned short gso_size;
/* Warning: this field is not always filled in (UFO)! */
unsigned short gso_segs;
struct sk_buff *frag_list; /* used when handling IP fragments */
struct skb_shared_hwtstamps hwtstamps;
unsigned int gso_type;
u32 tskey;
/*
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref; /* number of "users" of the data block */
/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
void * destructor_arg;
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
};
sk_buff结构中没有指向skb_shared_info数据结构的字段。为了访问该结构体,函数必须使用返回end指针的skb_shinfo宏:
/* Yields the skb_shared_info found at the address returned by skb_end_pointer(SKB). */
#define skb_shinfo(SKB) ((struct skb_shared_info *)(skb_end_pointer(SKB)))
通过这个指针去访问里面的成员
u32 nr_frags = skb_shinfo(skb)->nr_frags + 1
head和end分别指向存放数据内存区域的头和尾,一旦分配就固定不变。
data和tail分别指向真正数据的起始和结束位置。
head和data之间的区域称为headroom,data和tail之间的区域存放真正的数据,tail和end之间的区域称为tailroom。skb刚分配时,head,data和tail在同一位置,end在末尾,所以刚开始时,headroom大小为0,tailroom大小为size,后续对数据包的操作,通过移动data和tail完成,head和end固定不变。
重要的长度len的解析
这里要声明两个概念的区别,后续直接用这两个概念,注意区分:
(1)线性数据:从head到end的整个区域,大小为end - head。
(2)实际线性数据:从data到tail的区域,大小为tail - data,不包含线性数据中的头空间和尾空间。
skb->data_len: skb中的分片数据(非线性数据)的长度。
skb->len: skb中的数据块的总长度,数据块包括实际线性数据和非线性数据,非线性数据为data_len,所以skb->len = (tail - data) + data_len。
skb->truesize: skb的总长度,包括sk_buff结构和数据部分,skb=sk_buff控制信息 + 线性数据(包括头空间和尾空间) + skb_shared_info控制信息 + 非线性数据,所以skb->truesize = sizeof(struct sk_buff) + (end - head) + sizeof(struct skb_shared_info) + data_len。
skb_clone 与skb_copy
skb_copy是一个深拷贝,skb_clone只是一个浅拷贝
1、skb_clone()
skb_clone()函数只是复制sk_buff结构,并不复制skb的数据缓冲区。clone后的sk_buff结构与原始的sk_buff指向同一数据缓冲区。原始的和clone后的skb描述符的cloned值都会被置1,clone的skb描述符的users值置1,同时数据缓冲区的引用计数dataref增加1。
特别说明,skb_clone()函数复制的只是skb描述符,而复制后的skb与原始skb指向的是同一数据缓冲区,由于数据缓冲区并未加什么同步锁机制,因此skb_clone()操作的skb结构的数据缓冲区是不能被修改的。
2、skb_copy是对skb的数据完整复制
/**
 * skb_clone - duplicate an sk_buff
 * @skb: buffer to clone
 * @gfp_mask: allocation priority
 *
 * Duplicate an &sk_buff. The new one is not owned by a socket. Both
 * copies share the same packet data but not structure. The new
 * buffer has a reference count of 1. If the allocation fails the
 * function returns %NULL otherwise the new buffer is returned.
 *
 * If this function is called from an interrupt gfp_mask() must be
 * %GFP_ATOMIC.
 */
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
/*
 * Compute the enclosing sk_buff_fclones as if skb were its skb1 member.
 * The result is only dereferenced below after skb->fclone has been
 * checked to be SKB_FCLONE_ORIG.
 */
struct sk_buff_fclones *fclones = container_of(skb,
struct sk_buff_fclones,
skb1);
struct sk_buff *n;
/* Give up if skb_orphan_frags() reports a failure. */
if (skb_orphan_frags(skb, gfp_mask))
return NULL;
if (skb->fclone == SKB_FCLONE_ORIG &&
refcount_read(&fclones->fclone_ref) == 1) {
/* Fast path: use the companion skb2 slot and raise fclone_ref to 2. */
n = &fclones->skb2;
refcount_set(&fclones->fclone_ref, 2);
} else {
/* Slow path: take a fresh descriptor from skbuff_head_cache. */
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
if (!n)
return NULL;
n->fclone = SKB_FCLONE_UNAVAILABLE;
}
/* __skb_clone() presumably fills n from skb; its body is not shown here. */
return __skb_clone(n, skb);
}
/**
 * skb_copy - create private copy of an sk_buff
 * @skb: buffer to copy
 * @gfp_mask: allocation priority
 *
 * Make a copy of both an &sk_buff and its data. This is used when the
 * caller wishes to modify the data and needs a private copy of the
 * data to alter. Returns %NULL on failure or the pointer to the buffer
 * on success. The returned buffer has a reference count of 1.
 *
 * As by-product this function converts non-linear &sk_buff to linear
 * one, so that &sk_buff becomes completely private and caller is allowed
 * to modify all the data of returned buffer. This means that this
 * function is not recommended for use in circumstances when only
 * header is going to be modified. Use pskb_copy() instead.
 */
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
/* Headroom of the source; the copy is given the same amount. */
int headerlen = skb_headroom(skb);
/* Requested size: the source's end offset plus its fragment data length. */
unsigned int size = skb_end_offset(skb) + skb->data_len;
struct sk_buff *n = __alloc_skb(size, gfp_mask,
skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
/* Set the data pointer */
skb_reserve(n, headerlen);
/* Set the tail pointer and length */
skb_put(n, skb->len);
/*
 * Copy headerlen + skb->len bytes to n->head, starting at offset
 * -headerlen in the source; a non-zero result from skb_copy_bits()
 * trips BUG_ON.
 */
BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len));
skb_copy_header(n, skb);
return n;
}
例子
static struct sock *__udp4_lib_err_encap(struct net *net,
const struct iphdr *iph,
struct udphdr *uh,
struct udp_table *udptable,
struct sk_buff *skb, u32 info){
....................
network_offset = skb_network_offset(skb);
transport_offset = skb_transport_offset(skb);
/* Network header needs to point to the outer IPv4 header inside ICMP */
skb_reset_network_header(skb);
/* Transport header needs to point to the UDP header */
skb_set_transport_header(skb, iph->ihl << 2);
..............................
}