0x01 缘由
从大学做算法题开始,就一直强调数据结构的重要性:良好的数据结构设计会使程序设计更加合理和健壮。
0x02 关键结构
先摘录一个图,了解各数据结构之间的关系。--摘录于《Linux TCP IP 协议栈分析.pdf》
1.struct sk_buff - socket buffer
/*
 * struct sk_buff - socket buffer.
 *
 * Describes a single packet: list linkage, owning socket and device,
 * checksum/flag state, per-layer header positions, and the
 * head/data/tail/end pointers into the separately allocated data area.
 */
struct sk_buff {
    /* These two members must come first in the structure; this mainly
     * makes type casts convenient. */
    struct sk_buff *next;   /* doubly linked list: next sk_buff */
    struct sk_buff *prev;   /* previous sk_buff */

    /*
     * Socket this buffer belongs to. Needed when the data is generated
     * locally or is received by a local process; the data it refers to is
     * used by TCP/UDP and by the user-space program. NULL when the packet
     * is being forwarded.
     */
    struct sock *sk;
    ktime_t tstamp;             /* time stamp of the packet's arrival */
    struct net_device *dev;     /* network device (which NIC / virtual NIC) */
    unsigned long _skb_dst;     /* destination entry */

    /*
     * This is the control buffer. It is free to use for every layer.
     * Please put your private variables there. If you want to keep them
     * across layers you have to do a skb_clone() first. This is owned by
     * whoever has the skb queued at the moment.
     */
    char cb[48];

    unsigned int len,           /* total data length of the packet: the data
                                 * that `data` points to plus the fragment
                                 * data stored after `end` */
                 data_len;      /* length of the data held in the fragments
                                 * (the non-linear part) */
    __u16 mac_len,              /* length of the link-layer (MAC) header */
          hdr_len;              /* writable header length of a cloned skb */

    union {
        __wsum csum;
        struct {
            __u16 csum_start;   /* offset from head where checksumming starts */
            __u16 csum_offset;  /* offset from csum_start where the checksum
                                 * is stored */
        };
    };                          /* checksum */

    __u32 priority;             /* QoS level (packet queueing priority) */

    kmemcheck_bitfield_begin(flags1);
    __u8 local_df:1,            /* allow local fragmentation */
         cloned:1,              /* head may be cloned */
         ip_summed:2,           /* checksum status reported by the driver */
         nohdr:1,               /* payload reference only; header must not
                                 * be modified */
         nfctinfo:3;            /* relationship of this skb to the connection */
    __u8 pkt_type:3,            /* packet class, derived from the destination
                                 * address of the L2 frame */
         fclone:2,              /* sk_buff clone status */
         ipvs_property:1,       /* IP virtual server (IPVS) property */
         peeked:1,              /* this packet has been seen already, so stats
                                 * have been done for it; don't do them again */
         nf_trace:1;            /* netfilter packet trace flag */
    __be16 protocol:16;         /* protocol of the next higher layer, as seen
                                 * by the L2 device driver */
    kmemcheck_bitfield_end(flags1);

    void (*destructor)(struct sk_buff *skb);    /* destructor function */

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
    struct nf_conntrack *nfct;      /* connection associated with this skb, if any */
    struct sk_buff *nfct_reasm;     /* netfilter conntrack re-assembly pointer */
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
    struct nf_bridge_info *nf_bridge;   /* saved data about a bridged frame */
#endif

    int iif;                    /* index (ifindex) of the device the packet
                                 * arrived on */
#ifdef CONFIG_NET_SCHED
    __u16 tc_index;             /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
    __u16 tc_verd;              /* traffic control verdict */
#endif
#endif

    kmemcheck_bitfield_begin(flags2);
    __u16 queue_mapping:16;     /* queue mapping for multiqueue devices */
#ifdef CONFIG_IPV6_NDISC_NODETYPE
    __u8 ndisc_nodetype:2;      /* router type (from the link layer) */
#endif
    kmemcheck_bitfield_end(flags2);

    /* 0/14 bit hole */

#ifdef CONFIG_NET_DMA
    dma_cookie_t dma_cookie;    /* cookie for an operation performed by the
                                 * skb DMA functions */
#endif
#ifdef CONFIG_NETWORK_SECMARK
    __u32 secmark;              /* security marking */
#endif

    __u32 mark;                 /* generic packet mark */
    __u16 vlan_tci;             /* VLAN tag control information */

    sk_buff_data_t transport_header;    /* transport-layer header */
    sk_buff_data_t network_header;      /* network-layer header */
    sk_buff_data_t mac_header;          /* link-layer (Ethernet) header */

    /* These elements must be at the end, see alloc_skb() for details. */
    sk_buff_data_t tail;
    sk_buff_data_t end;
    /* head and end delimit the whole data area; data and tail delimit the
     * data actually in use. */
    unsigned char *head,
                  *data;
    /* Total size of this buffer, including the sk_buff itself. The sk_buff
     * is only a collection of pointers; the real data lives in the area it
     * points to, hence the two parts. */
    unsigned int truesize;
    /* Reference count: number of users of this sk_buff. Several functions
     * may use the same sk_buff, so the count prevents it from being freed
     * too early. */
    atomic_t users;
};
2.struct sock - sockets的网络层描述
struct sock {
sock_common __sk_common; //套接口在网络层的最小表示
#define sk_node __sk_common.skc_node
#define sk_nulls_node __sk_common.skc_nulls_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_copy_start __sk_common.skc_hash
#define sk_hash __sk_common.skc_hash
#define sk_family __sk_common.skc_family
#define sk_state __sk_common.skc_state
#define sk_reuse __sk_common.skc_reuse
#define sk_bound_dev_if __sk_common.skc_bound_dev_if
#define sk_bind_node __sk_common.skc_bind_node
#define sk_prot __sk_common.skc_prot
#define sk_net __sk_common.skc_net
kmemcheck_bitfield_begin(flags);
unsigned int sk_shutdown : 2, //是一组标志位,SEND_SHUTDOWN and/or RCV_SHUTDOWN。
sk_no_check : 2, //不对包进行检查标识
sk_userlocks : 4, // %SO_SNDBUF 和 %SO_RCVBUF 缓存设置锁
sk_protocol : 8,
sk_type : 16;
kmemcheck_bitfield_end(flags);
int sk_rcvbuf; //接收缓存区大小
socket_lock_t sk_lock; //同步锁
/*
* The backlog queue is special, it is always used with
* the per-socket spinlock held and requires low latency
* access. Therefore we special case it's implementation.
*/
struct {
struct sk_buff *head;
struct sk_buff *tail;
} sk_backlog; //总是被自旋锁持有
wait_queue_head_t *sk_sleep; //在队列中等待的socket
struct dst_entry *sk_dst_cache; //目的地址的缓存
#ifdef CONFIG_XFRM
struct xfrm_policy *sk_policy[2];
#endif
rwlock_t sk_dst_lock; //目的缓存读写锁
atomic_t sk_rmem_alloc; //表示接收队列已提交的字节数。
atomic_t sk_wmem_alloc; //表示发送队列已提交的字节数。
atomic_t sk_omem_alloc; //用“O”或“other”做选项
int sk_sndbuf;
struct sk_buff_head sk_receive_queue; //表示接收的数据包的队列。
struct sk_buff_head sk_write_queue; //表示发送的数据包的队列。
#ifdef CONFIG_NET_DMA
struct sk_buff_head sk_async_wait_queue; //DMA复制数据包
#endif
int sk_wmem_queued; //维持的队列大小
int sk_forward_alloc; //转发空间分配
gfp_t sk_allocation; //分配空间的模式
int sk_route_caps; //路由容量
int sk_gso_type; //GSO type (e.g. %SKB_GSO_TCPV4)
unsigned int sk_gso_max_size;//最大的GSO段大小
int sk_rcvlowat; //
unsigned long sk_flags; //%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, %SO_OOBINLINE 标识设定, %SO_TIMESTAMPING 标识设定
unsigned long sk_lingertime; //SO_LINGER设定
struct sk_buff_head sk_error_queue; //非常少用
struct proto *sk_prot_creator; //原始socket创建器
rwlock_t sk_callback_lock; //
int sk_err,
sk_err_soft;
atomic_t sk_drops; //raw/udp drop计数
unsigned short sk_ack_backlog; //当前监听队列数
unsigned short sk_max_ack_backlog; //在listen()中设置的数目
__u32 sk_priority; //优先级
struct ucred sk_peercred;
long sk_rcvtimeo;
long sk_sndtimeo;
struct sk_filter *sk_filter; //socket 过滤结构
void *sk_protinfo; //私有区域
struct timer_list sk_timer; //socket清理定时器
ktime_t sk_stamp; //最后一包接收时间
struct socket *sk_socket; //IO信号
void *sk_user_data; //RPC层私有数据
struct page *sk_sndmsg_page; //sndmsg缓存
struct sk_buff *sk_send_head; //转发数据头
__u32 sk_sndmsg_off; // sndmsg缓存偏移
int sk_write_pending;
#ifdef CONFIG_SECURITY
void *sk_security;
#endif
__u32 sk_mark;
/* XXX 4 bytes hole on 64 bit */
void (*sk_state_change)(struct sock *sk);
void (*sk_data_ready)(struct sock *sk, int bytes);
void (*sk_write_space)(struct sock *sk);
void (*sk_error_report)(struct sock *sk);
int (*sk_backlog_rcv)(struct sock *sk,
struct sk_buff *skb);
void (*sk_destruct)(struct sock *sk);
};
/*
 * struct sock_common - the part of a socket description embedded in
 * struct sock as __sk_common.
 *
 * NOTE(review): the sk_* alias macros in struct sock also reference
 * skc_nulls_node, skc_hash, skc_prot and skc_net, none of which is declared
 * in this listing. This excerpt presumably comes from a different kernel
 * version than the struct sock excerpt; confirm against include/net/sock.h.
 */
struct sock_common {
unsigned short skc_family; /* address family */
volatile unsigned char skc_state; /* connection state */
unsigned char skc_reuse; /* SO_REUSEADDR setting */
int skc_bound_dev_if; /* presumably the index of the bound device; confirm */
struct hlist_node skc_node;
struct hlist_node skc_bind_node; /* hash-table linkage */
atomic_t skc_refcnt; /* reference count */
};
3.struct net_device
/*
 * struct net_device - descriptor of a network device: naming, I/O
 * resources, feature flags, addresses, queues and registration state.
 */
struct net_device
{
char name[IFNAMSIZ]; // network device name, e.g. eth0
struct hlist_node name_hlist; // used to build the hash table of device names; name_hlist in struct net points to the head of each hash chain
char *ifalias; // alias of the network device
/* host memory region used when the device's memory is mapped */
unsigned long mem_end; /* shared memory end */
unsigned long mem_start; /* shared memory start */
unsigned long base_addr; /* device I/O base address */
unsigned int irq; /* device interrupt (IRQ) number */
unsigned char if_port; /* transmission medium, e.g. twisted pair or coaxial cable; selects which port to use on multi-port devices */
unsigned char dma; /* DMA channel */
unsigned long state; /* physical working state of the network device */
struct list_head dev_list; // list of network devices
struct list_head napi_list;// list related to NAPI support; NOTE(review): the original note calls it the list of NAPI-capable devices, confirm whether it is instead this device's list of NAPI instances
/* Net device features */
unsigned long features; // hardware feature flags of the device
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopback. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */
/* do not use LLTX in new drivers */
#define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */
#define NETIF_F_GRO 16384 /* Generic receive offload */
#define NETIF_F_LRO 32768 /* large receive offload */
/* the GSO_MASK reserves bits 16 through 23 */
#define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */
#define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */
#define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0x00ff0000
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
#define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
/*
* If one device supports one of these features, then enable them
* for all in netdev_increment_features.
*/
#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
NETIF_F_SG | NETIF_F_HIGHDMA | \
NETIF_F_FRAGLIST)
/* Interface index. Unique device identifier */
int ifindex;// unique index number identifying the network device
int iflink;// used by virtual network devices
struct net_device_stats stats; // statistics
#ifdef CONFIG_WIRELESS_EXT
/* List of functions to handle Wireless Extensions (instead of ioctl).
* See <net/iw_handler.h> for details. Jean II */
const struct iw_handler_def * wireless_handlers;
/* Instance data managed by the core of Wireless Extensions. */
struct iw_public_data * wireless_data;
#endif
/* Management operations */
const struct net_device_ops *netdev_ops;// set of operations the network device driver has to implement
const struct ethtool_ops *ethtool_ops;// set of operations supporting ethtool
/* Hardware header description */
const struct header_ops *header_ops;// set of operations for the link-layer protocol header
unsigned int flags; /* possible values are defined in linux-2.6.38.8/include/linux/if.h */
unsigned short gflags;
unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
unsigned short padded; /* amount of padding added for alignment when the net_device structure and its private data are allocated */
unsigned char operstate; /* RFC 2863 operational state */
unsigned char link_mode; /* policy for mapping to RFC 2863 compatible states */
unsigned mtu; /* MTU */
unsigned short type; /* hardware type of the device, e.g. ARPHRD_ETHER for 10 Mbps Ethernet */
unsigned short hard_header_len; /* length of the hardware frame header; 14 bytes for Ethernet */
/* extra head- and tailroom the hardware may need, but not in all cases
* can this be guaranteed, especially tailroom. Some cases also use
* LL_MAX_HEADER instead to allocate the skb.
*/
unsigned short needed_headroom; // length of the space reserved when a socket buffer is allocated
unsigned short needed_tailroom;
struct net_device *master; /* group state (per the original note); NOTE(review): presumably the master device of the group this device belongs to, confirm */
/* hardware (e.g. MAC) address length and the device's hardware address */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
unsigned char addr_len; /* hardware address length */
unsigned short dev_id; /* for shared network cards */
struct netdev_hw_addr_list uc; /* list of hardware addresses of the device; presumably the unicast list, confirm */
int uc_promisc; // number of unicast addresses while in promiscuous mode
spinlock_t addr_list_lock;// spinlock protecting the unicast and multicast address lists against concurrent access
struct dev_addr_list *mc_list; /* Multicast mac addresses */
int mc_count; /* Number of installed mcasts */
unsigned int promiscuity; // promiscuous-mode counter
unsigned int allmulti;// listen to all multicast addresses
/* network-layer protocol specific data */
#ifdef CONFIG_NET_DSA
void *dsa_ptr; /* dsa specific data */
#endif
void *atalk_ptr; /* AppleTalk link */
void *ip_ptr; /* IPv4 specific data */
void *dn_ptr; /* DECnet specific data */
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
assign before registering */
/*
* Cache line mostly used on receive path (including eth_type_trans())
*/
unsigned long last_rx; /* time the last packet was received */
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr; /* hw address, (before bcast
because most packets are
unicast) */
struct netdev_hw_addr_list dev_addrs; /* list of hardware addresses of the device */
unsigned char broadcast[MAX_ADDR_LEN]; /* broadcast address */
struct netdev_queue rx_queue; // receive queue; the original note associates it with RPS (Receive Packet Steering), confirm
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues; // number of transmit queues
/* Number of TX queues currently active in device */
unsigned int real_num_tx_queues;
/* root qdisc from userspace point of view */
struct Qdisc *qdisc;
unsigned long tx_queue_len; /* Max frames per queue allowed */
spinlock_t tx_global_lock;
/*
* One part is mostly used on xmit path (device)
*/
/* These may be needed for future network-power-down code. */
/*
* trans_start here is expensive for high speed devices on SMP,
* please use netdev_queue->trans_start instead.
*/
unsigned long trans_start; /* time of the most recent packet transmission */
int watchdog_timeo; // flag set when a transmit timeout occurs (per the original note); NOTE(review): the name suggests a timeout interval, confirm
struct timer_list watchdog_timer;// timer set by the network layer for packet transmit timeouts
/* Number of references to this device */
atomic_t refcnt ____cacheline_aligned_in_smp;
/* delayed register/unregister */
struct list_head todo_list;// list of network devices with delayed register/unregister
/* device index hash chain */
struct hlist_node index_hlist;// hash chain of network devices keyed by index number
struct net_device *link_watch_next;
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state; // device register/unregister state machine
/* Called from unregister, can be used to call free_netdev */
void (*destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info *npinfo;// NETPOLL related information
#endif
#ifdef CONFIG_NET_NS
/* Network namespace this network device is inside */
struct net *nd_net; // network namespace
#endif
/* mid-layer private */
void *ml_priv; // private data of the mid-layer
/* bridge stuff */
struct net_bridge_port *br_port; // bridging: bridge port
/* macvlan */
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port *garp_port;
/* class/net/name entry */
struct device dev; // exposes network device information in the sysfs file system
/* space for optional statistics and wireless sysfs groups */
const struct attribute_group *sysfs_groups[3];
/* rtnetlink link ops */
const struct rtnl_link_ops *rtnl_link_ops; // rtnetlink operations
/* VLAN feature mask */
unsigned long vlan_features; // VLAN related
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size; // maximum GSO size
#ifdef CONFIG_DCB
/* Data Center Bridging netlink ops */
struct dcbnl_rtnl_ops *dcbnl_ops; // DCB operations
#endif
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/* max exchange id for FCoE LRO by ddp */
unsigned int fcoe_ddp_xid;
#endif
};
0x03 总结
在学习过程中,发现相关代码都有良好的注释。仅仅需要做下翻译,然后理解其目的。