1 概述
连接跟踪模块用于维护可跟踪协议的连接状态,即连接跟踪只针对特定协议的报文进行处理,而不是所有的协议报文。对于经过网络协议栈、且尚不属于任何已有连接的数据包,它会生成一个新的连接记录项。此后,所有属于此连接的数据包都被唯一地分配给这个连接,并标识连接的状态。连接跟踪是状态防火墙和NAT的实现基础。
2 框架
netfilter各个模块注册在hook点上的hook函数示意图:
从上图可以看出连接跟踪模块注册的钩子函数所在的netfilter钩子点及其优先级。
三种不同方向的报文经过连接跟踪模块的处理过程:
发往本机的报文:
本机发出的报文:
由本机转发的报文:
3 核心代码分析
3.1 相关数据结构
3.1.1 struct nf_conntrack_tuple{}:元组(tuple)
元组是一个包含能识别唯一连接的信息的结构体,如果两个报文有同一个元组,则它们属于同一连接。
元组结构体nf_conntrack_tuple的定义如下:
/*
 * Tuple: the information that identifies a connection. Two packets that
 * yield the same tuple belong to the same connection.
 */
struct nf_conntrack_tuple {
struct nf_conntrack_man src;// source address information
/* These are the parts of the tuple which are fixed. */
struct {
/* Destination network-layer address. */
union nf_inet_addr u3;
/*
 * Per-protocol destination identifier: a port for tcp/udp/dccp/sctp,
 * type and code for icmp, a key for gre.
 */
union {
/* Add other protocols here. */
__be16 all;
struct { __be16 port; } tcp;
struct { __be16 port; } udp;
struct { u_int8_t type, code; } icmp;
struct { __be16 port; } dccp;
struct { __be16 port; } sctp;
struct { __be16 key; } gre;
} u;
/* The protocol. */
u_int8_t protonum;
/* The direction (for tuplehash) */
u_int8_t dir;
} dst;// destination address information
};
以IPV4 UDP为例:
dst.u3.ip 目的IP地址
dst.u.udp.port 目的端口
src.u3.ip 源IP地址
src.u.udp.port 源端口
dst.protonum 协议号
从该结构体的定义可以看出目前连接跟踪模块只支持六种协议:
TCP,UDP,ICMP,DCCP,SCTP,GRE
3.1.2 struct nf_conn{}:连接跟踪
连接nf_conn定义一个flow,核心成员是tuplehash[IP_CT_DIR_ORIGINAL]和tuplehash[IP_CT_DIR_REPLY]
连接结构体nf_conn的定义如下:
/*
 * One tracked connection (flow). Its core members are the two tuple hash
 * entries tuplehash[IP_CT_DIR_ORIGINAL] and tuplehash[IP_CT_DIR_REPLY].
 */
struct nf_conn {
/* Usage count in here is 1 for hash table, 1 per skb,
* plus 1 for any connection(s) we are `master' for
*
* Hint, SKB address this struct and refcnt via skb->_nfct and
* helpers nf_conntrack_get() and nf_conntrack_put().
* Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
* beware nf_ct_get() is different and don't inc refcnt.
*/
struct nf_conntrack ct_general; // reference count of this conntrack entry
spinlock_t lock;
u16 cpu;
#ifdef CONFIG_NF_CONNTRACK_ZONES
struct nf_conntrack_zone zone;
#endif
/* XXX should I move this to the tail ? - Y.K */
/* These are my tuples; original and reply */
// Tuples of this connection: the original-direction tuple and the reply tuple
struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];
/* Have we seen traffic both ways yet? (bitset) */
unsigned long status; // conntrack status; takes the values defined in enum ip_conntrack_status
/* jiffies32 when this ct is considered dead */
u32 timeout; // timeout
possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT)
struct hlist_node nat_bysource;
#endif
/* all members below initialized via memset */
struct { } __nfct_init_offset;
/* If we were expected by an expectation, this will be it */
struct nf_conn *master; // if this is a child connection, master points to its master connection
#if defined(CONFIG_NF_CONNTRACK_MARK)
u_int32_t mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
u_int32_t secmark;
#endif
/* NOTE(review): CONFIG_FWOS_HOSTBW / fwosmark looks like a vendor-specific
 * addition rather than a mainline kernel option -- confirm. */
#ifdef CONFIG_FWOS_HOSTBW
u_int32_t fwosmark;
#endif
/* Extensions */
struct nf_ct_ext *ext; // conntrack extensions
/* Storage reserved for other modules, must be the last member */
union nf_conntrack_proto proto; // per-protocol private data
};
3.1.3 struct nf_conntrack_l4proto{}:四层协议处理函数
包含对应协议在连接跟踪维护过程中所特有的一些处理函数。
nf_conntrack_l4proto定义如下:
/*
 * Layer-4 protocol handler table: the callbacks that are specific to one
 * protocol while its connections are being tracked.
 */
struct nf_conntrack_l4proto {
/* L3 Protocol number. */
u_int16_t l3proto;
/* L4 Protocol number. */
u_int8_t l4proto;
/* Resolve clashes on insertion races. */
bool allow_clash;
/* protoinfo nlattr size, closes a hole */
u16 nlattr_size;
/* Try to fill in the third arg: dataoff is offset past network protocol
hdr. Return true if possible. */
// Build the tuple from the skb; only icmp needs special handling
bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
struct net *net, struct nf_conntrack_tuple *tuple);
/* Invert the per-proto part of the tuple: ie. turn xmit into reply.
* Only used by icmp, most protocols use a generic version.
*/
// Derive the reply tuple from the original tuple
bool (*invert_tuple)(struct nf_conntrack_tuple *inverse,
const struct nf_conntrack_tuple *orig);
/* Returns verdict for packet, or -1 for invalid. */
// Protocol-specific packet handling, e.g. setting the connection's timeout; called after new()
int (*packet)(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo);
/* Called when a new connection for this protocol found;
* returns TRUE if it's OK. If so, packet() called next. */
bool (*new)(struct nf_conn *ct, const struct sk_buff *skb,
unsigned int dataoff);
/* Called when a conntrack entry is destroyed */
void (*destroy)(struct nf_conn *ct);
// Check whether a packet of this protocol is faulty
int (*error)(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
unsigned int dataoff,
u_int8_t pf, unsigned int hooknum);
/* called by gc worker if table is full */
bool (*can_early_drop)(const struct nf_conn *ct);
/* convert protoinfo to nfnetlink attributes */
int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla,
struct nf_conn *ct);
/* convert nfnetlink attributes to protoinfo */
int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct);
int (*tuple_to_nlattr)(struct sk_buff *skb,
const struct nf_conntrack_tuple *t);
/* Calculate tuple nlattr size */
unsigned int (*nlattr_tuple_size)(void);
int (*nlattr_to_tuple)(struct nlattr *tb[],
struct nf_conntrack_tuple *t);
const struct nla_policy *nla_policy;
struct {
int (*nlattr_to_obj)(struct nlattr *tb[],
struct net *net, void *data);
int (*obj_to_nlattr)(struct sk_buff *skb, const void *data);
u16 obj_size;
u16 nlattr_max;
const struct nla_policy *nla_policy;
} ctnl_timeout;
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Print out the private part of the conntrack. */
void (*print_conntrack)(struct seq_file *s, struct nf_conn *);
#endif
unsigned int *net_id;
/* Init l4proto pernet data */
int (*init_net)(struct net *net, u_int16_t proto);
/* Return the per-net protocol part. */
struct nf_proto_net *(*get_net_proto)(struct net *net);
/* Module (if any) which this is connected to. */
struct module *me;
};
3.1.4 连接跟踪哈希表:struct hlist_nulls_head *nf_conntrack_hash
连接跟踪哈希表的结构如下:
连接跟踪哈希表nf_conntrack_hash的buckets数量为nf_conntrack_htable_size,最大桶深为4,即能存储的最大连接跟踪表项数为nf_conntrack_htable_size*4。每个表项的结构体为struct nf_conntrack_tuple_hash。
tuple计算哈希值hash,通过hash找到对应的bucket,bucket里查找对应的结点hnode,找到tuple对应的连接跟踪表项。
skb、nf_conn、nf_conntrack_hash的关系图:
skb提取tuple,tuple生成hash值,通过hash值在连接跟踪表里找到连接跟踪项,通过连接跟踪项获取连接nf_conn(方法:container_of),通过nf_ct_set把连接存到skb的成员_nfct。_nfct保存的是连接的地址和状态的或值。
3.1.5 四层协议处理函数存储数组:struct nf_conntrack_l4proto **nf_ct_protos[]
nf_ct_protos的存储结构图如下:
3.2 核心函数
3.2.1 nf_conntrack_in
报文从PRE_ROUTING钩子点进入连接跟踪模块调用的钩子函数是ipv4_conntrack_in,其核心函数为nf_conntrack_in
nf_conntrack_in主流程图:
3.2.2 ipv4_confirm
报文通过POST_ROUTING钩子点出连接跟踪模块时调用的钩子函数为ipv4_confirm
3.3 连接跟踪建立连接流程
3.3.1 连接跟踪建立过程中的状态变化
4 后记
本文档基于Linux内核4.19.180版本的代码编写,侧重点是IPv4的连接跟踪实现框架,IPv6、ARP、BRIDGE等未特意说明。
另,连接跟踪模块里涉及到的连接跟踪扩展以及期望连接的实现将单独讲述,不在此文档体现。