全局数组tcp_timeouts定义了默认的各个状态下TCP连接的超时时长。其中连接建立状态下的空闲超时时长最长,为5天。
static const unsigned int tcp_timeouts[TCP_CONNTRACK_TIMEOUT_MAX] = {
[TCP_CONNTRACK_SYN_SENT] = 2 MINS,
[TCP_CONNTRACK_SYN_RECV] = 60 SECS,
[TCP_CONNTRACK_ESTABLISHED] = 5 DAYS,
[TCP_CONNTRACK_FIN_WAIT] = 2 MINS,
[TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS,
[TCP_CONNTRACK_LAST_ACK] = 30 SECS,
[TCP_CONNTRACK_TIME_WAIT] = 2 MINS,
[TCP_CONNTRACK_CLOSE] = 10 SECS,
[TCP_CONNTRACK_SYN_SENT2] = 2 MINS,
/* RFC1122 says the R2 limit should be at least 100 seconds.
Linux uses 15 packets as limit, which corresponds
to ~13-30min depending on RTO. */
[TCP_CONNTRACK_RETRANS] = 5 MINS,
[TCP_CONNTRACK_UNACK] = 5 MINS,
};
如下连接跟踪的TCP超时时长是基于命名空间的。timeouts数组的第一个元素,对应于TCP连接状态TCP_CONNTRACK_NONE,并没有用,这里将其设置为SYN_SENT状态对应的超时时长。
void nf_conntrack_tcp_init_net(struct net *net)
{
struct nf_tcp_net *tn = nf_tcp_pernet(net);
int i;
for (i = 0; i < TCP_CONNTRACK_TIMEOUT_MAX; i++)
tn->timeouts[i] = tcp_timeouts[i];
/* timeouts[0] is unused, make it same as SYN_SENT so
* ->timeouts[0] contains 'new' timeout, like udp or icmp.
*/
tn->timeouts[0] = tcp_timeouts[TCP_CONNTRACK_SYN_SENT];
TCP连接超时调整
在新建一个连接结构nf_conn时,如果模板连接不为空,并且其timeout扩展不为空,将模板的timeout扩展添加为新建连接的timeout扩展。
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash);
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
if (timeout_ext)
nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout), GFP_ATOMIC);
对于TCP连接,如果连接没有timeout扩展,使用命名空间默认的timeout值。
int nf_conntrack_tcp_packet(struct nf_conn *ct,
struct sk_buff *skb,
unsigned int dataoff,
enum ip_conntrack_info ctinfo,
const struct nf_hook_state *state)
{
struct net *net = nf_ct_net(ct);
struct nf_tcp_net *tn = nf_tcp_pernet(net);
enum tcp_conntrack new_state, old_state;
unsigned int index, *timeouts;
timeouts = nf_ct_timeout_lookup(ct);
if (!timeouts)
timeouts = tn->timeouts;
接下来,进行TCP连接的超时时长调整。
1)重传次数超过最大值(tcp_max_retrans默认为3),超时时长不能大于RETRANS状态的超时时长。tcp_max_retrans的值可通过文件/proc/sys/net/netfilter/nf_conntrack_tcp_max_retrans进行修改;
2)当前处理的报文为RST复位报文,应用CLOSE状态的超时时长;
3)某个方向有未确认的数据报文,超时时长不能大于UNACK状态对应的超时时长;
4)有一方通告了零窗口,超时时长不能大于RETRANS对应的超时时长;
5)以上都不成立的情况下,使用新状态所对应的超时时长。
if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
timeout = timeouts[TCP_CONNTRACK_RETRANS];
else if (unlikely(index == TCP_RST_SET))
timeout = timeouts[TCP_CONNTRACK_CLOSE];
else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_UNACK])
timeout = timeouts[TCP_CONNTRACK_UNACK];
else if (ct->proto.tcp.last_win == 0 &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
timeout = timeouts[TCP_CONNTRACK_RETRANS];
else
timeout = timeouts[new_state];
如果对端回复的首个报文为复位报文,立即删除连接,不考虑超时时长。否则,如果新状态为ESTABLISHED,只在tcp_loose为1的情况下,根据非握手过程报文创建的TCP连接,由于ESTABLISHED状态对应的超时时长太长了,在这种连接建立报文不完整的情况下,超时时长不能大于UNACK对应的超时时长。
if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
/* If only reply is a RST, we can consider ourselves not to
have an established connection: this is a fairly common
problem case, so we can delete the conntrack
immediately. --RR */
if (th->rst) {
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_ACCEPT;
}
/* ESTABLISHED without SEEN_REPLY, i.e. mid-connection
* pickup with loose=1. Avoid large ESTABLISHED timeout.
*/
if (new_state == TCP_CONNTRACK_ESTABLISHED &&
timeout > timeouts[TCP_CONNTRACK_UNACK])
timeout = timeouts[TCP_CONNTRACK_UNACK];
否则,如果原本的TCP连接状态等于SYN_RECV或者ESTABLISHED,并且新状态等于ESTABLISHED,此时,如果连接跟踪的状态没有设置确认位IPS_ASSURED_BIT,设置确认位。最后,函数nf_ct_refresh_acct更新连接的超时时长。
} else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
&& (old_state == TCP_CONNTRACK_SYN_RECV
|| old_state == TCP_CONNTRACK_ESTABLISHED)
&& new_state == TCP_CONNTRACK_ESTABLISHED) {
/* Set ASSURED if we see valid ack in ESTABLISHED
after SYN_RECV or a valid answer for a picked up
connection. */
set_bit(IPS_ASSURED_BIT, &ct->status);
nf_conntrack_event_cache(IPCT_ASSURED, ct);
}
nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
实际调用函数__nf_ct_refresh_acct更新超时时长。
static inline void nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
u32 extra_jiffies)
{
__nf_ct_refresh_acct(ct, ctinfo, skb, extra_jiffies, true);
}
对于冲突连接其超时时长固定为1秒钟,参见nf_ct_resolve_clash_harder函数,不进行超时时长的更新。否则更新连接的timeout值。
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb,
u32 extra_jiffies,
bool do_acct)
{
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
goto acct;
/* If not in hash table, timer will not be active yet */
if (nf_ct_is_confirmed(ct))
extra_jiffies += nfct_time_stamp;
if (READ_ONCE(ct->timeout) != extra_jiffies)
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
if (do_acct)
nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
连接超时处理
如果连接的超时时刻小于当前时刻,则认为连接已经超时。
static inline bool nf_ct_is_expired(const struct nf_conn *ct)
{
return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
}
static inline bool nf_ct_should_gc(const struct nf_conn *ct)
{
return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
!nf_ct_is_dying(ct);
}
函数nf_ct_kill负责删除连接相关结构,而函数nf_ct_put负责释放连接。在开始之前,需要先增加引用计数,保障nf_ct_kill的执行。
static void nf_ct_gc_expired(struct nf_conn *ct)
{
if (!atomic_inc_not_zero(&ct->ct_general.use))
return;
if (nf_ct_should_gc(ct))
nf_ct_kill(ct);
nf_ct_put(ct);
超时处理位置
连接回收函数中,定期回收超时的连接,其不超过1/8秒就会运行一次。
unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV;
遍历nf_conntrack_hash哈希链表,如果连接已经超时,进行回收nf_ct_gc_expired。如果连接设置了卸载标志IPS_OFFLOAD_BIT,不做超时处理。
static void gc_worker(struct work_struct *work)
{
unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u);
struct conntrack_gc_work *gc_work;
do {
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
struct nf_conn *tmp;
i++;
nf_conntrack_get_ht(&ct_hash, &hashsz);
if (i >= hashsz) i = 0;
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
struct net *net;
tmp = nf_ct_tuplehash_to_ctrack(h);
scanned++;
if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
nf_ct_offload_timeout(tmp);
continue;
}
if (nf_ct_is_expired(tmp)) {
nf_ct_gc_expired(tmp);
expired_count++;
continue;
}
对于连接卸载的情况,将连接的超时时长最小设置为24小时。
#define NF_CT_DAY (86400 * HZ)
/* Set an arbitrary timeout large enough not to ever expire, this save
* us a check for the IPS_OFFLOAD_BIT from the packet path via
* nf_ct_is_expired().
*/
static inline void nf_ct_offload_timeout(struct nf_conn *ct)
{
if (nf_ct_expires(ct) < NF_CT_DAY / 2)
ct->timeout = nfct_time_stamp + NF_CT_DAY;
}
在新建连接时,内核会先查找是否存在相同的连接,在遍历查找过程中,同时检测连接是否超时,进行超时处理。
static struct nf_conntrack_tuple_hash *
____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
const struct nf_conntrack_tuple *tuple, u32 hash)
{
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
unsigned int bucket, hsize;
begin:
nf_conntrack_get_ht(&ct_hash, &hsize);
bucket = reciprocal_scale(hash, hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
struct nf_conn *ct;
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_is_expired(ct)) {
nf_ct_gc_expired(ct);
continue;
}
if (nf_ct_key_equal(h, tuple, zone, net))
return h;
}
连接跟踪的NAT模块,在检测tuple是否已存在时,也需要遍历nf_conntrack_hash链表,所以在遍历过程中,也进行了连接超时处理。
/* Returns true if a connection correspondings to the tuple (required
for NAT). */
int
nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
const struct nf_conn *ignored_conntrack)
{
struct net *net = nf_ct_net(ignored_conntrack);
const struct nf_conntrack_zone *zone;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_head *ct_hash;
struct hlist_nulls_node *n;
zone = nf_ct_zone(ignored_conntrack);
begin:
nf_conntrack_get_ht(&ct_hash, &hsize);
hash = __hash_conntrack(net, tuple, hsize);
hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (ct == ignored_conntrack) continue;
if (nf_ct_is_expired(ct)) {
nf_ct_gc_expired(ct);
continue;
}
if (nf_ct_key_equal(h, tuple, zone, net)) {
内核版本 5.10