linux-4.4.6内核，接收UDP大包，导致网络阻塞、卡死

u010936265

已于 2023-05-19 09:55:58 修改

阅读量1.4k

点赞数 1

分类专栏： linux网络文章标签：网络 linux udp

于 2023-05-02 11:45:26 首次发布

本文链接：https://blog.csdn.net/u010936265/article/details/130454300

版权

linux网络专栏收录该内容

5 篇文章 0 订阅

订阅专栏

简介

记录分片包占用内存大小的变量——struct percpu_counter;

简介

对struct percpu_counter进行加 / 减和读的接口函数

网络协议栈操作struct percpu_counter的函数和阈值

查看当前系统下分片包占用内存的数量

命令

/proc/net/sockstat相关代码

出问题时的现象：查看分片包占用内存的数量是0

网络协议栈接收分片包的大致流程

数理分析

解决方法

简介

linux内核网络协议栈在发送大于MTU的UDP数据包时，会将大包拆成小包发出去，对应的，在接收端，网络协议栈会将小包组合、还原成原来的大包。

本文将“分段后的 IP 封包”称为分片包。

在接收端，收到分片包后，会先分配内存空间用于存放收到的分片包。等到一个大包的所有分片包都接收完整后，执行组包操作，将其组合、还原成原来的大包，并将分片包占用的内存释放。

需要注意的是，内核协议栈会限制接收到的分片包占用内存的数量。限制的条件如下：

ipfrag_high_thresh - INTEGER
    Maximum memory used to reassemble IP fragments. When 
    ipfrag_high_thresh bytes of memory is allocated for this purpose,
    the fragment handler will toss packets until ipfrag_low_thresh
    is reached. This also serves as a maximum limit to namespaces
    different from the initial one.

ipfrag_low_thresh - INTEGER
    Maximum memory used to reassemble IP fragments before the kernel
    begins to remove incomplete fragment queues to free up resources.
    The kernel still accepts new fragments for defragmentation.

在kernel-4.4.6下，内核协议栈使用了struct percpu_counter结构体来记录分片包占用内存的数量。网络阻塞、卡死问题就出在对struct percpu_counter的使用上。本文将分析具体机理。

请注意：新版内核已经用原子变量替换 struct percpu_counter来记录分片包占用内存的数量。

记录分片包占用内存大小的变量——struct percpu_counter;

简介

/* Fragment reassembly bookkeeping: the counter that tracks how much memory
 * queued fragments occupy, plus the sysctl-controlled timeout and thresholds.
 */
struct netns_frags {
    /* The percpu_counter "mem" need to be cacheline aligned.
     *  mem.count must not share cacheline with other writers
     */
    struct percpu_counter   mem ____cacheline_aligned_in_smp;  // tracks the amount of memory occupied by fragments

    /* sysctls */
    int         timeout;
    int         high_thresh;
    int         low_thresh;
};

struct percpu_counter的定义如下：

/* Per-CPU counter: a global total plus one temporary value per processor.
 * Each processor accumulates into its own temporary value first; the
 * temporary value is folded into the total once it reaches the batch
 * threshold.
 */
struct percpu_counter {
    raw_spinlock_t lock;    /* protects the global total (count) */
    s64 count;              /* global total */
#ifdef CONFIG_HOTPLUG_CPU
    struct list_head list;  /* All percpu_counters are on a list */
#endif
    s32 __percpu *counters; /* per-CPU temporary values, one per processor */
};

每处理器计数器 (struct percpu_counter) 的设计思想是：计数器有一个总的计数值 count ，每个处理器有一个临时计数值 counters ，每个处理器先把计数累加到自己的临时计数值，当临时计数值达到或超过阈值 ( 当前是 130000) 的时候，把临时计数值累加到总的计数值。

成员 count 是总的计数值，成员 lock 用来保护总的计数值，成员 counters 指向每处理器变量，每个处理器对应一个临时计数值。

《Linux 内核深度解析》P486

对struct percpu_counter进行加 / 减和读的接口函数

/* Add @amount to the per-CPU counter @fbc.
 *
 * @fbc:    counter to update
 * @amount: signed delta
 * @batch:  threshold at which this CPU's local value is folded into
 *          fbc->count
 *
 * The new local value (local + amount) is folded into fbc->count, under
 * fbc->lock, only when it is >= batch or <= -batch. Otherwise only this
 * CPU's local value changes and fbc->count is left untouched.
 */
void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)  // addition and subtraction share this one function (pass a negative amount to subtract)
{
    s64 count;

    preempt_disable();
    /* What this CPU's local value would become after applying amount. */
    count = __this_cpu_read(*fbc->counters) + amount;
    if (count >= batch || count <= -batch) {
        unsigned long flags;
        raw_spin_lock_irqsave(&fbc->lock, flags);
        /* Fold the whole local value, including amount, into the total. */
        fbc->count += count;
        /* count - amount is the local value read above; subtract it back out. */
        __this_cpu_sub(*fbc->counters, count - amount);
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
    } else {
        /* Below the threshold in both directions: stay CPU-local. */
        this_cpu_add(*fbc->counters, amount);
    }   
    preempt_enable();
}
EXPORT_SYMBOL(__percpu_counter_add);


/* Return only the global total fbc->count. The per-CPU values in
 * fbc->counters are not consulted, so deltas that have not been folded
 * into count yet are not reflected in the result.
 */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
    return fbc->count;
}

网络协议栈操作struct percpu_counter的函数和阈值

/* Memory Tracking Functions. */

/* The default percpu_counter batch size is not big enough to scale to
 * fragmentation mem acct sizes.
 * The mem size of a 64K fragment is approx:
 *  (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
 *
 * This value is passed as the batch argument of __percpu_counter_add() by
 * add_frag_mem_limit() and sub_frag_mem_limit().
 */
static unsigned int frag_percpu_counter_batch = 130000;

/* Return the fragment memory usage as seen through percpu_counter_read(),
 * i.e. only nf->mem.count. Per-CPU deltas not yet folded into count are not
 * included, so the result can differ from the true total.
 * The s64 result of percpu_counter_read() is returned as int.
 */
static inline int frag_mem_limit(struct netns_frags *nf)
{
    return percpu_counter_read(&nf->mem);
}

/* Subtract i from the fragment memory counter nf->mem by adding -i with the
 * batch size frag_percpu_counter_batch.
 */
static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
{
    __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
}

/* Add i to the fragment memory counter nf->mem with the batch size
 * frag_percpu_counter_batch.
 */
static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
{
    __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
}

查看当前系统下分片包占用内存的数量

命令

# cat /proc/net/sockstat
sockets: used 1093
TCP: inuse 11 orphan 0 tw 0 alloc 15 mem 2
UDP: inuse 12 mem 3
UDPLITE: inuse 0
RAW: inuse 0
FRAG: inuse 0 memory 0        //分片包占用内存的数量

/proc/net/sockstat相关代码

/*
 *  Report socket allocation statistics [mea@utu.fi]
 *
 *  Writes the TCP, UDP, UDPLITE, RAW and FRAG lines to @seq (after calling
 *  socket_seq_show()). Always returns 0.
 */
static int sockstat_seq_show(struct seq_file *seq, void *v) 
{
    struct net *net = seq->private;
    unsigned int frag_mem;
    int orphans, sockets;

    /* Both sums are taken with bottom halves disabled. */
    local_bh_disable();
    orphans = percpu_counter_sum_positive(&tcp_orphan_count);
    sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
    local_bh_enable();

    socket_seq_show(seq);
    seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
           sock_prot_inuse_get(net, &tcp_prot), orphans,
           atomic_read(&tcp_death_row.tw_count), sockets,
           proto_memory_allocated(&tcp_prot));
    seq_printf(seq, "UDP: inuse %d mem %ld\n",
           sock_prot_inuse_get(net, &udp_prot),
           proto_memory_allocated(&udp_prot));
    seq_printf(seq, "UDPLITE: inuse %d\n",
           sock_prot_inuse_get(net, &udplite_prot));
    seq_printf(seq, "RAW: inuse %d\n",
           sock_prot_inuse_get(net, &raw_prot));
    frag_mem = ip_frag_mem(net);        // ******************* fetch the amount of memory occupied by fragments
    /* "inuse" is printed as !!frag_mem, i.e. 0 or 1, not a queue count. */
    seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
    return 0;
}

ip_frag_mem()函数相关代码如下：

//net/ipv4/ip_fragment.c
/* Return the IPv4 fragment memory usage of @net, obtained from
 * sum_frag_mem_limit() on net->ipv4.frags.
 */
int ip_frag_mem(struct net *net)
{
    return sum_frag_mem_limit(&net->ipv4.frags);
}

//include/net/inet_frag.h
/* Return percpu_counter_sum_positive(&nf->mem), evaluated with bottom halves
 * disabled. Unlike frag_mem_limit(), this does not go through
 * percpu_counter_read().
 * NOTE(review): percpu_counter_sum_positive() is defined outside this
 * excerpt; presumably it adds every per-CPU value to count and clamps a
 * negative result to 0 -- confirm against lib/percpu_counter.c.
 */
static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
{
    unsigned int res;

    local_bh_disable();
    res = percpu_counter_sum_positive(&nf->mem);
    local_bh_enable();

    return res;
}

出问题时的现象：查看分片包占用内存的数量是0

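在接收UDP大包导致阻塞、卡死时，通过/proc/net/sockstat文件看到分片包占用的内存数量是0。需要注意：/proc/net/sockstat 是通过 percpu_counter_sum_positive() 把 count 和各个核的 counters 全部累加后得到的数值；而网络协议栈判断是否超过阈值时使用的 frag_mem_limit() 只读取 count。因此即使这里显示为0，协议栈看到的 count 仍然可能大于 high_thresh。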

网络协议栈接收分片包的大致流程

数理分析

由于struct percpu_counter的特性，网络协议栈在接收大、小包混合的UDP数据时，会偶发阻塞、卡死。举例如下：

        假设high_thresh 的值是 40960，并且当前没有接收到任何分片包数据(struct percpu_counter的count=0, counters=0)。
        马上，0号核收到了 105000 字节大小的一批分片包，1号核收到了总长 100000 字节的一批分片包，即:
                count= 0
                counters(0号核)= 105000
                counters(1号核)= 100000

        紧接着，0号核接收到了 30000 字节的一批分片包，30000 + 105000 = 135000，135000大于阈值 130000，所以要把0号核的counters的值累加到 count，并且0号核的counters清零，即:
                count= 135000   (count > high_thresh，停止接收新的封包的分片包)
                counters(0号核)= 0
                counters(1号核)= 100000

        而0号核刚刚接收到的30000字节的分片包与之前接收到的分片包正好可以组成一个完整的封包，组成这个封包的分片包总大小是120000字节。组完包后要释放这部分内存，0号核的counters变成 0 - 120000 = -120000（没有达到 -130000，所以不会累加到 count），即：
                count= 135000   (count > high_thresh，停止接收新的封包的分片包)
                counters(0号核)= -120000
                counters(1号核)= 100000

        然后，1号核接收到2000字节的一批分片包，即：
                count= 135000   (count > high_thresh，停止接收新的封包的分片包)
                counters(0号核)= -120000
                counters(1号核)= 102000

        而1号核刚刚接收到的2000字节的分片包，正好可以和之前已经接收到的、剩下的所有分片包组成一个完整的封包。现存的所有分片包大小是117000字节，在组完包后，1号核的counters要减去117000，即：
                count= 135000   (count > high_thresh，停止接收新的封包的分片包)
                counters(0号核)= -120000
                counters(1号核)= 102000 - 117000 = -15000