linux磁盘IO打满时，write文件被卡住问题排查

ปรัชญา แค้วคำมูล

已于 2024-03-04 20:31:58 修改

阅读量837

点赞数 8

分类专栏： linux 文件系统性能优化文章标签： linux 性能优化

于 2024-01-23 17:36:26 首次发布

本文链接：https://blog.csdn.net/John_ToStr/article/details/135777542

版权

性能优化同时被 3 个专栏收录

20 篇文章 0 订阅

订阅专栏

linux

18 篇文章 0 订阅

订阅专栏

文件系统

1 篇文章 0 订阅

订阅专栏

文章分析了lazytime挂载选项对系统卡顿的优化效果，通过源码研究发现，虽然在某些调用栈中提及该选项，但在实际应用中并未观察到优化效果，因为关键判断条件未在特定路径上触发。同时讨论了磁盘分区对卡顿的影响，但相关内容暂未更新。

摘要由CSDN通过智能技术生成

目的

1. 确认lazytime挂载选项是否对卡顿有优化效果，剖析对应源码寻找关联触发关系。

2. 确认磁盘分区是否能够缓解卡顿

lazytime选项

源码分析

Xavier代码---对应内核版本4.19
/**
 *    do_remount_sb - asks filesystem to change mount options.
 *    @sb:    superblock in question
 *    @flags:    numeric part of options
 *    @data:    the rest of options
 *      @force: whether or not to force the change
 *
 *    Alters the mount options of a mounted file system.
 */
int do_remount_sb(struct super_block *sb, int flags, void *data, int force)

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int flags, int mnt_flags,
              void *data)

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 */
long do_mount(const char *dev_name, const char __user *dir_name,
        const char *type_page, unsigned long flags, void *data_page)
  
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
        char __user *, type, unsigned long, flags, void __user *, data)

static int ext4_remount(struct super_block *sb, int *flags, char *data) data参数存储了挂载选项
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
                substring_t *args, unsigned long *journal_devnum,
                unsigned int *journal_ioprio, int is_remount);
case Opt_lazytime: sb->s_flags |= MS_LAZYTIME;  这行修改了flag


触发调用位置1，对应调用栈如下(这个调用栈目前抓不到了，可能已经解决了，或者磁盘状态对此影响因素较多)：
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff80080863bc>] __switch_to+0x9c/0xc0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff8008353698>] do_get_write_access+0x2d8/0x5b0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff80083539d0>] jbd2_journal_get_write_access+0x60/0x80
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff8008336ecc>] __ext4_journal_get_write_access+0x54/0x90
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff8008305448>] ext4_reserve_inode_write+0x98/0xb8
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff80083054a8>] ext4_mark_inode_dirty+0x40/0x250
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800830ae80>] ext4_dirty_inode+0x50/0x78
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff8008292344>] __mark_inode_dirty+0x54/0x4a0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800827c108>] generic_update_time+0x70/0xb0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800827c498>] file_update_time+0xc0/0xf8
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff80081c5e8c>] __generic_file_write_iter+0x8c/0x1c0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff80082fbd58>] ext4_file_write_iter+0xd0/0x310
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800825b298>] __vfs_write+0xd0/0x118
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800825c204>] vfs_write+0xac/0x1b0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800825d83c>] SyS_write+0x54/0xb0
begin_io[1678435754675283], now[1678435754777350], stack: [<ffffff800808395c>] __sys_trace+0x4c/0x4c

/*
 * Kernel 4.19 excerpt: apply the timestamp/version updates selected by
 * @flags to @inode, then mark the inode dirty.
 *
 * @inode: inode whose times are updated
 * @time:  value copied into each selected timestamp field
 * @flags: bitmask of S_ATIME / S_VERSION / S_CTIME / S_MTIME
 *
 * Always returns 0.
 */
int generic_update_time(struct inode *inode, struct timespec *time, int flags)
{
    /* Start with the timestamp-only dirty state. */
    int iflags = I_DIRTY_TIME;

    if (flags & S_ATIME)
    inode->i_atime = *time;
    if (flags & S_VERSION)
    inode_inc_iversion(inode);
    if (flags & S_CTIME)
    inode->i_ctime = *time;
    if (flags & S_MTIME)
    inode->i_mtime = *time;

    /*
     * I_DIRTY_SYNC is added unless the superblock has MS_LAZYTIME set and
     * no version bump was requested; this is the one place in this call
     * chain where the lazytime flag is consulted.
     */
    if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) 这里用到了作为判断，很显然lazytime选项跟这个有关系
        iflags |= I_DIRTY_SYNC;
    __mark_inode_dirty(inode, iflags);
    return 0;
}

触发调用位置2，对应的调用栈：如下图

在抓到的调用栈中排查函数内部，没有找到判断s_flags & MS_LAZYTIME的位置；观察调用卡顿基本也没有改善

结论：lazytime针对现在的调用栈没有优化效果，代码没有关联关系！




ORIN代码---对应内核版本5.10，后续更新分析过程
static int ext4_fill_super(struct super_block *sb, void *data, int silent) 中设置sb对象
struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) 这个函数中调用上面函数指针并传入对应的参数信息


这个函数才是真正触发的地方
/*
 * Handle remount.
 *
 * Kernel 5.10 excerpt: forwards a reconfigure request to the filesystem's
 * remount_fs() superblock operation.
 *
 * @fc: filesystem context; the superblock is taken from fc->root->d_sb and
 *      the legacy option data from fc->fs_private.
 *
 * Returns 0 when the filesystem provides no remount_fs operation, otherwise
 * whatever remount_fs() returns.
 */
static int legacy_reconfigure(struct fs_context *fc)
{
    struct legacy_fs_context *ctx = fc->fs_private;
    struct super_block *sb = fc->root->d_sb;

    /* Nothing to do for filesystems without a remount hook. */
    if (!sb->s_op->remount_fs)
        return 0;

    /* Pass the flags by pointer; the data argument is NULL when there is no legacy context. */
    return sb->s_op->remount_fs(sb, &fc->sb_flags,
                    ctx ? ctx->legacy_data : NULL);
}

/**
 * reconfigure_super - asks filesystem to change superblock parameters
 * @fc: The superblock and configuration
 *
 * Alters the configuration parameters of a live superblock.
 */
int reconfigure_super(struct fs_context *fc)

/*
 * change filesystem flags. dir should be a physical root of filesystem.
 * If you've mounted a non-root directory somewhere and want to do remount
 * on it - tough luck.
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
              int mnt_flags, void *data)

int path_mount(const char *dev_name, struct path *path,
        const char *type_page, unsigned long flags, void *data_page)
  
long do_mount(const char *dev_name, const char __user *dir_name,
        const char *type_page, unsigned long flags, void *data_page)

write+fsync 相比仅 write，卡顿时间更长一些

工控机

实车

基于以上加lazytime挂载选项

工控机

实车

磁盘分区

待更新！

测试代码

#include <atomic>

#include <stdio.h>
#include <string.h>
#include <time.h>

#include <fcntl.h>
#include <pthread.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>

#define Latency_Threshold 100000  // in microseconds: an I/O stalled for more than 100 ms triggers recording of stack info
#define Check_Interval 1000       // in microseconds (passed to usleep): check for a stall once every 1 ms

// Stall statistics accumulated by the checker thread.
// NOTE(review): max_delay is an int but is assigned from an unsigned long
// microsecond difference in thread_check_background — confirm the range is acceptable.
typedef struct IOINFO {
    int max_delay; // global: largest I/O delay recorded so far that has not yet been queried
    unsigned int delayCount; // cumulative count of stalls above the threshold
} IOINFO;

/*
 * Build a dated log file path of the form "<prefix><suffix>_YYYY_MM_DD.log"
 * using the current local date.
 *
 * file_path: output buffer; must hold at least 256 bytes (the size of the
 *            internal scratch buffer that is copied into it).
 * prefix:    leading part of the path. It becomes part of a strftime()
 *            format, so any '%' in it is interpreted as a conversion.
 * suffix:    appended directly after prefix (same '%' caveat).
 *
 * On any failure (prefix + suffix too long for the 100-byte format buffer,
 * localtime() failure, or the formatted name not fitting in 256 bytes)
 * file_path is left as the empty string instead of overflowing a buffer or
 * copying indeterminate data.
 */
void get_log_file_name(char* file_path, const char* prefix, const char *suffix)
{
    char format[100];
    char new_name[256];

    file_path[0] = '\0';

    /* "%%" keeps the date conversions literal for the later strftime() call. */
    int n = snprintf(format, sizeof(format), "%s%s_%%Y_%%m_%%d.log", prefix, suffix);
    if (n < 0 || (size_t)n >= sizeof(format))
        return;

    time_t now = time(NULL);
    struct tm *local = localtime(&now);
    if (local == NULL)
        return;

    /* strftime() returns 0 when the result does not fit; new_name is then unusable. */
    if (strftime(new_name, sizeof(new_name), format, local) == 0)
        return;

    strcpy(file_path, new_name);
}

// Path of the file the main loop writes to; filled in by IOBlockTest().
char write_per_sec_log[256];
// Path of the stack log opened by thread_check_background(); filled in by IOBlockTest().
char stack_log[256];
// open() flags for the write target: read/write, create if missing.
const int fd_mode = O_RDWR | O_CREAT;

// Stall statistics updated by the checker thread.
IOINFO info = {0, 0};

/*
    The program runs continuously. How do we tell in which time window a stall
    happened, given that the monitor prints information once per second?
    If stalls happened within that 1 s, is it better to report how many there
    were, or the largest one?
*/

// Shared between the writer and the checker thread: a microsecond timestamp
// shifted left by one, with bit 0 == 0 for "I/O started" and bit 0 == 1 for
// "I/O finished" (see the store() calls in IOBlockTest).
std::atomic<unsigned long> timeUs;


// Wall-clock time in microseconds, read with gettimeofday().
// The value follows the system clock, not a monotonic clock: if the system
// time is adjusted (for example by NTP synchronisation), the value returned
// here changes accordingly.
unsigned long get_timestamp_us() {
    struct timeval now;
    gettimeofday(&now, NULL);
    unsigned long seconds_as_us = (unsigned long)now.tv_sec * 1000000;
    return (unsigned long)(seconds_as_us + now.tv_usec);
}

// CLOCK_MONOTONIC reading; the returned unit is microseconds.
unsigned long get_monotonic_us_posix() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    unsigned long whole_seconds_us = ((unsigned long)now.tv_sec) * 1000000;
    unsigned long sub_second_us = now.tv_nsec / 1000;
    return whole_seconds_us + sub_second_us;
}

// TODO: check for time jumps — compare the physical time delta with the
// system time delta (take priority / CPU scheduling into account)
// 1. crystal: CPU crystal-oscillator time
// 2. system time
// 3. crystal
//
// CLOCK_REALTIME reading, in microseconds.
unsigned long get_realtime_us_posix() {
    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    unsigned long whole_seconds_us = ((unsigned long)now.tv_sec) * 1000000;
    unsigned long sub_second_us = now.tv_nsec / 1000;
    return whole_seconds_us + sub_second_us;
}

// Copy the stack information found in the file stack_filename into fw,
// prefixing every line with the begin_io / now timestamps.
//
// fw:             open output stream for the stack log
// stack_filename: path of the file to read
// begin_io, now:  timestamps printed in front of each copied line
// line, line_cap: caller-supplied scratch buffer and its capacity
//
// If the file cannot be opened, a single "fr == NULL" line is written and the
// function returns (previously it went on to use the NULL stream).
// Lines are emitted only when fgets() actually read one, so the last line is
// no longer written twice at end of file.
void record_stack(FILE *fw, char *stack_filename, unsigned long begin_io, unsigned long now, char *line, int line_cap) {
    FILE *fr = fopen(stack_filename, "r");
    if (fr == NULL) {
        fprintf(fw, "begin_io[%lu], now[%lu], fr == NULL\n", begin_io, now);
        return;
    }

    while (fgets(line, line_cap, fr) != NULL) {
        fprintf(fw, "begin_io[%lu], now[%lu], stack: %s", begin_io, now, line);
    }
    fclose(fr);
}

// Write the closing record for one I/O to fw: its total duration
// (end_io - begin_io) followed by both timestamps, then a blank line.
void record_stack_end(FILE *fw, unsigned long begin_io, unsigned long end_io) {
    const unsigned long total = end_io - begin_io;
    fprintf(fw, "io end. total[%lu], begin_io[%lu], end_io[%lu]\n\n",
            total, begin_io, end_io);
}

// Checker thread: loops forever polling the atomic timeUs that the writer
// thread updates with the start / end time of each I/O.
//   - once a start time is seen, keep checking how long the current I/O has taken
//   - once the end time is seen, record the result and reset the check state
//
// arg: thread id of the writer thread (cast to void *), used to locate
//      /proc/self/task/<tid>/stack.
// Returns NULL only when the stack log cannot be opened; otherwise never returns.
//
// Fixes relative to the previous version:
//   - the "open failed" message goes to stderr instead of the NULL stream
//   - no stall is reported while timeUs still holds its initial value 0
//   - a backwards clock step cannot underflow into a bogus huge latency
//   - the log is flushed after every record, because the process is normally
//     stopped by killing it and buffered records would otherwise be lost
void* thread_check_background(void *arg) {
    FILE *fw = fopen(stack_log, "w+");
    if(fw == NULL) {
        fprintf(stderr, "open stack.log failed\n");
        return NULL;
    }

    unsigned long main_pid = (unsigned long)arg;
    char stack_filename[100];
    snprintf(stack_filename, sizeof(stack_filename), "/proc/self/task/%lu/stack", main_pid);

    char line[1024];

    // t[0]: start time of the I/O being tracked (0 = none), t[1]: its end time.
    unsigned long t[2] = {0, 0};

    while(1) {
        unsigned long tmp = timeUs.load();
        /*
            Why the index can be taken from bit 0, and the main logic:
            : index = 1 -> tmp carries an end time   (writer stored (end << 1) + 1)
            : index = 0 -> tmp carries a begin time  (writer stored begin << 1)
                1. tmp holds either the begin time of a write or the end time
                   after the write finished.
                2. This loop runs more often than the writer writes, so it can
                   poll repeatedly within one write and refine the time granularity.
                Three cases:
                    1> the write finished before it could be observed: only an
                       end time is seen while t[0] == 0; the sample is dropped
                       and treated as normal
                    2> both begin and end were observed and t[1] - t[0] is
                       within the threshold (100 ms)
                    3> t[1] - t[0] exceeds the threshold: record the time
                       points and the delay
        */
        unsigned int index = tmp & 1;
        unsigned long stamp = tmp >> 1;

        if(stamp == t[index]) { // two consecutive checks saw the same value
            // Same begin time twice: the write may be stuck. t[0] == 0 means
            // no begin time has been captured yet, so there is nothing to measure.
            if(index == 0 && t[0] != 0) {
                unsigned long now_us = get_timestamp_us();
                if(now_us > t[0] && now_us - t[0] > Latency_Threshold) { // this I/O is stalled
                    record_stack(fw, stack_filename, t[0], now_us, line, sizeof(line));
                    fflush(fw);
                }
            }
        } else if(index == 0) {
            // New begin time: start tracking this I/O.
            t[0] = stamp;
        } else {
            if(t[0] != 0) {  // both t[0] and t[1] are valid
                t[1] = stamp;
                unsigned long elapsed = t[1] - t[0];
                if(elapsed > Latency_Threshold) {
                    info.delayCount++;
                    if(elapsed > (unsigned long)info.max_delay) {
                        info.max_delay = (int)elapsed;
                    }
                    record_stack_end(fw, t[0], t[1]);
                    fflush(fw);
                }
            }
            // Reset the check state. With t[0] == 0 this point is reached when:
            // 1. the write was fast and finished before its begin time was observed
            // 2. the state was already reset and timeUs still holds the old end time
            //    while the writer sleeps
            t[0] = 0;
            t[1] = 0;
        }
        usleep(Check_Interval); // Check_Interval microseconds (1 ms)
    }
}

// Run the write-stall test: repeatedly write + fsync one short line to a dated
// log file under prefix, publishing the begin / end time of every I/O through
// timeUs so that thread_check_background can detect stalls.
//
// prefix: directory prefix for the two log files (write log and stack log).
// Returns 0 in every case (unchanged from the previous version); failures are
// reported on stdout / stderr.
//
// Changes relative to the previous version:
//   - the output file is opened before the checker thread is started, so a
//     failed open no longer leaves a stray thread behind
//   - write() / fsync() failures are reported and end the loop instead of being
//     silently ignored forever; close(fd) is therefore reachable
//   - the line is formatted with a bounded snprintf()
int IOBlockTest(char* prefix) {
    char write_suffix[] = "/write_sec_per";
    char stack_suffix[] = "/stack";
    get_log_file_name(write_per_sec_log, prefix, write_suffix);
    get_log_file_name(stack_log, prefix, stack_suffix);

    int fd = open(write_per_sec_log, fd_mode, 0666);
    if(fd == -1) {
        printf("fd == -1. open %s failed\n", write_per_sec_log);
        return 0;
    }

    // The checker reads /proc/self/task/<tid>/stack of this (writer) thread.
    unsigned long pid = syscall(SYS_gettid);
    pthread_t thread;
    if (pthread_create(&thread, NULL, thread_check_background, (void*)pid) != 0) {
        printf("pthread_create failed\n");
        close(fd);
        return 0;
    }

    char buf[100];
    int failed = 0;
    while(!failed) {
        unsigned long now1 = get_realtime_us_posix();

        // Publish the begin time: bit 0 == 0.
        timeUs.store(now1<<1);

        /*
            The time-consuming I/O being measured.
        */
        int size_n = snprintf(buf, sizeof(buf), "realtime: %lu monotonic: %lu \n", now1, get_monotonic_us_posix());
        if (size_n < 0) {
            size_n = 0;
        } else if ((size_t)size_n >= sizeof(buf)) {
            size_n = (int)sizeof(buf) - 1;
        }

        if (write(fd, buf, (size_t)size_n) < 0) {
            perror("write");
            failed = 1;
        } else if (fsync(fd) != 0) {
            // fsync() does not return until the data for fd has been transferred
            // to the storage device or an error is detected.
            perror("fsync");
            failed = 1;
        }

        // Publish the end time: bit 0 == 1.
        unsigned long now2 = get_realtime_us_posix();
        timeUs.store((now2<<1)+1);

        usleep(100);  // 100 us pause between writes
    }
    close(fd);
    // The checker thread is not joined; it keeps running until the process exits.
    return 0;
}

// Entry point: run the stall test with the log files placed under "./".
int main()
{
  // IOBlockTest takes a non-const char *, so pass a writable array rather than
  // a string literal (converting a literal to char * is ill-formed in C++11).
  char prefix[] = "./";
  IOBlockTest(prefix);
  return 0;
}