Linux块设备驱动(四)————块设备的数据结构与相关操作及I/O调度器

0、数据从内存到磁盘的过程
内存是一个线性的结构,Linux系统将内存分为页。一页最大可以是64KB,但是目前主流系统的页大小都是4KB。每一页的数据会先被封装成一个段,用bio_vec表示。多个页会被封装成多个段,这些段组成一个以bio_vec为元素的数组,这个数组由bio结构中的bi_io_vec指针指向。
bi_io_vec是bio中的一个指针。一个或者多个bio会组成一个request请求描述符。request将被连接到请求队列request_queue中,或者被合并到请求队列request_queue中已有的request里。合并的条件是两个request请求所表示的扇区位置相邻。最后这个请求队列被处理,将数据写入磁盘。
这里写图片描述

1、块I/O请求(bio)
数据从内存到磁盘或者从磁盘到内存的过程,叫做I/O操作。内核使用一个核心数据结构bio来描述I/O操作。bio结构通过bi_io_vec指针指向一个段数组(数组元素为bio_vec),这些段中的数据就是要操作的数据。


/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 *
 * Describes one block I/O operation: a starting sector, the target block
 * device, and a vector of bio_vec segments (bi_io_vec) holding the data.
 */
struct bio {
    /* First sector to transfer. */
    sector_t        bi_sector;  /* device address in 512 byte
                           sectors */
    /* Link to another bio (not a sector number); used to chain bios. */
    struct bio      *bi_next;   /* request queue link */
    struct block_device *bi_bdev;   /* block device this bio is for */
    unsigned long       bi_flags;   /* status, command, etc */
    unsigned long       bi_rw;      /* bottom bits READ/WRITE,
                         * top bits priority
                         */

    unsigned short      bi_vcnt;    /* how many bio_vec's */
    unsigned short      bi_idx;     /* current index into bvl_vec */

    /* Number of segments in this BIO after
     * physical address coalescing is performed.
     */
    unsigned short      bi_phys_segments;

    /* Number of segments after physical and DMA remapping
     * hardware coalescing is performed.
     */
    unsigned short      bi_hw_segments;
    unsigned int        bi_size;    /* residual I/O count */
    /*
     * To keep track of the max hw size, we account for the
     * sizes of the first and last virtually mergeable segments
     * in this bio
     */
    unsigned int        bi_hw_front_size;
    unsigned int        bi_hw_back_size;
    unsigned int        bi_max_vecs;    /* max bvl_vecs we can hold */
    struct bio_vec      *bi_io_vec; /* the actual vec list */
    /* Callback of type bio_end_io_t; presumably run when the I/O ends -- confirm. */
    bio_end_io_t        *bi_end_io;
    atomic_t        bi_cnt;     /* pin count */
    /* Opaque pointer; presumably owned by whoever created the bio -- confirm. */
    void            *bi_private;
    bio_destructor_t    *bi_destructor; /* destructor */
};

这里写图片描述

与bio相关的宏

/*用于获取目前的页指针*/
 bio_page(bio)  
/*用于获取目前的页的偏移*/
bio_offset(bio) 
/*用于获取当前段中的扇区数*/
bio_cur_sectors(bio)

2、请求结构(request)
几个连续的页面会组成一个bio结构,几个相邻的bio结构就会组成一个请求结构(request)。这样就不需要大幅度移动磁头了,节省了I/O操作的时间。


/*
 * try to put the fields that are referenced together in the same cacheline
 */
 /*
  * Request structure: groups bios (see the bio/biotail fields) together with
  * the sector bookkeeping, scheduler data and completion callback below.
  */
struct request {
    struct list_head queuelist; /* list node; per the original note, links into the request_queue list */
    struct list_head donelist;

    request_queue_t *q;

    unsigned int cmd_flags;
    enum rq_cmd_type_bits cmd_type;

    /* Maintain bio traversal state for part by part I/O submission.
     * hard_* are block layer internals, no driver should touch them!
     */
    /* Sector position used for submission (see field comment). */
    sector_t sector;        /* next sector to submit */
    /* Sector position used for completion tracking (see field comment). */
    sector_t hard_sector;       /* next sector to complete */
    unsigned long nr_sectors;   /* no. of sectors left to submit */
    unsigned long hard_nr_sectors;  /* no. of sectors left to complete */
    /* no. of sectors left to submit in the current segment */
    unsigned int current_nr_sectors;

    /* no. of sectors left to complete in the current segment */
    unsigned int hard_cur_sectors;
    struct bio *bio; /* points to the first bio that has not been completed */
    struct bio *biotail;/* last bio in the request's bio list */
    struct hlist_node hash; /* merge hash */
    /*
     * The rb_node is only used inside the io scheduler, requests
     * are pruned when moved to the dispatch queue. So let the
     * completion_data share space with the rb_node.
     */
    union {
        struct rb_node rb_node; /* sort/lookup */
        void *completion_data;
    };

    /*
     * two pointers are available for the IO schedulers, if they need
     * more they have to dynamically allocate it.
     */
    void *elevator_private;  /* I/O scheduler private data, slot 1 */
    void *elevator_private2;/* I/O scheduler private data, slot 2 */
    struct gendisk *rq_disk;  /* disk this request is directed at */
    unsigned long start_time;

    /* Number of scatter-gather DMA addr+len pairs after
     * physical address coalescing is performed.
     */
    unsigned short nr_phys_segments;/* number of physical segments in the request */

    /* Number of scatter-gather addr+len pairs after
     * physical and DMA remapping hardware coalescing is performed.
     * This is the number of scatter-gather entries the driver
     * will actually have to deal with after DMA mapping is done.
     */
    unsigned short nr_hw_segments;
    unsigned short ioprio;
    void *special;
    char *buffer;
    int tag;
    int errors;
    int ref_count;

    /*
     * when request is used as a packet command carrier
     */
    unsigned int cmd_len;
    unsigned char cmd[BLK_MAX_CDB];

    unsigned int data_len;
    unsigned int sense_len;
    void *data;
    void *sense;

    unsigned int timeout;
    int retries;

    /*
     * completion callback.
     */
    rq_end_io_fn *end_io;
    void *end_io_data;
};

3、请求队列(request_queue)
请求队列主要用来连接对同一个块设备的多个request请求结构。它还包含块设备所支持的请求类型信息、请求的个数、段的大小、硬件扇区大小等与设备相关的信息。


/*
 * Request queue: per the original note, the kernel designs request_queue
 * around a doubly linked list (queue_head) that links request structures.
 * It also carries the driver callbacks, queue limits and flush state
 * declared below.
 */
struct request_queue
{
    /*
     * Together with queue_head for cacheline sharing
     */
    struct list_head    queue_head;/* links to request structures;
                                represents the pending requests */
    struct request      *last_merge;
    elevator_t      *elevator;  /* pointer to the elevator (I/O scheduling) algorithm */

    /*
     * the queue request freelist, one for reads and one for writes
     */
    struct request_list rq;/* data structure used when allocating request descriptors */

    /* Function that implements the driver's request handling. */
    request_fn_proc     *request_fn;
    /*
     * Original note: method that inserts a new request into the request queue.
     * NOTE(review): the signature of make_request_fn is not visible here --
     * confirm whether it receives a request or a bio.
     */
    make_request_fn     *make_request_fn;
    prep_rq_fn      *prep_rq_fn;
    unplug_fn       *unplug_fn;
    merge_bvec_fn       *merge_bvec_fn;
    issue_flush_fn      *issue_flush_fn;
    prepare_flush_fn    *prepare_flush_fn;
    softirq_done_fn     *softirq_done_fn;

    /*
     * Dispatch queue sorting
     */
    sector_t        end_sector;
    struct request      *boundary_rq;

    /*
     * Auto-unplugging state
     */
    struct timer_list   unplug_timer;
    int         unplug_thresh;  /* After this many requests */
    unsigned long       unplug_delay;   /* After this many jiffies */
    struct work_struct  unplug_work;

    struct backing_dev_info backing_dev_info;

    /*
     * The queue owner gets to use this for whatever they like.
     * ll_rw_blk doesn't touch it.
     */
    void            *queuedata;/* pointer to the block device driver's private data */

    /*
     * queue needs bounce pages for pages above this limit
     */
    unsigned long       bounce_pfn;
    gfp_t           bounce_gfp;

    /*
     * various queue flags, see QUEUE_* below
     */
    unsigned long       queue_flags;

    /*
     * protects queue structures from reentrancy. ->__queue_lock should
     * _never_ be used directly, it is queue private. always use
     * ->queue_lock.
     */
    spinlock_t      __queue_lock;
    spinlock_t      *queue_lock;

    /*
     * queue kobject
     */
    struct kobject kobj;

    /*
     * queue settings
     */
    unsigned long       nr_requests;    /* Max # of requests */
    unsigned int        nr_congestion_on;
    unsigned int        nr_congestion_off;
    unsigned int        nr_batching;

    unsigned int        max_sectors;
    unsigned int        max_hw_sectors;
    unsigned short      max_phys_segments;
    unsigned short      max_hw_segments;
    unsigned short      hardsect_size;
    unsigned int        max_segment_size;

    unsigned long       seg_boundary_mask;
    unsigned int        dma_alignment;

    struct blk_queue_tag    *queue_tags;

    unsigned int        nr_sorted;
    unsigned int        in_flight;

    /*
     * sg stuff
     */
    unsigned int        sg_timeout;
    unsigned int        sg_reserved_size;
    int         node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
    struct blk_trace    *blk_trace;
#endif
    /*
     * reserved for flush operations
     */
    unsigned int        ordered, next_ordered, ordseq;
    int         orderr, ordcolor;
    struct request      pre_flush_rq, bar_rq, post_flush_rq;
    struct request      *orig_bar_rq;
    unsigned int        bi_size;

    struct mutex        sysfs_lock;
};

4、总结
请求队列(request_queue)、请求结构(request)、bio等之间的关系
这里写图片描述
这里写图片描述

5、四种调度算法(电梯算法)
内核需要一种调度,使物理相邻的请求尽可能先后执行,这样就可以减少寻找扇区的时间,这种调度就叫做I/O调度。
这里写图片描述
Linux内核提供了四种I/O调度算法:预期算法(Anticipatory)、最后期限算法(Deadline)、完全公平队列算法(CFQ)和无操作算法(Noop)。

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值