/*
 * try to put the fields that are referenced together in the same cacheline.
 * if you modify this structure, be sure to check block/blk-core.c:rq_init()
 * as well!
 */
struct request {
struct list_head queuelist; // list node linking this request into a queue
struct call_single_data csd;
struct request_queue *q; // the request_queue this request belongs to
unsigned int cmd_flags;
enum rq_cmd_type_bits cmd_type;
unsigned long atomic_flags;
int cpu;
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
sector_t __sector; /* sector cursor */ // next sector to transfer
struct bio *bio; // head of the request's list of bio structures
struct bio *biotail; // tail of the request's list of bio structures
struct hlist_node hash; /* merge hash */
/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
* completion_data share space with the rb_node.
*/
union {
struct rb_node rb_node; /* sort/lookup */
void *completion_data;
};
/*
* Three pointers are available for the IO schedulers, if they need
* more they have to dynamically allocate it.
*/
void *elevator_private;
void *elevator_private2;
void *elevator_private3;
struct gendisk *rq_disk;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
unsigned long long io_start_time_ns; /* when passed to hardware */
#endif
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
unsigned short nr_phys_segments; // number of discontiguous physical segments the request occupies in memory, i.e. the size of the scatter/gather list
#if defined(CONFIG_BLK_DEV_INTEGRITY)
unsigned short nr_integrity_segments;
#endif
unsigned short ioprio;
int ref_count; // reference count
void *special; /* opaque pointer available for LLD use */
char *buffer; /* kaddr of the current segment if available */ // kernel virtual address of the buffer that data is transferred to or from
int tag;
int errors;
/*
* when request is used as a packet command carrier
*/
unsigned char __cmd[BLK_MAX_CDB];
unsigned char *cmd;
unsigned short cmd_len;
unsigned int extra_len; /* length of alignment and padding */
unsigned int sense_len;
unsigned int resid_len; /* residual count */
void *sense;
unsigned long deadline;
struct list_head timeout_list;
unsigned int timeout;
int retries;
/*
* completion callback.
*/
rq_end_io_fn *end_io;
void *end_io_data;
/* for bidi */
struct request *next_rq;
};
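Since the comment above says __data_len and __sector must never be touched directly, drivers read them through accessor helpers instead. A minimal sketch assuming the accessors introduced around 2.6.31 (the helper function name is hypothetical):
#include <linux/blkdev.h>

/* hypothetical debug helper */
static void my_show_request(struct request *rq)
{
	sector_t pos    = blk_rq_pos(rq);	/* reads __sector */
	unsigned int nb = blk_rq_bytes(rq);	/* reads __data_len */

	printk(KERN_DEBUG "request at sector %llu, %u bytes, %u in current segment\n",
	       (unsigned long long)pos, nb, blk_rq_cur_bytes(rq));
}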
- 2. The request queue: struct request_queue
struct request_queue {
	/*
	 * Together with queue_head for cacheline sharing
	 */
	struct list_head	queue_head;
	struct request		*last_merge;
	struct elevator_queue	*elevator;

	/*
	 * the queue request freelist, one for reads and one for writes
	 */
	struct request_list	rq;

	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;
	prep_rq_fn		*prep_rq_fn;
	unprep_rq_fn		*unprep_rq_fn;
	unplug_fn		*unplug_fn;
	merge_bvec_fn		*merge_bvec_fn;
	softirq_done_fn		*softirq_done_fn;
	rq_timed_out_fn		*rq_timed_out_fn;
	dma_drain_needed_fn	*dma_drain_needed;
	lld_busy_fn		*lld_busy_fn;

	/*
	 * Dispatch queue sorting
	 */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/*
	 * Auto-unplugging state
	 */
	struct timer_list	unplug_timer;
	int			unplug_thresh;	/* After this many requests */
	unsigned long		unplug_delay;	/* After this many jiffies */
	struct work_struct	unplug_work;

	struct backing_dev_info	backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	gfp_t			bounce_gfp;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;

	/*
	 * protects queue structures from reentrancy. ->__queue_lock should
	 * _never_ be used directly, it is queue private. always use
	 * ->queue_lock.
	 */
	spinlock_t		__queue_lock;
	spinlock_t		*queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject		kobj;

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	void			*dma_drain_buffer;
	unsigned int		dma_drain_size;
	unsigned int		dma_pad_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;
	struct list_head	tag_busy_list;

	unsigned int		nr_sorted;
	unsigned int		in_flight[2];

	unsigned int		rq_timeout;
	struct timer_list	timeout;
	struct list_head	timeout_list;

	struct queue_limits	limits;

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * for flush operations
	 */
	unsigned int		flush_flags;
	unsigned int		flush_seq;
	int			flush_err;
	struct request		flush_rq;
	struct request		*orig_flush_rq;
	struct list_head	pending_flushes;

	struct mutex		sysfs_lock;

#if defined(CONFIG_BLK_DEV_BSG)
	struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
};
- a. Initializing a request queue
request_fn_proc *rfn,	// pointer to the request-handling function
spinlock_t *lock	// spinlock controlling access to the queue
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_queue);
struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
struct request_queue *uninit_q, *q;
uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
if (!uninit_q)
return NULL;
q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
if (!q)
blk_cleanup_queue(uninit_q);
return q;
}
EXPORT_SYMBOL(blk_init_queue_node);
b. Tearing down a request queue:
void blk_cleanup_queue(struct request_queue *q)
{
/*
* We know we have process context here, so we can be a little
* cautious and ensure that pending block actions on this device
* are done before moving on. Going into this function, we should
* not have processes doing IO to this device.
*/
blk_sync_queue(q);
del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
mutex_unlock(&q->sysfs_lock);
if (q->elevator)
elevator_exit(q->elevator);
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);
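Taken together, a and b typically appear in a driver's module init/exit paths. A minimal sketch; my_request_fn, my_lock and my_queue are hypothetical names:
#include <linux/blkdev.h>
#include <linux/module.h>

static struct request_queue *my_queue;		/* hypothetical */
static DEFINE_SPINLOCK(my_lock);

static void my_request_fn(struct request_queue *q)
{
	/* drain q here; see sections d and e below */
}

static int __init my_init(void)
{
	my_queue = blk_init_queue(my_request_fn, &my_lock);
	if (!my_queue)
		return -ENOMEM;
	return 0;
}

static void __exit my_exit(void)
{
	blk_cleanup_queue(my_queue);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");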
- c. Allocating a request queue
struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, -1);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
	q->backing_dev_info.unplug_io_data = q;
	q->backing_dev_info.ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	q->backing_dev_info.name = "block";

	err = bdi_init(&q->backing_dev_info);
	if (err) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	if (blk_throtl_init(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	init_timer(&q->unplug_timer);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->pending_flushes);
	INIT_WORK(&q->unplug_work, blk_unplug_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	return q;
}
EXPORT_SYMBOL(blk_alloc_queue_node);
For fully random-access, non-mechanical devices such as flash or RAM, complex I/O scheduling brings no benefit. In that case, allocate a request queue with the function above and bind it to a "make request" function with the following:
/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:  the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * Description:
 *    The normal way for &struct bios to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices. However some block devices (typically virtual devices
 *    such as md or lvm) do not benefit from the processing on the
 *    request queue, and are served best by having the requests passed
 *    directly to them.  This can be achieved by providing a function
 *    to blk_queue_make_request().
 *
 * Caveat:
 *    The driver that does this *must* be able to deal appropriately
 *    with buffers in "highmemory". This can be accomplished by either calling
 *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
 *    blk_queue_bounce() to create a buffer in normal memory.
 **/
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
	/*
	 * set defaults
	 */
	q->nr_requests = BLKDEV_MAX_RQ;

	q->make_request_fn = mfn;
	blk_queue_dma_alignment(q, 511);
	blk_queue_congestion_threshold(q);
	q->nr_batching = BLK_BATCH_REQ;

	q->unplug_thresh = 4;			/* hmm */
	q->unplug_delay = msecs_to_jiffies(3);	/* 3 milliseconds */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;

	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;

	blk_set_default_limits(&q->limits);
	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);

	/*
	 * If the caller didn't supply a lock, fall back to our embedded
	 * per-queue locks
	 */
	if (!q->queue_lock)
		q->queue_lock = &q->__queue_lock;

	/*
	 * by default assume old behaviour and bounce for any highmem page
	 */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
EXPORT_SYMBOL(blk_queue_make_request);
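A sketch of this no-scheduler path for a RAM-backed device, assuming the 2.6-era make_request_fn prototype (returning int); my_queue, my_make_request and my_xfer_bio are hypothetical names:
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>

static struct request_queue *my_queue;		/* hypothetical */

static int my_make_request(struct request_queue *q, struct bio *bio)
{
	/* transfer each segment directly; no request is ever queued */
	/* status = my_xfer_bio(bio); -- hypothetical worker */
	bio_endio(bio, 0);			/* complete the bio, 0 == success */
	return 0;
}

static int __init my_init(void)
{
	my_queue = blk_alloc_queue(GFP_KERNEL);
	if (!my_queue)
		return -ENOMEM;
	blk_queue_make_request(my_queue, my_make_request);
	return 0;
}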
d. Fetching a request
struct request *elv_next_request(request_queue_t *queue)
e. Removing a request from the queue
void blk_dequeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	BUG_ON(list_empty(&rq->queuelist));
	BUG_ON(ELV_ON_HASH(rq));

	list_del_init(&rq->queuelist);

	/*
	 * the time frame between a request being removed from the lists
	 * and to it is freed is accounted as io that is in progress at
	 * the driver side.
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]++;
		set_io_start_time_ns(rq);
	}
}
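Sections d and e come together in a driver's request function. A minimal drain-loop sketch, assuming the post-2.6.31 API in which elv_next_request() was replaced by blk_peek_request()/blk_fetch_request() (the latter dequeues via blk_dequeue_request() internally); my_transfer() is a hypothetical helper:
#include <linux/blkdev.h>

static void my_request_fn(struct request_queue *q)
{
	struct request *req;

	while ((req = blk_fetch_request(q)) != NULL) {
		if (req->cmd_type != REQ_TYPE_FS) {
			/* not a filesystem request: fail it */
			__blk_end_request_all(req, -EIO);
			continue;
		}
		/* my_transfer(): hypothetical copy driven by blk_rq_pos(req),
		 * blk_rq_cur_sectors(req), req->buffer and rq_data_dir(req) */
		__blk_end_request_all(req, 0);
	}
}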
f. Starting and stopping the request queue
/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *    blk_start_queue() will clear the stop flag on the queue, and call
 *    the request_fn for the queue if it was in a stopped state when
 *    entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *    The Linux block layer assumes that a block driver will consume all
 *    entries on the request queue when the request_fn strategy is called.
 *    Often this will not happen, because of hardware limitations (queue
 *    depth settings). If a device driver gets a 'queue full' response,
 *    or if it simply chooses not to queue more I/O at one point, it can
 *    call this function to prevent the request_fn from being called until
 *    the driver has signalled it's ready to go again. This happens by calling
 *    blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	blk_remove_plug(q);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
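A sketch of the usual pattern when the hardware reports queue-full: stop the queue from the request function, restart it from the completion interrupt. Both calls require the queue lock; struct my_dev and its hw_full flag are hypothetical:
#include <linux/blkdev.h>

struct my_dev {				/* hypothetical driver state */
	struct request_queue *queue;
	int hw_full;
};

/* in the request function the queue lock is already held */
static void my_request_fn(struct request_queue *q)
{
	struct my_dev *dev = q->queuedata;

	if (dev->hw_full) {		/* hypothetical 'queue full' test */
		blk_stop_queue(q);	/* request_fn won't be called again */
		return;
	}
	/* ... hand requests to the hardware ... */
}

/* from the completion interrupt, once the hardware has drained */
static void my_completion(struct my_dev *dev)
{
	unsigned long flags;

	spin_lock_irqsave(dev->queue->queue_lock, flags);
	blk_start_queue(dev->queue);	/* clears the stop flag, reruns the queue */
	spin_unlock_irqrestore(dev->queue->queue_lock, flags);
}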
- g. Parameter setting
h. Informing the kernel of device limits (a usage sketch follows this list):
blk_queue_bounce_limit()
blk_queue_segment_boundary()
blk_queue_dma_alignment()
blk_queue_hardsect_size()
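A sketch of typical limit announcements at queue-setup time. The values are illustrative only, and note that blk_queue_hardsect_size() was renamed blk_queue_logical_block_size() around 2.6.31:
#include <linux/blkdev.h>

static void my_set_limits(struct request_queue *my_queue)
{
	blk_queue_bounce_limit(my_queue, BLK_BOUNCE_HIGH); /* bounce any highmem page */
	blk_queue_segment_boundary(my_queue, 0xffffff);    /* no segment crosses 16MB */
	blk_queue_dma_alignment(my_queue, 511);            /* 512-byte aligned buffers */
	blk_queue_hardsect_size(my_queue, 512);            /* hardware sector size */
}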
3. I/O schedulers
The Linux 2.6 kernel includes four I/O schedulers:
No-op I/O scheduler
Anticipatory I/O scheduler: the default I/O scheduler in early 2.6 kernels.
Deadline I/O scheduler:
CFQ I/O scheduler:
In the kernel's block/ directory,
noop-iosched.c
as-iosched.c
deadline-iosched.c
cfq-iosched.c
implement the above scheduling algorithms, respectively.
The I/O scheduling algorithm can be selected by passing a boot parameter to the kernel, e.g.:
kernel elevator=deadline
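The scheduler can also be switched per device at runtime through sysfs; the device name sda below is only an example:
cat /sys/block/sda/queue/scheduler
echo deadline > /sys/block/sda/queue/scheduler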
4. bio
/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 */
struct bio {
	sector_t		bi_sector;	/* device address in 512 byte
						   sectors */ // first sector to transfer
	struct bio		*bi_next;	/* request queue link */ // next bio
	struct block_device	*bi_bdev;
	unsigned long		bi_flags;	/* status, command, etc */
	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
						 * top bits priority */
	unsigned short		bi_vcnt;	/* how many bio_vec's */
	unsigned short		bi_idx;		/* current index into bvl_vec */

	/* Number of segments in this BIO after
	 * physical address coalescing is performed.
	 */
	unsigned int		bi_phys_segments;	// number of discontiguous physical segments

	unsigned int		bi_size;	/* residual I/O count */ // data still to transfer, in bytes

	/*
	 * To keep track of the max segment size, we account for the
	 * sizes of the first and last mergeable segments in this bio.
	 */
	unsigned int		bi_seg_front_size;
	unsigned int		bi_seg_back_size;

	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */

	unsigned int		bi_comp_cpu;	/* completion CPU */

	atomic_t		bi_cnt;		/* pin count */

	struct bio_vec		*bi_io_vec;	/* the actual vec list */

	bio_end_io_t		*bi_end_io;

	void			*bi_private;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	struct bio_integrity_payload *bi_integrity;	/* data integrity */
#endif

	bio_destructor_t	*bi_destructor;	/* destructor */

	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[0];
};

/*
 * was unsigned short, but we might as well be ready for > 64kB I/O pages
 */
struct bio_vec {
	struct page	*bv_page;	// page pointer
	unsigned int	bv_len;		// number of bytes to transfer
	unsigned int	bv_offset;	// offset within the page
};
Kernel functions (macros) for operating on a bio:
a. int bio_data_dir(struct bio *bio) // returns the direction of the data transfer, READ or WRITE.
- /*
- * return data direction, READ or WRITE
- */
- #define bio_data_dir(bio) ((bio)->bi_rw & 1)
b. struct page *bio_page(struct bio *bio); // returns the current page pointer.
- #define bio_page(bio) bio_iovec((bio))->bv_page
c. int bio_offset(struct bio *bio) // returns the page offset of the current operation; block I/O itself is usually page-aligned.
- #define bio_offset(bio) bio_iovec((bio))->bv_offset
d. int bio_cur_sectors(struct bio *bio); // returns the number of sectors the current bio_vec transfers.
- #define bio_sectors(bio) ((bio)->bi_size >> 9) // this appears to have replaced it
e. char *bio_data(struct bio *bio); // returns the kernel virtual address of the data buffer.
static inline void *bio_data(struct bio *bio)
{
	if (bio->bi_vcnt)
		return page_address(bio_page(bio)) + bio_offset(bio);
return NULL;
}
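Put together, these accessors let a driver walk a bio one segment at a time. A minimal sketch using the 2.6-era bio_for_each_segment(), which iterates struct bio_vec entries starting at bi_idx (the helper name is hypothetical):
#include <linux/bio.h>

/* hypothetical: count the bytes this bio still has to transfer */
static unsigned int my_bio_bytes(struct bio *bio)
{
	struct bio_vec *bvec;
	unsigned int bytes = 0;
	int i;

	bio_for_each_segment(bvec, bio, i)
		bytes += bvec->bv_len;	/* each bvec is one contiguous chunk */

	return bytes;			/* matches the residual bio->bi_size */
}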
f. char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags);
void bvec_kunmap_irq(char *buffer, unsigned long *flags);
// returns a kernel virtual address that can be used to access the data buffer pointed to by the given bio_vec entry.
// it disables interrupts and returns an atomic kmap; interrupts must not be re-enabled until bvec_kunmap_irq() is called.
/*
 * remember never ever reenable interrupts between a bvec_kmap_irq and
 * bvec_kunmap_irq!
 */
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
	unsigned long addr;

	/*
	 * might not be a highmem page, but the preempt/irq count
	 * balancing is a lot nicer this way
	 */
	local_irq_save(*flags);
	addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);

	BUG_ON(addr & ~PAGE_MASK);

	return (char *) addr + bvec->bv_offset;
}

static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
	unsigned long ptr = (unsigned long) buffer & PAGE_MASK;

	kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
	local_irq_restore(*flags);
}

#else
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
	return page_address(bvec->bv_page) + bvec->bv_offset;
}

static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
	*flags = 0;
}
#endif

static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
				   unsigned long *flags)
{
	return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
}

#define __bio_kunmap_irq(buf, flags)	bvec_kunmap_irq(buf, flags)

/* returns a mapping of the given bio's current bio_vec entry */
#define bio_kmap_irq(bio, flags) \
	__bio_kmap_irq((bio), (bio)->bi_idx, (flags))
#define bio_kunmap_irq(buf, flags)	__bio_kunmap_irq(buf, flags)
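A sketch of the usual pattern with these helpers: map one bio_vec, copy, unmap, keeping interrupts disabled in between. The helper name and destination buffer are hypothetical:
#include <linux/bio.h>
#include <linux/string.h>

/* hypothetical: copy one bio_vec's data into a driver buffer */
static void my_copy_bvec(struct bio_vec *bvec, char *dst)
{
	unsigned long flags;
	char *buf;

	buf = bvec_kmap_irq(bvec, &flags);	/* irqs stay disabled here */
	memcpy(dst, buf, bvec->bv_len);
	bvec_kunmap_irq(buf, &flags);		/* unmap, then irqs restored */
}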
g. char *__bio_kmap_atomic(struct bio *bio, int i, enum km_type type); // uses kmap_atomic() to return the kernel virtual address of the given bio's i-th buffer.
void __bio_kunmap_atomic(char *addr, enum km_type type); // releases a kernel virtual address obtained with __bio_kmap_atomic().
These appear to be gone as of the 2.6.37 kernel.
h. Reference counting on a bio:
void bio_get(struct bio *bio); // take a reference to the bio
void bio_put(struct bio *bio); // drop a reference to the bio
/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}
EXPORT_SYMBOL(bio_put);

/*
 * get a reference to a bio, so it won't disappear. the intended use is
 * something like:
 *
 * bio_get(bio);
 * submit_bio(rw, bio);
 * if (bio->bi_flags ...)
 *	do_something
 * bio_put(bio);
 *
 * without the bio_get(), it could potentially complete I/O before submit_bio
 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
 * runs
 */
#define bio_get(bio)	atomic_inc(&(bio)->bi_cnt)