/*
 * try to put the fields that are referenced together in the same cacheline.
 * if you modify this structure, be sure to check block/blk-core.c:rq_init()
 * as well!
 */
struct request {
struct list_head queuelist; // list node linking this request into a queue
struct call_single_data csd;
struct request_queue *q; // the request_queue this request belongs to
unsigned int cmd_flags;
enum rq_cmd_type_bits cmd_type;
unsigned long atomic_flags;
int cpu;
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
sector_t __sector; /* sector cursor */ // next sector to transfer
struct bio *bio; // head of the request's list of bio structures
struct bio *biotail; // tail of the request's list of bio structures
struct hlist_node hash; /* merge hash */
/*
* The rb_node is only used inside the io scheduler, requests
* are pruned when moved to the dispatch queue. So let the
* completion_data share space with the rb_node.
*/
union {
struct rb_node rb_node; /* sort/lookup */
void *completion_data;
};
/*
* Three pointers are available for the IO schedulers, if they need
* more they have to dynamically allocate it.
*/
void *elevator_private;
void *elevator_private2;
void *elevator_private3;
struct gendisk *rq_disk;
unsigned long start_time;
#ifdef CONFIG_BLK_CGROUP
unsigned long long start_time_ns;
unsigned long long io_start_time_ns; /* when passed to hardware */
#endif
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
unsigned short nr_phys_segments; // number of discontiguous physical segments the request occupies in memory, i.e. the size of the scatter/gather list
#if defined(CONFIG_BLK_DEV_INTEGRITY)
unsigned short nr_integrity_segments;
#endif
unsigned short ioprio;
int ref_count; // reference count
void *special; /* opaque pointer available for LLD use */
char *buffer; /* kaddr of the current segment if available */ // kernel virtual address of the buffer that data is transferred to or from
int tag;
int errors;
/*
* when request is used as a packet command carrier
*/
unsigned char __cmd[BLK_MAX_CDB];
unsigned char *cmd;
unsigned short cmd_len;
unsigned int extra_len; /* length of alignment and padding */
unsigned int sense_len;
unsigned int resid_len; /* residual count */
void *sense;
unsigned long deadline;
struct list_head timeout_list;
unsigned int timeout;
int retries;
/*
* completion callback.
*/
rq_end_io_fn *end_io;
void *end_io_data;
/* for bidi */
struct request *next_rq;
};
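Since the comment above says __data_len and __sector must never be touched directly, drivers read them through accessor helpers instead. A minimal sketch assuming the accessors introduced around 2.6.31 (the helper function name is hypothetical):
#include <linux/blkdev.h>

/* hypothetical debug helper */
static void my_show_request(struct request *rq)
{
	sector_t pos    = blk_rq_pos(rq);	/* reads __sector */
	unsigned int nb = blk_rq_bytes(rq);	/* reads __data_len */

	printk(KERN_DEBUG "request at sector %llu, %u bytes, %u in current segment\n",
	       (unsigned long long)pos, nb, blk_rq_cur_bytes(rq));
}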
- 2. The request queue: struct request_queue
struct request_queue {
	/*
	 * Together with queue_head for cacheline sharing
	 */
	struct list_head	queue_head;
	struct request		*last_merge;
	struct elevator_queue	*elevator;

	/*
	 * the queue request freelist, one for reads and one for writes
	 */
	struct request_list	rq;

	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;
	prep_rq_fn		*prep_rq_fn;
	unprep_rq_fn		*unprep_rq_fn;
	unplug_fn		*unplug_fn;
	merge_bvec_fn		*merge_bvec_fn;
	softirq_done_fn		*softirq_done_fn;
	rq_timed_out_fn		*rq_timed_out_fn;
	dma_drain_needed_fn	*dma_drain_needed;
	lld_busy_fn		*lld_busy_fn;

	/*
	 * Dispatch queue sorting
	 */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/*
	 * Auto-unplugging state
	 */
	struct timer_list	unplug_timer;
	int			unplug_thresh;	/* After this many requests */
	unsigned long		unplug_delay;	/* After this many jiffies */
	struct work_struct	unplug_work;

	struct backing_dev_info	backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	gfp_t			bounce_gfp;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;

	/*
	 * protects queue structures from reentrancy. ->__queue_lock should
	 * _never_ be used directly, it is queue private. always use
	 * ->queue_lock.
	 */
	spinlock_t		__queue_lock;
	spinlock_t		*queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject		kobj;

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	void			*dma_drain_buffer;
	unsigned int		dma_drain_size;
	unsigned int		dma_pad_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;
	struct list_head	tag_busy_list;

	unsigned int		nr_sorted;
	unsigned int		in_flight[2];

	unsigned int		rq_timeout;
	struct timer_list	timeout;
	struct list_head	timeout_list;

	struct queue_limits	limits;

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * for flush operations
	 */
	unsigned int		flush_flags;
	unsigned int		flush_seq;
	int			flush_err;
	struct request		flush_rq;
	struct request		*orig_flush_rq;
	struct list_head	pending_flushes;

	struct mutex		sysfs_lock;

#if defined(CONFIG_BLK_DEV_BSG)
	struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
};
- a. Initializing a request queue
request_fn_proc *rfn,	// pointer to the request-handling function
spinlock_t *lock	// spinlock controlling access to the queue
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_queue);
struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
struct request_queue *uninit_q, *q;
uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
if (!uninit_q)
return NULL;
q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
if (!q)
blk_cleanup_queue(uninit_q);
return q;
}
EXPORT_SYMBOL(blk_init_queue_node);
b. Tearing down a request queue:
void blk_cleanup_queue(struct request_queue *q)
{
/*
* We know we have process context here, so we can be a little
* cautious and ensure that pending block actions on this device
* are done before moving on. Going into this function, we should
* not have processes doing IO to this device.
*/
blk_sync_queue(q);
del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
mutex_unlock(&q->sysfs_lock);
if (q->elevator)
elevator_exit(q->elevator);
blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);
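Taken together, a and b typically appear in a driver's module init/exit paths. A minimal sketch; my_request_fn, my_lock and my_queue are hypothetical names:
#include <linux/blkdev.h>
#include <linux/module.h>

static struct request_queue *my_queue;		/* hypothetical */
static DEFINE_SPINLOCK(my_lock);

static void my_request_fn(struct request_queue *q)
{
	/* drain q here; see sections d and e below */
}

static int __init my_init(void)
{
	my_queue = blk_init_queue(my_request_fn, &my_lock);
	if (!my_queue)
		return -ENOMEM;
	return 0;
}

static void __exit my_exit(void)
{
	blk_cleanup_queue(my_queue);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");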
- c. Allocating a request queue
struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, -1);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
	q->backing_dev_info.unplug_io_data = q;
	q->backing_dev_info.ra_pages =
			(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	q->backing_dev_info.state = 0;
	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
	q->backing_dev_info.name = "block";

	err = bdi_init(&q->backing_dev_info);
	if (err) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	if (blk_throtl_init(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
		    laptop_mode_timer_fn, (unsigned long) q);
	init_timer(&q->unplug_timer);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_LIST_HEAD(&q->pending_flushes);
	INIT_WORK(&q->unplug_work, blk_unplug_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	return q;
}
EXPORT_SYMBOL(blk_alloc_queue_node);
For fully random-access, non-mechanical devices such as flash or RAM, complex I/O scheduling brings no benefit. In that case, allocate a request queue with the function above and bind it to a "make request" function with the following:
/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:  the request queue for the device to be affected
 * @mfn: the alternate make_request function
 *
 * Description:
 *    The normal way for &struct bios to be passed to a device
 *    driver is for them to be collected into requests on a request
 *    queue, and then to allow the device driver to select requests
 *    off that queue when it is ready.  This works well for many block
 *    devices. However some block devices (typically virtual devices
 *    such as md or lvm) do not benefit from the processing on the
 *    request queue, and are served best by having the requests passed
 *    directly to them.  This can be achieved by providing a function
 *    to blk_queue_make_request().
 *
 * Caveat:
 *    The driver that does this *must* be able to deal appropriately
 *    with buffers in "highmemory". This can be accomplished by either calling
 *    __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
 *    blk_queue_bounce() to create a buffer in normal memory.
 **/
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
	/*
	 * set defaults
	 */
	q->nr_requests = BLKDEV_MAX_RQ;

	q->make_request_fn = mfn;
	blk_queue_dma_alignment(q, 511);
	blk_queue_congestion_threshold(q);
	q->nr_batching = BLK_BATCH_REQ;

	q->unplug_thresh = 4;			/* hmm */
	q->unplug_delay = msecs_to_jiffies(3);	/* 3 milliseconds */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;

	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;

	blk_set_default_limits(&q->limits);
	blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);

	/*
	 * If the caller didn't supply a lock, fall back to our embedded
	 * per-queue locks
	 */
	if (!q->queue_lock)
		q->queue_lock = &q->__queue_lock;

	/*
	 * by default assume old behaviour and bounce for any highmem page
	 */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
EXPORT_SYMBOL(blk_queue_make_request);
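A sketch of this no-scheduler path for a RAM-backed device, assuming the 2.6-era make_request_fn prototype (returning int); my_queue, my_make_request and my_xfer_bio are hypothetical names:
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/module.h>

static struct request_queue *my_queue;		/* hypothetical */

static int my_make_request(struct request_queue *q, struct bio *bio)
{
	/* transfer each segment directly; no request is ever queued */
	/* status = my_xfer_bio(bio); -- hypothetical worker */
	bio_endio(bio, 0);			/* complete the bio, 0 == success */
	return 0;
}

static int __init my_init(void)
{
	my_queue = blk_alloc_queue(GFP_KERNEL);
	if (!my_queue)
		return -ENOMEM;
	blk_queue_make_request(my_queue, my_make_request);
	return 0;
}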
d. Fetching a request
struct request *elv_next_request(request_queue_t *queue)
e. Removing a request from the queue
void blk_dequeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	BUG_ON(list_empty(&rq->queuelist));
	BUG_ON(ELV_ON_HASH(rq));

	list_del_init(&rq->queuelist);

	/*
	 * the time frame between a request being removed from the lists
	 * and to it is freed is accounted as io that is in progress at
	 * the driver side.
	 */
	if (blk_account_rq(rq)) {
		q->in_flight[rq_is_sync(rq)]++;
		set_io_start_time_ns(rq);
	}
}
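Sections d and e come together in a driver's request function. A minimal drain-loop sketch, assuming the post-2.6.31 API in which elv_next_request() was replaced by blk_peek_request()/blk_fetch_request() (the latter dequeues via blk_dequeue_request() internally); my_transfer() is a hypothetical helper:
#include <linux/blkdev.h>

static void my_request_fn(struct request_queue *q)
{
	struct request *req;

	while ((req = blk_fetch_request(q)) != NULL) {
		if (req->cmd_type != REQ_TYPE_FS) {
			/* not a filesystem request: fail it */
			__blk_end_request_all(req, -EIO);
			continue;
		}
		/* my_transfer(): hypothetical copy driven by blk_rq_pos(req),
		 * blk_rq_cur_sectors(req), req->buffer and rq_data_dir(req) */
		__blk_end_request_all(req, 0);
	}
}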
f. Starting and stopping the request queue
/**
 * blk_start_queue - restart a previously stopped queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *    blk_start_queue() will clear the stop flag on the queue, and call
 *    the request_fn for the queue if it was in a stopped state when
 *    entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q:    The &struct request_queue in question
 *
 * Description:
 *    The Linux block layer assumes that a block driver will consume all
 *    entries on the request queue when the request_fn strategy is called.
 *    Often this will not happen, because of hardware limitations (queue
 *    depth settings). If a device driver gets a 'queue full' response,
 *    or if it simply chooses not to queue more I/O at one point, it can
 *    call this function to prevent the request_fn from being called until
 *    the driver has signalled it's ready to go again. This happens by calling
 *    blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	blk_remove_plug(q);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
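A sketch of the usual pattern when the hardware reports queue-full: stop the queue from the request function, restart it from the completion interrupt. Both calls require the queue lock; struct my_dev and its hw_full flag are hypothetical:
#include <linux/blkdev.h>

struct my_dev {				/* hypothetical driver state */
	struct request_queue *queue;
	int hw_full;
};

/* in the request function the queue lock is already held */
static void my_request_fn(struct request_queue *q)
{
	struct my_dev *dev = q->queuedata;

	if (dev->hw_full) {		/* hypothetical 'queue full' test */
		blk_stop_queue(q);	/* request_fn won't be called again */
		return;
	}
	/* ... hand requests to the hardware ... */
}

/* from the completion interrupt, once the hardware has drained */
static void my_completion(struct my_dev *dev)
{
	unsigned long flags;

	spin_lock_irqsave(dev->queue->queue_lock, flags);
	blk_start_queue(dev->queue);	/* clears the stop flag, reruns the queue */
	spin_unlock_irqrestore(dev->queue->queue_lock, flags);
}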
- g. Parameter setting
h. Informing the kernel of device limits (a usage sketch follows this list):
blk_queue_bounce_limit()
blk_queue_segment_boundary()
blk_queue_dma_alignment()
blk_queue_hardsect_size()
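A sketch of typical limit announcements at queue-setup time. The values are illustrative only, and note that blk_queue_hardsect_size() was renamed blk_queue_logical_block_size() around 2.6.31:
#include <linux/blkdev.h>

static void my_set_limits(struct request_queue *my_queue)
{
	blk_queue_bounce_limit(my_queue, BLK_BOUNCE_HIGH); /* bounce any highmem page */
	blk_queue_segment_boundary(my_queue, 0xffffff);    /* no segment crosses 16MB */
	blk_queue_dma_alignment(my_queue, 511);            /* 512-byte aligned buffers */
	blk_queue_hardsect_size(my_queue, 512);            /* hardware sector size */
}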
3. I/O schedulers
The Linux 2.6 kernel includes four I/O schedulers:
No-op I/O scheduler
Anticipatory I/O scheduler: the default I/O scheduler in early 2.6 kernels.
Deadline I/O scheduler:
CFQ I/O scheduler:
In the kernel's block/ directory,
noop-iosched.c
as-iosched.c
deadline-iosched.c
cfq-iosched.c
implement the above scheduling algorithms, respectively.
The I/O scheduling algorithm can be selected by passing a boot parameter to the kernel, e.g.:
kernel elevator=deadline
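The scheduler can also be switched per device at runtime through sysfs; the device name sda below is only an example:
cat /sys/block/sda/queue/scheduler
echo deadline > /sys/block/sda/queue/scheduler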
4. bio
/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 */
struct bio {
	sector_t		bi_sector;	/* device address in 512 byte
						   sectors */ // first sector to transfer
	struct bio		*bi_next;	/* request queue link */ // next bio
	struct block_device	*bi_bdev;
	unsigned long		bi_flags;	/* status, command, etc */
	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
						 * top bits priority */
	unsigned short		bi_vcnt;	/* how many bio_vec's */
	unsigned short		bi_idx;		/* current index into bvl_vec */

	/* Number of segments in this BIO after
	 * physical address coalescing is performed.
	 */
	unsigned int		bi_phys_segments;	// number of discontiguous physical segments

	unsigned int		bi_size;	/* residual I/O count */ // data still to transfer, in bytes

	/*
	 * To keep track of the max segment size, we account for the
	 * sizes of the first and last mergeable segments in this bio.
	 */
	unsigned int		bi_seg_front_size;
	unsigned int		bi_seg_back_size;

	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */

	unsigned int		bi_comp_cpu;	/* completion CPU */

	atomic_t		bi_cnt;		/* pin count */

	struct bio_vec		*bi_io_vec;	/* the actual vec list */

	bio_end_io_t		*bi_end_io;

	void			*bi_private;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	struct bio_integrity_payload *bi_integrity;	/* data integrity */
#endif

	bio_destructor_t	*bi_destructor;	/* destructor */

	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[0];
};

/*
 * was unsigned short, but we might as well be ready for > 64kB I/O pages
 */
struct bio_vec {
	struct page	*bv_page;	// page pointer
	unsigned int	bv_len;		// number of bytes to transfer
	unsigned int	bv_offset;	// offset within the page
};
Kernel functions (macros) for operating on a bio:
a. int bio_data_dir(struct bio *bio) // returns the direction of the data transfer, READ or WRITE.
- /*
- * return data direction, READ or WRITE
- */
- #define bio_data_dir(bio) ((bio)->bi_rw & 1)
b. struct page *bio_page(struct bio *bio); // returns the current page pointer.
- #define bio_page(bio) bio_iovec((bio))->bv_page
c. int bio_offset(struct bio *bio) // returns the page offset of the current operation; block I/O itself is usually page-aligned.
- #define bio_offset(bio) bio_iovec((bio))->bv_offset
d. int bio_cur_sectors(struct bio *bio); // returns the number of sectors the current bio_vec transfers.
- #define bio_sectors(bio) ((bio)->bi_size >> 9) // this appears to have replaced it
e. char *bio_data(struct bio *bio); // returns the kernel virtual address of the data buffer.
static inline void *bio_data(struct bio *bio)
{
	if (bio->bi_vcnt)
		return page_address(bio_page(bio)) + bio_offset(bio);
return NULL;
}
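Put together, these accessors let a driver walk a bio one segment at a time. A minimal sketch using the 2.6-era bio_for_each_segment(), which iterates struct bio_vec entries starting at bi_idx (the helper name is hypothetical):
#include <linux/bio.h>

/* hypothetical: count the bytes this bio still has to transfer */
static unsigned int my_bio_bytes(struct bio *bio)
{
	struct bio_vec *bvec;
	unsigned int bytes = 0;
	int i;

	bio_for_each_segment(bvec, bio, i)
		bytes += bvec->bv_len;	/* each bvec is one contiguous chunk */

	return bytes;			/* matches the residual bio->bi_size */
}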
f. char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags);
void bvec_kunmap_irq(char *buffer, unsigned long *flags);
// returns a kernel virtual address that can be used to access the data buffer pointed to by the given bio_vec entry.
// it disables interrupts and returns an atomic kmap; interrupts must not be re-enabled until bvec_kunmap_irq() is called.
/*
 * remember never ever reenable interrupts between a bvec_kmap_irq and
 * bvec_kunmap_irq!
 */
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
	unsigned long addr;

	/*
	 * might not be a highmem page, but the preempt/irq count
	 * balancing is a lot nicer this way
	 */
	local_irq_save(*flags);
	addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);

	BUG_ON(addr & ~PAGE_MASK);

	return (char *) addr + bvec->bv_offset;
}

static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
	unsigned long ptr = (unsigned long) buffer & PAGE_MASK;

	kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
	local_irq_restore(*flags);
}

#else
static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
{
	return page_address(bvec->bv_page) + bvec->bv_offset;
}

static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
{
	*flags = 0;
}
#endif

static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
				   unsigned long *flags)
{
	return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
}

#define __bio_kunmap_irq(buf, flags)	bvec_kunmap_irq(buf, flags)

/* returns a mapping of the given bio's current bio_vec entry */
#define bio_kmap_irq(bio, flags) \
	__bio_kmap_irq((bio), (bio)->bi_idx, (flags))
#define bio_kunmap_irq(buf, flags)	__bio_kunmap_irq(buf, flags)
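A sketch of the usual pattern with these helpers: map one bio_vec, copy, unmap, keeping interrupts disabled in between. The helper name and destination buffer are hypothetical:
#include <linux/bio.h>
#include <linux/string.h>

/* hypothetical: copy one bio_vec's data into a driver buffer */
static void my_copy_bvec(struct bio_vec *bvec, char *dst)
{
	unsigned long flags;
	char *buf;

	buf = bvec_kmap_irq(bvec, &flags);	/* irqs stay disabled here */
	memcpy(dst, buf, bvec->bv_len);
	bvec_kunmap_irq(buf, &flags);		/* unmap, then irqs restored */
}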
g. char *__bio_kmap_atomic(struct bio *bio, int i, enum km_type type); // uses kmap_atomic() to return the kernel virtual address of the given bio's i-th buffer.
void __bio_kunmap_atomic(char *addr, enum km_type type); // releases a kernel virtual address obtained with __bio_kmap_atomic().
These appear to be gone as of the 2.6.37 kernel.
h. Reference counting on a bio:
void bio_get(struct bio *bio); // take a reference to the bio
void bio_put(struct bio *bio); // drop a reference to the bio
/**
 * bio_put - release a reference to a bio
 * @bio:   bio to release reference to
 *
 * Description:
 *   Put a reference to a &struct bio, either one you have gotten with
 *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
 **/
void bio_put(struct bio *bio)
{
	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));

	/*
	 * last put frees it
	 */
	if (atomic_dec_and_test(&bio->bi_cnt)) {
		bio->bi_next = NULL;
		bio->bi_destructor(bio);
	}
}
EXPORT_SYMBOL(bio_put);

/*
 * get a reference to a bio, so it won't disappear. the intended use is
 * something like:
 *
 * bio_get(bio);
 * submit_bio(rw, bio);
 * if (bio->bi_flags ...)
 *	do_something
 * bio_put(bio);
 *
 * without the bio_get(), it could potentially complete I/O before submit_bio
 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
 * runs
 */
#define bio_get(bio)	atomic_inc(&(bio)->bi_cnt)