1 struct bio 表示一次块设备的IO请求。
2 struct request是bio提交给IO调度器(elevator)后产生的数据,一个request中存放着顺序排列的bio。当设备提交bio给IO调度器时,新的bio可能被合并到request_queue中已有的request结构中(甚至合并到已有的bio中),或者生成新的request。request表示等待处理的块设备IO请求,结构里有sector等信息;通过queuelist把自己挂在它依附的request_queue上。
3 struct request_queue每个物理设备对应一个,是request所形成的list,其结构中有一些函数指针。
一 主要数据结构
1 gendisk
- struct gendisk {
- /* major, first_minor and minors are input parameters only,
- * don't use directly. Use disk_devt() and disk_max_parts().
- */
- int major; /* major number of driver */
- int first_minor;
- int minors; /* maximum number of minors, =1 for
- * disks that can't be partitioned. */
- char disk_name[DISK_NAME_LEN]; /* name of major driver */
- char *(*devnode)(struct gendisk *gd, umode_t *mode);
- unsigned int events; /* supported events */
- unsigned int async_events; /* async events, subset of all */
- /* Array of pointers to partitions indexed by partno.
- * Protected with matching bdev lock but stat and other
- * non-critical accesses use RCU. Always access through
- * helpers.
- */
- struct disk_part_tbl __rcu *part_tbl;
- struct hd_struct part0;
- const struct block_device_operations *fops;
- struct request_queue *queue;
- void *private_data;
- int flags;
- struct device *driverfs_dev; // FIXME: remove
- struct kobject *slave_dir;
- struct timer_rand_state *random;
- atomic_t sync_io; /* RAID */
- struct disk_events *ev;
- #ifdef CONFIG_BLK_DEV_INTEGRITY
- struct blk_integrity *integrity;
- #endif
- int node_id;
- };
major:块设备的主设备号。
first_minor:起始次设备号。
minors:描述了该块设备有多少个次设备号,或者说有多少个分区,如果minors为1,则表示该块设备没有分区。
part_tbl:整个块设备的分区信息都包含在里面,其核心结构是一个struct hd_struct的指针数组,每一项都指向一个描述分区的hd_struct结构。
fops:指向特定于设备的底层操作函数集。
queue:块设备的请求队列,所有针对该设备的请求都会放入该请求队列中,经过I/O scheduler的处理再进行提交。
2 hd_struct
保存一个分区信息,包括起始扇区,扇区数,分区号等基本信息。- struct hd_struct {
- sector_t start_sect;
- /*
- * nr_sects is protected by sequence counter. One might extend a
- * partition while IO is happening to it and update of nr_sects
- * can be non-atomic on 32bit machines with 64bit sector_t.
- */
- sector_t nr_sects;
- seqcount_t nr_sects_seq;
- sector_t alignment_offset;
- unsigned int discard_alignment;
- struct device __dev;
- struct kobject *holder_dir;
- int policy, partno;
- struct partition_meta_info *info;
- #ifdef CONFIG_FAIL_MAKE_REQUEST
- int make_it_fail;
- #endif
- unsigned long stamp;
- atomic_t in_flight[2];
- #ifdef CONFIG_SMP
- struct disk_stats __percpu *dkstats;
- #else
- struct disk_stats dkstats;
- #endif
- atomic_t ref;
- struct rcu_head rcu_head;
- };
3 disk_part_tbl
磁盘的分区表。- struct disk_part_tbl {
- struct rcu_head rcu_head;
- int len;
- struct hd_struct __rcu *last_lookup;
- struct hd_struct __rcu *part[];
- };
4 block_device
可以是整个磁盘,也可以是一个分区。如果是一个分区块设备,则bd_contains会指向分区所在磁盘的block_device,bd_part则指向分区信息结构hd_struct。- struct block_device {
- dev_t bd_dev; /* not a kdev_t - it's a search key */
- int bd_openers;
- struct inode * bd_inode; /* will die */
- struct super_block * bd_super;
- struct mutex bd_mutex; /* open/close mutex */
- struct list_head bd_inodes;
- void * bd_claiming;
- void * bd_holder;
- int bd_holders;
- bool bd_write_holder;
- #ifdef CONFIG_SYSFS
- struct list_head bd_holder_disks;
- #endif
- struct block_device * bd_contains;
- unsigned bd_block_size;
- struct hd_struct * bd_part;
- /* number of times partitions within this device have been opened. */
- unsigned bd_part_count;
- int bd_invalidated;
- struct gendisk * bd_disk;
- struct request_queue * bd_queue;
- struct list_head bd_list;
- /*
- * Private data. You must have bd_claim'ed the block_device
- * to use this. NOTE: bd_claim allows an owner to claim
- * the same device multiple times, the owner must take special
- * care to not mess up bd_private for that case.
- */
- unsigned long bd_private;
- /* The counter of freeze processes */
- int bd_fsfreeze_count;
- /* Mutex for freeze */
- struct mutex bd_fsfreeze_mutex;
- };
bd_dev:该设备(分区)的设备号。
bd_inode:指向该设备文件的inode。
bd_openers:一个引用计数,记录了该块设备打开的次数,或者说有多少个进程打开了该设备。
bd_contains:如果该block_device描述的是一个分区,则该变量指向描述主块设备的block_device,反之,其指向本身。
bd_part:如果该block_device描述的是一个分区,则该变量指向分区的信息。
bd_part_count:如果是分区,该变量记录了分区被打开的次数,在进行分区的重新扫描前,要保证该计数值为0。
bd_disk:指向描述整个设备的gendisk结构。
5 buffer_head
在内核层对块设备的IO请求是以块为单位的。buffer_head是一个块在内存中的元数据信息。b_data指向该块数据的实际地址。b_this_page则将同一个page中的块连接成一个循环链表。以前版本的buffer_head是fs到block device的io请求单元,现在已经改为bio了。- struct buffer_head {
- unsigned long b_state; /* buffer state bitmap (see above) */
- struct buffer_head *b_this_page;/* circular list of page's buffers */
- struct page *b_page; /* the page this bh is mapped to */
- sector_t b_blocknr; /* start block number */
- size_t b_size; /* size of mapping */
- char *b_data; /* pointer to data within the page */
- struct block_device *b_bdev;
- bh_end_io_t *b_end_io; /* I/O completion */
- void *b_private; /* reserved for b_end_io */
- struct list_head b_assoc_buffers; /* associated with another mapping */
- struct address_space *b_assoc_map; /* mapping this buffer is
- associated with */
- atomic_t b_count; /* users using this buffer_head */
- };
6 bio
bio封装了一次实际的块设备io请求。这是块设备io请求的基本单位。bi_vcnt表示bio_vec的数目。- struct bio {
- sector_t bi_sector; /* device address in 512 byte
- sectors */
- struct bio *bi_next; /* request queue link */
- struct block_device *bi_bdev;
- unsigned long bi_flags; /* status, command, etc */
- unsigned long bi_rw; /* bottom bits READ/WRITE,
- * top bits priority
- */
- unsigned short bi_vcnt; /* how many bio_vec's */
- unsigned short bi_idx; /* current index into bvl_vec */
- /* Number of segments in this BIO after
- * physical address coalescing is performed.
- */
- unsigned int bi_phys_segments;
- unsigned int bi_size; /* residual I/O count */
- /*
- * To keep track of the max segment size, we account for the
- * sizes of the first and last mergeable segments in this bio.
- */
- unsigned int bi_seg_front_size;
- unsigned int bi_seg_back_size;
- bio_end_io_t *bi_end_io;
- void *bi_private;
- #ifdef CONFIG_BLK_CGROUP
- /*
- * Optional ioc and css associated with this bio. Put on bio
- * release. Read comment on top of bio_associate_current().
- */
- struct io_context *bi_ioc;
- struct cgroup_subsys_state *bi_css;
- #endif
- #if defined(CONFIG_BLK_DEV_INTEGRITY)
- struct bio_integrity_payload *bi_integrity; /* data integrity */
- #endif
- /*
- * Everything starting with bi_max_vecs will be preserved by bio_reset()
- */
- unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
- atomic_t bi_cnt; /* pin count */
- struct bio_vec *bi_io_vec; /* the actual vec list */
- struct bio_set *bi_pool;
- /*
- * We can inline a number of vecs at the end of the bio, to avoid
- * double allocations for a small number of bio_vecs. This member
- * MUST obviously be kept at the very end of the bio.
- */
- struct bio_vec bi_inline_vecs[0];
- };
bi_sector:该I/O操作的起始扇区号。
bi_rw:指明了读写方向。
bi_vcnt:该I/O操作中涉及到了多少个缓存向量,每个缓存向量由[page,offset,len]来描述。
bi_idx:指示当前的缓存向量。
bi_io_vec:缓存向量数组。
7 bio_vec
bio_vec表示一次bio涉及到的数据片段(segment),由所在内存页地址,长度,偏移地址等定位。一次bio一般包含多个segment。- struct bio_vec {
- struct page *bv_page;
- unsigned int bv_len;
- unsigned int bv_offset;
- };
8 request
块设备层IO等待请求(pending I/O request)。内核中的bio请求在经过io调度排序后进入块设备层,会尝试合并到已有的request。bio结构中的bi_next将同一个request中的bio串成一个链表,request的bio/biotail域分别指向该链表的首尾。- struct request {
- struct list_head queuelist;
- struct call_single_data csd;
- struct request_queue *q;
- unsigned int cmd_flags;
- enum rq_cmd_type_bits cmd_type;
- unsigned long atomic_flags;
- int cpu;
- /* the following two fields are internal, NEVER access directly */
- unsigned int __data_len; /* total data len */
- sector_t __sector; /* sector cursor */
- struct bio *bio;
- struct bio *biotail;
- struct hlist_node hash; /* merge hash */
- /*
- * The rb_node is only used inside the io scheduler, requests
- * are pruned when moved to the dispatch queue. So let the
- * completion_data share space with the rb_node.
- */
- union {
- struct rb_node rb_node; /* sort/lookup */
- void *completion_data;
- };
- /*
- * Three pointers are available for the IO schedulers, if they need
- * more they have to dynamically allocate it. Flush requests are
- * never put on the IO scheduler. So let the flush fields share
- * space with the elevator data.
- */
- union {
- struct {
- struct io_cq *icq;
- void *priv[2];
- } elv;
- struct {
- unsigned int seq;
- struct list_head list;
- rq_end_io_fn *saved_end_io;
- } flush;
- };
- struct gendisk *rq_disk;
- struct hd_struct *part;
- unsigned long start_time;
- #ifdef CONFIG_BLK_CGROUP
- struct request_list *rl; /* rl this rq is alloced from */
- unsigned long long start_time_ns;
- unsigned long long io_start_time_ns; /* when passed to hardware */
- #endif
- /* Number of scatter-gather DMA addr+len pairs after
- * physical address coalescing is performed.
- */
- unsigned short nr_phys_segments;
- #if defined(CONFIG_BLK_DEV_INTEGRITY)
- unsigned short nr_integrity_segments;
- #endif
- unsigned short ioprio;
- int ref_count;
- void *special; /* opaque pointer available for LLD use */
- char *buffer; /* kaddr of the current segment if available */
- int tag;
- int errors;
- /*
- * when request is used as a packet command carrier
- */
- unsigned char __cmd[BLK_MAX_CDB];
- unsigned char *cmd;
- unsigned short cmd_len;
- unsigned int extra_len; /* length of alignment and padding */
- unsigned int sense_len;
- unsigned int resid_len; /* residual count */
- void *sense;
- unsigned long deadline;
- struct list_head timeout_list;
- unsigned int timeout;
- int retries;
- /*
- * completion callback.
- */
- rq_end_io_fn *end_io;
- void *end_io_data;
- /* for bidi */
- struct request *next_rq;
- };
queuelist:用于将request链入请求队列的链表元素。
q:指向所属的请求队列。
__sector:下一个要传输的bio的起始扇区号。
__data_len:request要传输的数据字节数。
bio,biotail:用于维护request中的bio链表。
9 request_queue
request_queue维护块设备层IO请求队列,队列中包含多个request。request_queue同时定义了处理队列的函数接口,不同的设备注册时需要实现这些IO处理接口。- struct request_queue {
- /*
- * Together with queue_head for cacheline sharing
- */
- struct list_head queue_head;
- struct request *last_merge;
- struct elevator_queue *elevator;
- int nr_rqs[2]; /* # allocated [a]sync rqs */
- int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
- /*
- * If blkcg is not used, @q->root_rl serves all requests. If blkcg
- * is used, root blkg allocates from @q->root_rl and all other
- * blkgs from their own blkg->rl. Which one to use should be
- * determined using bio_request_list().
- */
- struct request_list root_rl;
- request_fn_proc *request_fn;
- make_request_fn *make_request_fn;
- prep_rq_fn *prep_rq_fn;
- unprep_rq_fn *unprep_rq_fn;
- merge_bvec_fn *merge_bvec_fn;
- softirq_done_fn *softirq_done_fn;
- rq_timed_out_fn *rq_timed_out_fn;
- dma_drain_needed_fn *dma_drain_needed;
- lld_busy_fn *lld_busy_fn;
- /*
- * Dispatch queue sorting
- */
- sector_t end_sector;
- struct request *boundary_rq;
- /*
- * Delayed queue handling
- */
- struct delayed_work delay_work;
- struct backing_dev_info backing_dev_info;
- /*
- * The queue owner gets to use this for whatever they like.
- * ll_rw_blk doesn't touch it.
- */
- void *queuedata;
- /*
- * various queue flags, see QUEUE_* below
- */
- unsigned long queue_flags;
- /*
- * ida allocated id for this queue. Used to index queues from
- * ioctx.
- */
- int id;
- /*
- * queue needs bounce pages for pages above this limit
- */
- gfp_t bounce_gfp;
- /*
- * protects queue structures from reentrancy. ->__queue_lock should
- * _never_ be used directly, it is queue private. always use
- * ->queue_lock.
- */
- spinlock_t __queue_lock;
- spinlock_t *queue_lock;
- /*
- * queue kobject
- */
- struct kobject kobj;
- #ifdef CONFIG_PM_RUNTIME
- struct device *dev;
- int rpm_status;
- unsigned int nr_pending;
- #endif
- /*
- * queue settings
- */
- unsigned long nr_requests; /* Max # of requests */
- unsigned int nr_congestion_on;
- unsigned int nr_congestion_off;
- unsigned int nr_batching;
- unsigned int dma_drain_size;
- void *dma_drain_buffer;
- unsigned int dma_pad_mask;
- unsigned int dma_alignment;
- struct blk_queue_tag *queue_tags;
- struct list_head tag_busy_list;
- unsigned int nr_sorted;
- unsigned int in_flight[2];
- /*
- * Number of active block driver functions for which blk_drain_queue()
- * must wait. Must be incremented around functions that unlock the
- * queue_lock internally, e.g. scsi_request_fn().
- */
- unsigned int request_fn_active;
- unsigned int rq_timeout;
- struct timer_list timeout;
- struct list_head timeout_list;
- struct list_head icq_list;
- #ifdef CONFIG_BLK_CGROUP
- DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
- struct blkcg_gq *root_blkg;
- struct list_head blkg_list;
- #endif
- struct queue_limits limits;
- /*
- * sg stuff
- */
- unsigned int sg_timeout;
- unsigned int sg_reserved_size;
- int node;
- #ifdef CONFIG_BLK_DEV_IO_TRACE
- struct blk_trace *blk_trace;
- #endif
- /*
- * for flush operations
- */
- unsigned int flush_flags;
- unsigned int flush_not_queueable:1;
- unsigned int flush_queue_delayed:1;
- unsigned int flush_pending_idx:1;
- unsigned int flush_running_idx:1;
- unsigned long flush_pending_since;
- struct list_head flush_queue[2];
- struct list_head flush_data_in_flight;
- struct request flush_rq;
- struct mutex sysfs_lock;
- int bypass_depth;
- #if defined(CONFIG_BLK_DEV_BSG)
- bsg_job_fn *bsg_job_fn;
- int bsg_job_size;
- struct bsg_class_device bsg_dev;
- #endif
- #ifdef CONFIG_BLK_CGROUP
- struct list_head all_q_node;
- #endif
- #ifdef CONFIG_BLK_DEV_THROTTLING
- /* Throttle data */
- struct throtl_data *td;
- #endif
- struct rcu_head rcu_head;
- };
二 主要函数
1 submit_bh
submit_bh是内核发送IO请求给块设备的函数,目前较新版本的内核中该函数会调用submit_bio执行实际请求。- int submit_bh(int rw, struct buffer_head *bh)
- {
- return _submit_bh(rw, bh, 0);
- }
- int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
- {
- struct bio *bio;
- int ret = 0;
- BUG_ON(!buffer_locked(bh));
- BUG_ON(!buffer_mapped(bh));
- BUG_ON(!bh->b_end_io);
- BUG_ON(buffer_delay(bh));
- BUG_ON(buffer_unwritten(bh));
- /*
- * Only clear out a write error when rewriting
- */
- if (test_set_buffer_req(bh) && (rw & WRITE))
- clear_buffer_write_io_error(bh);
- /*
- * from here on down, it's all bio -- do the initial mapping,
- * submit_bio -> generic_make_request may further map this bio around
- */
- bio = bio_alloc(GFP_NOIO, 1);
- bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
- bio->bi_bdev = bh->b_bdev;
- bio->bi_io_vec[0].bv_page = bh->b_page;
- bio->bi_io_vec[0].bv_len = bh->b_size;
- bio->bi_io_vec[0].bv_offset = bh_offset(bh);
- bio->bi_vcnt = 1;
- bio->bi_size = bh->b_size;
- bio->bi_end_io = end_bio_bh_io_sync;
- bio->bi_private = bh;
- bio->bi_flags |= bio_flags;
- /* Take care of bh's that straddle the end of the device */
- guard_bh_eod(rw, bio, bh);
- if (buffer_meta(bh))
- rw |= REQ_META;
- if (buffer_prio(bh))
- rw |= REQ_PRIO;
- bio_get(bio);
- submit_bio(rw, bio);
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
- bio_put(bio);
- return ret;
- }
2 submit_bio
submit_bio函数会调用generic_make_request执行实际的bio请求。- void submit_bio(int rw, struct bio *bio)
- {
- bio->bi_rw |= rw;
- /*
- * If it's a regular read/write or a barrier with data attached,
- * go through the normal accounting stuff before submission.
- */
- if (bio_has_data(bio)) {
- unsigned int count;
- if (unlikely(rw & REQ_WRITE_SAME))
- count = bdev_logical_block_size(bio->bi_bdev) >> 9;
- else
- count = bio_sectors(bio);
- if (rw & WRITE) {
- count_vm_events(PGPGOUT, count);
- } else {
- task_io_account_read(bio->bi_size);
- count_vm_events(PGPGIN, count);
- }
- if (unlikely(block_dump)) {
- char b[BDEVNAME_SIZE];
- printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
- current->comm, task_pid_nr(current),
- (rw & WRITE) ? "WRITE" : "READ",
- (unsigned long long)bio->bi_sector,
- bdevname(bio->bi_bdev, b),
- count);
- }
- }
- generic_make_request(bio);
- }
3 generic_make_request
generic_make_request则循环处理bio链表。最终调用request_queue中的make_request_fn处理函数处理实际的IO请求。
- void generic_make_request(struct bio *bio)
- {
- struct bio_list bio_list_on_stack;
- if (!generic_make_request_checks(bio))
- return;
- /*
- * We only want one ->make_request_fn to be active at a time, else
- * stack usage with stacked devices could be a problem. So use
- * current->bio_list to keep a list of requests submited by a
- * make_request_fn function. current->bio_list is also used as a
- * flag to say if generic_make_request is currently active in this
- * task or not. If it is NULL, then no make_request is active. If
- * it is non-NULL, then a make_request is active, and new requests
- * should be added at the tail
- */
- if (current->bio_list) {
- bio_list_add(current->bio_list, bio);
- return;
- }
- /* following loop may be a bit non-obvious, and so deserves some
- * explanation.
- * Before entering the loop, bio->bi_next is NULL (as all callers
- * ensure that) so we have a list with a single bio.
- * We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
- * thus initialising the bio_list of new bios to be
- * added. ->make_request() may indeed add some more bios
- * through a recursive call to generic_make_request. If it
- * did, we find a non-NULL value in bio_list and re-enter the loop
- * from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so remove it from
- * bio_list, and call into ->make_request() again.
- */
- BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack);
- current->bio_list = &bio_list_on_stack;
- do {
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- q->make_request_fn(q, bio);
- bio = bio_list_pop(current->bio_list);
- } while (bio);
- current->bio_list = NULL; /* deactivate */
- }
4 make_request_fn
make_request_fn是与具体block设备相关的函数,mmc子系统初始化为blk_queue_bio。mmc_blk_probe()->mmc_blk_alloc()->mmc_blk_alloc_req()->mmc_init_queue()->blk_init_queue()->blk_init_queue_node()->blk_init_allocated_queue()->blk_queue_make_request(q, blk_queue_bio)。这里就用到了IO调度算法,新的bio可能被合并到request_queue中已有的request结构中(甚至合并到已有的bio中),或者生成新的request,经过这里bio就变成了request。- void blk_queue_bio(struct request_queue *q, struct bio *bio)
- {
- const bool sync = !!(bio->bi_rw & REQ_SYNC);
- struct blk_plug *plug;
- int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
- struct request *req;
- unsigned int request_count = 0;
- /*
- * low level driver can indicate that it wants pages above a
- * certain limit bounced to low memory (ie for highmem, or even
- * ISA dma in theory)
- */
- blk_queue_bounce(q, &bio);
- if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
- bio_endio(bio, -EIO);
- return;
- }
- if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
- spin_lock_irq(q->queue_lock);
- where = ELEVATOR_INSERT_FLUSH;
- goto get_rq;
- }
- /*
- * Check if we can merge with the plugged list before grabbing
- * any locks.
- */
- if (attempt_plug_merge(q, bio, &request_count))
- return;
- spin_lock_irq(q->queue_lock);
- el_ret = elv_merge(q, &req, bio);
- if (el_ret == ELEVATOR_BACK_MERGE) {
- if (bio_attempt_back_merge(q, req, bio)) {
- elv_bio_merged(q, req, bio);
- if (!attempt_back_merge(q, req))
- elv_merged_request(q, req, el_ret);
- goto out_unlock;
- }
- } else if (el_ret == ELEVATOR_FRONT_MERGE) {
- if (bio_attempt_front_merge(q, req, bio)) {
- elv_bio_merged(q, req, bio);
- if (!attempt_front_merge(q, req))
- elv_merged_request(q, req, el_ret);
- goto out_unlock;
- }
- }
- get_rq:
- /*
- * This sync check and mask will be re-done in init_request_from_bio(),
- * but we need to set it earlier to expose the sync flag to the
- * rq allocator and io schedulers.
- */
- rw_flags = bio_data_dir(bio);
- if (sync)
- rw_flags |= REQ_SYNC;
- /*
- * Grab a free request. This is might sleep but can not fail.
- * Returns with the queue unlocked.
- */
- req = get_request(q, rw_flags, bio, GFP_NOIO);
- if (unlikely(!req)) {
- bio_endio(bio, -ENODEV); /* @q is dead */
- goto out_unlock;
- }
- /*
- * After dropping the lock and possibly sleeping here, our request
- * may now be mergeable after it had proven unmergeable (above).
- * We don't worry about that case for efficiency. It won't happen
- * often, and the elevators are able to handle it.
- */
- init_request_from_bio(req, bio);
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
- req->cpu = raw_smp_processor_id();
- plug = current->plug;
- if (plug) {
- /*
- * If this is the first request added after a plug, fire
- * of a plug trace. If others have been added before, check
- * if we have multiple devices in this plug. If so, make a
- * note to sort the list before dispatch.
- */
- if (list_empty(&plug->list))
- trace_block_plug(q);
- else {
- if (request_count >= BLK_MAX_REQUEST_COUNT) {
- blk_flush_plug_list(plug, false);
- trace_block_plug(q);
- }
- }
- list_add_tail(&req->queuelist, &plug->list);
- drive_stat_acct(req, 1);
- } else {
- spin_lock_irq(q->queue_lock);
- add_acct_request(q, req, where);
- __blk_run_queue(q);
- out_unlock:
- spin_unlock_irq(q->queue_lock);
- }
- }
5 blk_init_queue
- struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
- {
- return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
- }
分配和初始化请求队列,q->request_fn = mmc_request_fn和blk_queue_make_request(q, blk_queue_bio)就是在这里初始化的。
三 块设备驱动开发
1 分配主设备号
- int register_blkdev(unsigned int major, const char *name)
- {
- struct blk_major_name **n, *p;
- int index, ret = 0;
- mutex_lock(&block_class_lock);
- /* temporary */
- if (major == 0) {
- for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) {
- if (major_names[index] == NULL)
- break;
- }
- if (index == 0) {
- printk("register_blkdev: failed to get major for %s\n",
- name);
- ret = -EBUSY;
- goto out;
- }
- major = index;
- ret = major;
- }
- p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL);
- if (p == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- p->major = major;
- strlcpy(p->name, name, sizeof(p->name));
- p->next = NULL;
- index = major_to_index(major);
- for (n = &major_names[index]; *n; n = &(*n)->next) {
- if ((*n)->major == major)
- break;
- }
- if (!*n)
- *n = p;
- else
- ret = -EBUSY;
- if (ret < 0) {
- printk("register_blkdev: cannot get major %d for %s\n",
- major, name);
- kfree(p);
- }
- out:
- mutex_unlock(&block_class_lock);
- return ret;
- }
MMC_BLOCK_MAJOR是内核预留给mmc block设备的主设备号;我们自己开发的block设备没有预留的主设备号,通常都把major设置为0,这样内核会给我们分配一个没被占用的设备号。怎么找到的呢?其实这个设备号动态生成的规则非常简单:内核维护了一个类型为struct blk_major_name的hash表major_names,当major为0时,register_blkdev()从major_names数组的末尾向前查找第一个为空的表项,其下标就是分配到的主设备号;如果一直找到下标0都没有空项,则返回-EBUSY。
- static struct blk_major_name {
- struct blk_major_name *next;
- int major;
- char name[16];
- } *major_names[BLKDEV_MAJOR_HASH_SIZE];
2 创建和初始化gendisk结构
- struct gendisk *alloc_disk(int minors);
gendisk数据结构创建好后,可以自行设置major,first_minor,fops,queue等成员,并通过set_capacity来设置驱动支持的最大扇区数。
md->disk->major = MMC_BLOCK_MAJOR;
md->disk->first_minor = devidx * perdev_minors;
md->disk->fops = &mmc_bdops;
md->disk->private_data = md;
md->disk->queue = md->queue.queue;
set_capacity(md->disk, size);
这里单独再说下关于queue成员的创建,request_queue的创建主要是初始化它的2个重要的成员函数,分别是make_request_fn/request_fn。
blk_queue_make_request(q, blk_queue_bio);//blk_queue_bio
mq->queue = blk_init_queue(mmc_request_fn, lock);//mmc_request_fn
3 向内核添加磁盘
- void add_disk(struct gendisk *disk)
- {
- struct backing_dev_info *bdi;
- dev_t devt;
- int retval;
- /* minors == 0 indicates to use ext devt from part0 and should
- * be accompanied with EXT_DEVT flag. Make sure all
- * parameters make sense.
- */
- WARN_ON(disk->minors && !(disk->major || disk->first_minor));
- WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
- disk->flags |= GENHD_FL_UP;
- retval = blk_alloc_devt(&disk->part0, &devt);
- if (retval) {
- WARN_ON(1);
- return;
- }
- disk_to_dev(disk)->devt = devt;
- /* ->major and ->first_minor aren't supposed to be
- * dereferenced from here on, but set them just in case.
- */
- disk->major = MAJOR(devt);
- disk->first_minor = MINOR(devt);
- disk_alloc_events(disk);
- /* Register BDI before referencing it from bdev */
- bdi = &disk->queue->backing_dev_info;
- bdi_register_dev(bdi, disk_devt(disk));
- blk_register_region(disk_devt(disk), disk->minors, NULL,
- exact_match, exact_lock, disk);
- register_disk(disk);
- blk_register_queue(disk);
- /*
- * Take an extra ref on queue which will be put on disk_release()
- * so that it sticks around as long as @disk is there.
- */
- WARN_ON_ONCE(!blk_get_queue(disk->queue));
- retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
- "bdi");
- WARN_ON(retval);
- disk_add_events(disk);
- }
四 IO调度
IO调度算法是一种电梯算法(elevator algorithm),目前主要有cfq/deadline/anticipatory/noop,其中cfq是Linux的默认策略;anticipatory在新的内核中已经放弃;deadline在大部分OLTP数据库应用中更具优势,IO的响应时间更稳定些;noop只对IO请求进行简单的合并,其他不干涉,类似于先来先服务;在FusionIO等IO性能很好的设备上,noop反而更具优势,所以FusionIO的驱动默认使用了noop。
代码位于block目录:调度器框架在block/elevator.c和include/linux/elevator.h中,具体算法以deadline为例,位于block/deadline-iosched.c。
1 主要数据结构
- struct elevator_type
- {
- /* managed by elevator core */
- struct kmem_cache *icq_cache;
- /* fields provided by elevator implementation */
- struct elevator_ops ops;
- size_t icq_size; /* see iocontext.h */
- size_t icq_align; /* ditto */
- struct elv_fs_entry *elevator_attrs;
- char elevator_name[ELV_NAME_MAX];
- struct module *elevator_owner;
- /* managed by elevator core */
- char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */
- struct list_head list;
- };
- struct elevator_queue
- {
- struct elevator_type *type;
- void *elevator_data;
- struct kobject kobj;
- struct mutex sysfs_lock;
- unsigned int registered:1;
- DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
- };
elevator_type对应一个调度器类型,elevator_queue对应一个调度器实例,如果内核中只有上述四种类型的调度器,则只有四个elevator_type;但是多个块设备(分区)可拥有多个相应调度器的实例,也就是elevator_queue。elevator_type中最关键的成员是struct elevator_ops(elevator_queue则通过type指针找到所属的elevator_type,进而访问其ops),该结构定义了一组操作函数,用来描述请求队列的相关算法,实现对请求的处理。
- struct elevator_ops
- {
- elevator_merge_fn *elevator_merge_fn;
- elevator_merged_fn *elevator_merged_fn;
- elevator_merge_req_fn *elevator_merge_req_fn;
- elevator_allow_merge_fn *elevator_allow_merge_fn;
- elevator_bio_merged_fn *elevator_bio_merged_fn;
- elevator_dispatch_fn *elevator_dispatch_fn;
- elevator_add_req_fn *elevator_add_req_fn;
- elevator_activate_req_fn *elevator_activate_req_fn;
- elevator_deactivate_req_fn *elevator_deactivate_req_fn;
- elevator_completed_req_fn *elevator_completed_req_fn;
- elevator_request_list_fn *elevator_former_req_fn;
- elevator_request_list_fn *elevator_latter_req_fn;
- elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
- elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
- elevator_set_req_fn *elevator_set_req_fn;
- elevator_put_req_fn *elevator_put_req_fn;
- elevator_may_queue_fn *elevator_may_queue_fn;
- elevator_init_fn *elevator_init_fn;
- elevator_exit_fn *elevator_exit_fn;
- };
elevator_merge_req_fn将两个合并后的请求中多余的那个给删除。
elevator_dispatch_fn将调度器的队列最前面的元素取出,分派给request_queue中的请求队列以等候响应。
elevator_add_req_fn将一个新的request添加进调度器的队列。
elevator_set_req_fn和elevator_put_req_fn分别在创建新请求和将请求所占的空间释放到内存时调用。
elevator_init_fn用于初始化调度器实例。
2 elevator_init分配IO调度器elevator_queue
例如:mmc_blk_probe()->mmc_blk_alloc()->mmc_blk_alloc_req()->mmc_init_queue()->blk_init_queue()->blk_init_queue_node()->blk_init_allocated_queue()->elevator_init(q, NULL)
- int elevator_init(struct request_queue *q, char *name)
- {
- struct elevator_type *e = NULL;
- int err;
- if (unlikely(q->elevator))
- return 0;
- INIT_LIST_HEAD(&q->queue_head);
- q->last_merge = NULL;
- q->end_sector = 0;
- q->boundary_rq = NULL;
- if (name) {
- e = elevator_get(name, true);
- if (!e)
- return -EINVAL;
- }
- /*
- * Use the default elevator specified by config boot param or
- * config option. Don't try to load modules as we could be running
- * off async and request_module() isn't allowed from async.
- */
- if (!e && *chosen_elevator) {
- e = elevator_get(chosen_elevator, false);
- if (!e)
- printk(KERN_ERR "I/O scheduler %s not found\n",
- chosen_elevator);
- }
- if (!e) {
- e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
- if (!e) {
- printk(KERN_ERR
- "Default I/O scheduler not found. " \
- "Using noop.\n");
- e = elevator_get("noop", false);
- }
- }
- err = e->ops.elevator_init_fn(q, e);
- return 0;
- }
(1) 如果name指定了elevator_type。
用e = elevator_get(name, true);来找一个elevator_type。为什么能找到呢?看一下elevator_get()。
- static struct elevator_type *elevator_get(const char *name, bool try_loading)
- {
- struct elevator_type *e;
- spin_lock(&elv_list_lock);
- e = elevator_find(name);
- if (!e && try_loading) {
- spin_unlock(&elv_list_lock);
- request_module("%s-iosched", name);
- spin_lock(&elv_list_lock);
- e = elevator_find(name);
- }
- if (e && !try_module_get(e->elevator_owner))
- e = NULL;
- spin_unlock(&elv_list_lock);
- return e;
- }
- static struct elevator_type *elevator_find(const char *name)
- {
- struct elevator_type *e;
- list_for_each_entry(e, &elv_list, list) {
- if (!strcmp(e->elevator_name, name))
- return e;
- }
- return NULL;
- }
之所以能找到,是因为各调度器在初始化时已经通过elv_register()把自己的elevator_type挂到了elv_list链表上,例如deadline:deadline_init()->elv_register()->list_add_tail(&e->list, &elv_list)。
(2) name没有指定,用*chosen_elevator指定的elevator_type。
chosen_elevator存的也是elevator_type的name,是什么时候设置的呢?- static int __init elevator_setup(char *str)
- {
- /*
- * Be backwards-compatible with previous kernels, so users
- * won't get the wrong elevator.
- */
- strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
- return 1;
- }
- __setup("elevator=", elevator_setup);
__setup宏最终会定义一个obs_kernel_param类型的结构,并放在.init.setup段。
- struct obs_kernel_param {
- const char *str;
- int (*setup_func)(char *);
- int early;
- };
static const char __setup_str_elevator_setup[] __initconst
__aligned(1) = "elevator=";
static struct obs_kernel_param __setup_elevator_setup
__used __section(.init.setup)
__attribute__((aligned((sizeof(long)))))
= { __setup_str_elevator_setup, elevator_setup, 0}
start_kernel()->parse_args()->unknown_bootoption()->obsolete_checksetup()->(p->setup_func(line + n))
kernel处理启动参数时,将启动参数与__setup_start和__setup_end之间(就是.init.setup段的内容)的obs_kernel_param结构逐个比较,匹配上则调用其setup_func。__setup_start和__setup_end定义在include/asm-generic/vmlinux.lds.h中:
- #define INIT_SETUP(initsetup_align) \
- . = ALIGN(initsetup_align); \
- VMLINUX_SYMBOL(__setup_start) = .; \
- *(.init.setup) \
- VMLINUX_SYMBOL(__setup_end) = .;
early_param宏和__setup宏差不多,只是early=1。
- #define __setup(str, fn) \
- __setup_param(str, fn, fn, 0)
- /* NOTE: fn is as per module_param, not __setup! Emits warning if fn
- * returns non-zero. */
- #define early_param(str, fn) \
- __setup_param(str, fn, fn, 1)
do_early_param()中会判断p->early为1才会执行p->setup_func(val)。
(3) 前两个都没有指定,用CONFIG_DEFAULT_IOSCHED默认配置的elevator_type。
(4) 如果走到这里,那就只能用默认的noop调度器了。
(5) err = e->ops.elevator_init_fn(q, e)。
以deadline为例,.elevator_init_fn = deadline_init_queue:
- static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
- {
- struct deadline_data *dd;
- struct elevator_queue *eq;
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
- dd = kmalloc_node(sizeof(*dd), GFP_KERNEL | __GFP_ZERO, q->node);
- if (!dd) {
- kobject_put(&eq->kobj);
- return -ENOMEM;
- }
- eq->elevator_data = dd;
- INIT_LIST_HEAD(&dd->fifo_list[READ]);
- INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
- dd->sort_list[READ] = RB_ROOT;
- dd->sort_list[WRITE] = RB_ROOT;
- dd->fifo_expire[READ] = read_expire;
- dd->fifo_expire[WRITE] = write_expire;
- dd->writes_starved = writes_starved;
- dd->front_merges = 1;
- dd->fifo_batch = fifo_batch;
- spin_lock_irq(q->queue_lock);
- q->elevator = eq;
- spin_unlock_irq(q->queue_lock);
- return 0;
- }
(a) 调用elevator_alloc(q, e)分配调度器实例elevator_queue(eq)。
(b) 分配和初始化deadline调度器的私有数据deadline_data。
(c) request_queue(q)、elevator_queue(eq)、deadline_data(dd)之间的关联,
eq->elevator_data = dd;
q->elevator = eq;