Linux那些事儿之我是Block层 (7): A SCSI Command's Past and Present Lives (Part 1)

Now that we have a block device and we have a request queue, we can finally start submitting requests. So let's look at how a request gets submitted and how it gets processed. But fair warning up front: we are not going to bother with all the messy error-handling code.

Taking a SCSI disk as the example again: what the SCSI layer originally issues is a SCSI command, but to travel through the block layer it has to become a request, and once it reaches usb-storage on the other side it has to turn back into a SCSI command. In other words, over the course of this journey the SCSI command changes form twice.

Let's start with a function that sd uses all the time, and watch how, in broad daylight, a SCSI command gets swapped out for a request. That function is scsi_execute_req(), from drivers/scsi/scsi_lib.c:

    216 int scsi_execute_req(struct scsi_device *sdev, const unsigned char *cmd,

    217                      int data_direction, void *buffer, unsigned bufflen,

    218                      struct scsi_sense_hdr *sshdr, int timeout, int retries)

    219 {

    220         char *sense = NULL;

    221         int result;

    222

    223         if (sshdr) {

    224                 sense = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_NOIO);

    225                 if (!sense)

    226                         return DRIVER_ERROR << 24;

    227         }

    228         result = scsi_execute(sdev, cmd, data_direction, buffer, bufflen,

    229                               sense, timeout, retries, 0);

    230         if (sshdr)

    231                 scsi_normalize_sense(sense, SCSI_SENSE_BUFFERSIZE, sshdr);

    232

    233         kfree(sense);

    234         return result;

    235 }
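
Before moving on, here is a minimal sketch (not from the kernel source) of how an sd-style caller might use this function: issuing a TEST UNIT READY, a 6-byte CDB with no data transfer, and letting scsi_execute_req() take care of all the request plumbing. The timeout and retry values below are made up for illustration; the real sd code uses its own constants.

    #include <linux/dma-mapping.h>  /* DMA_NONE */
    #include <scsi/scsi.h>          /* TEST_UNIT_READY */
    #include <scsi/scsi_device.h>   /* scsi_execute_req() */
    #include <scsi/scsi_eh.h>       /* struct scsi_sense_hdr */

    static int demo_test_unit_ready(struct scsi_device *sdev)
    {
            unsigned char cmd[6] = { TEST_UNIT_READY, 0, 0, 0, 0, 0 };
            struct scsi_sense_hdr sshdr;

            /* no data moves, so the direction is DMA_NONE and buffer is NULL;
             * the timeout and retry count here are purely illustrative */
            return scsi_execute_req(sdev, cmd, DMA_NONE, NULL, 0, &sshdr,
                                    30 * HZ, 3);
    }

If the result is nonzero, the sense data left behind in sshdr tells the caller why the device complained.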

The one function worth paying close attention to here is scsi_execute(), from the same file.

    164 /**

    165  * scsi_execute - insert request and wait for the result

    166  * @sdev:       scsi device

    167  * @cmd:        scsi command

    168  * @data_direction: data direction

    169  * @buffer:     data buffer

    170  * @bufflen:    len of buffer

    171  * @sense:      optional sense buffer

    172  * @timeout:    request timeout in seconds

    173  * @retries:    number of times to retry request

    174  * @flags:      or into request flags;

    175  *

    176  * returns the req->errors value which is the scsi_cmnd result

    177  * field.

    178  **/

    179 int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,

    180                  int data_direction, void *buffer, unsigned bufflen,

    181                  unsigned char *sense, int timeout, int retries, int flags)

    182 {

    183         struct request *req;

    184         int write = (data_direction == DMA_TO_DEVICE);

    185         int ret = DRIVER_ERROR << 24;

    186

    187         req = blk_get_request(sdev->request_queue, write, __GFP_WAIT);

    188

    189         if (bufflen &&  blk_rq_map_kern(sdev->request_queue, req,

    190                                         buffer, bufflen, __GFP_WAIT))

    191                 goto out;

    192

    193         req->cmd_len = COMMAND_SIZE(cmd[0]);

    194         memcpy(req->cmd, cmd, req->cmd_len);

    195         req->sense = sense;

    196         req->sense_len = 0;

    197         req->retries = retries;

    198         req->timeout = timeout;

    199         req->cmd_type = REQ_TYPE_BLOCK_PC;

    200         req->cmd_flags |= flags | REQ_QUIET | REQ_PREEMPT;

    201

    202         /*

    203          * head injection *required* here otherwise quiesce won't work

    204          */

    205         blk_execute_rq(req->q, NULL, req, 1);

    206

    207         ret = req->errors;

    208  out:

    209         blk_put_request(req);

    210

    211         return ret;

    212 }

The first thing scsi_execute() calls is blk_get_request(), from block/ll_rw_blk.c:

   2215 struct request *blk_get_request(request_queue_t *q, int rw, gfp_t gfp_mask)

   2216 {

   2217         struct request *rq;

   2218

   2219         BUG_ON(rw != READ && rw != WRITE);

   2220

   2221         spin_lock_irq(q->queue_lock);

   2222         if (gfp_mask & __GFP_WAIT) {

   2223                 rq = get_request_wait(q, rw, NULL);

   2224         } else {

   2225                 rq = get_request(q, rw, NULL, gfp_mask);

   2226                 if (!rq)

   2227                         spin_unlock_irq(q->queue_lock);

   2228         }

   2229         /* q->queue_lock is unlocked at this point */

   2230

   2231         return rq;

   2232 }

Note that the gfp_mask we passed in (the third argument, see line 187 of scsi_execute) really is __GFP_WAIT, so line 2223 is the path taken. get_request_wait() comes from the same file:
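
Because the caller is prepared to sleep, get_request_wait() is built on the kernel's standard wait-queue idiom: try, queue yourself exclusively on a wait queue, sleep, and retry after being woken. A standalone sketch of that idiom (the wait queue head and try_alloc() callback here are invented; this is not code from this path):

    #include <linux/wait.h>
    #include <linux/sched.h>

    static DECLARE_WAIT_QUEUE_HEAD(demo_wq);

    static void *demo_alloc_or_sleep(void *(*try_alloc)(void))
    {
            void *obj;

            while (!(obj = try_alloc())) {
                    DEFINE_WAIT(wait);

                    /* exclusive: one wake-up releases only one sleeper */
                    prepare_to_wait_exclusive(&demo_wq, &wait,
                                              TASK_UNINTERRUPTIBLE);
                    obj = try_alloc();      /* re-check before really sleeping */
                    if (!obj)
                            schedule();
                    finish_wait(&demo_wq, &wait);
            }
            return obj;
    }

The real function below has exactly this shape, with get_request() as the "try" step, io_schedule() as the sleep, and some batching bookkeeping sprinkled in: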

   2173 static struct request *get_request_wait(request_queue_t *q, int rw_flags,

   2174                                         struct bio *bio)

   2175 {

   2176         const int rw = rw_flags & 0x01;

   2177         struct request *rq;

   2178

   2179         rq = get_request(q, rw_flags, bio, GFP_NOIO);

   2180         while (!rq) {

   2181                 DEFINE_WAIT(wait);

   2182                 struct request_list *rl = &q->rq;

   2183

   2184                 prepare_to_wait_exclusive(&rl->wait[rw], &wait,

   2185                                 TASK_UNINTERRUPTIBLE);

   2186

   2187                 rq = get_request(q, rw_flags, bio, GFP_NOIO);

   2188

   2189                 if (!rq) {

   2190                         struct io_context *ioc;

   2191

   2192                         blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);

   2193

   2194                         __generic_unplug_device(q);

   2195                         spin_unlock_irq(q->queue_lock);

   2196                         io_schedule();

   2197

   2198                         /*

   2199                          * After sleeping, we become a "batching" process and

   2200                          * will be able to allocate at least one request, and

   2201                          * up to a big batch of them for a small period time.

   2202                          * See ioc_batching, ioc_set_batching

   2203                          */

   2204                         ioc = current_io_context(GFP_NOIO, q->node);

   2205                         ioc_set_batching(q, ioc);

   2206

   2207                         spin_lock_irq(q->queue_lock);

   2208                 }

   2209                 finish_wait(&rl->wait[rw], &wait);

   2210         }

   2211

   2212         return rq;

   2213 }

And the one that does the real work is get_request(), again from the same file.

   2063 /*

   2064  * Get a free request, queue_lock must be held.

   2065  * Returns NULL on failure, with queue_lock held.

   2066  * Returns !NULL on success, with queue_lock *not held*.

   2067  */

   2068 static struct request *get_request(request_queue_t *q, int rw_flags,

   2069                                    struct bio *bio, gfp_t gfp_mask)

   2070 {

   2071         struct request *rq = NULL;

   2072         struct request_list *rl = &q->rq;

   2073         struct io_context *ioc = NULL;

   2074         const int rw = rw_flags & 0x01;

   2075         int may_queue, priv;

   2076

   2077         may_queue = elv_may_queue(q, rw_flags);

   2078         if (may_queue == ELV_MQUEUE_NO)

   2079                 goto rq_starved;

   2080

   2081         if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {

   2082                 if (rl->count[rw]+1 >= q->nr_requests) {

   2083                         ioc = current_io_context(GFP_ATOMIC, q->node);

   2084                         /*

   2085                          * The queue will fill after this allocation, so set

   2086                          * it as full, and mark this process as "batching".

   2087                          * This process will be allowed to complete a batch of

   2088                          * requests, others will be blocked.

   2089                          */

   2090                         if (!blk_queue_full(q, rw)) {

   2091                                 ioc_set_batching(q, ioc);

   2092                                 blk_set_queue_full(q, rw);

   2093                         } else {

   2094                                 if (may_queue != ELV_MQUEUE_MUST

   2095                                                 && !ioc_batching(q, ioc)) {

   2096                                         /*

   2097                                          * The queue is full and the allocating

   2098                                          * process is not a "batcher", and not

   2099                                          * exempted by the IO scheduler

   2100                                          */

   2101                                         goto out;

   2102                                 }

   2103                         }

   2104                 }

   2105                 blk_set_queue_congested(q, rw);

   2106         }

   2107

   2108         /*

   2109          * Only allow batching queuers to allocate up to 50% over the defined

   2110          * limit of requests, otherwise we could have thousands of requests

   2111          * allocated with any setting of ->nr_requests

   2112          */

   2113         if (rl->count[rw] >= (3 * q->nr_requests / 2))

   2114                 goto out;

   2115

   2116         rl->count[rw]++;

   2117         rl->starved[rw] = 0;

   2118

   2119         priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);

   2120         if (priv)

   2121                 rl->elvpriv++;

   2122

   2123         spin_unlock_irq(q->queue_lock);

   2124

   2125         rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);

   2126         if (unlikely(!rq)) {

   2127                 /*

   2128                  * Allocation failed presumably due to memory. Undo anything

   2129                  * we might have messed up.

   2130                  *

   2131                  * Allocating task should really be put onto the front of the

   2132                  * wait queue, but this is pretty rare.

   2133                  */

   2134                 spin_lock_irq(q->queue_lock);

   2135                 freed_request(q, rw, priv);

   2136

   2137                 /*

   2138                  * in the very unlikely event that allocation failed and no

   2139                  * requests for this direction was pending, mark us starved

   2140                  * so that freeing of a request in the other direction will

   2141                  * notice us. another possible fix would be to split the

   2142                  * rq mempool into READ and WRITE

   2143                  */

   2144 rq_starved:

   2145                 if (unlikely(rl->count[rw] == 0))

   2146                         rl->starved[rw] = 1;

   2147

   2148                 goto out;

   2149         }

   2150

   2151         /*

   2152          * ioc may be NULL here, and ioc_batching will be false. That's

   2153          * OK, if the queue is under the request limit then requests need

   2154          * not count toward the nr_batch_requests limit. There will always

   2155          * be some limit enforced by BLK_BATCH_TIME.

   2156          */

   2157         if (ioc_batching(q, ioc))

   2158                 ioc->nr_batch_requests--;

   2159

   2160         rq_init(q, rq);

   2161

   2162         blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);

   2163 out:

   2164         return rq;

   2165 }

elv_may_queue() comes from block/elevator.c:

    848 int elv_may_queue(request_queue_t *q, int rw)

    849 {

    850         elevator_t *e = q->elevator;

    851

    852         if (e->ops->elevator_may_queue_fn)

    853                 return e->ops->elevator_may_queue_fn(q, rw);

    854

    855         return ELV_MQUEUE_MAY;

    856 }

The elevator_t that belongs to us is the one we allocated back in elevator_init() by calling elevator_alloc(). Its ops obviously depend on which elevator (I/O scheduler) we chose. To keep things simple, let's make the most shameless choice of all: "noop", the simplest and most primitive scheduler there is. Here is its elevator_type once more:

     87 static struct elevator_type elevator_noop = {

     88         .ops = {

     89                 .elevator_merge_req_fn          = noop_merged_requests,

     90                 .elevator_dispatch_fn           = noop_dispatch,

     91                 .elevator_add_req_fn            = noop_add_request,

     92                 .elevator_queue_empty_fn        = noop_queue_empty,

     93                 .elevator_former_req_fn         = noop_former_request,

     94                 .elevator_latter_req_fn         = noop_latter_request,

     95                 .elevator_init_fn               = noop_init_queue,

     96                 .elevator_exit_fn               = noop_exit_queue,

     97         },

     98         .elevator_name = "noop",

     99         .elevator_owner = THIS_MODULE,

    100 };
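
Incidentally, this type only becomes selectable because noop hands it to the elevator core at module init time; from block/noop-iosched.c of the same era, roughly (paraphrased):

    static int __init noop_init(void)
    {
            elv_register(&elevator_noop);
            return 0;
    }

On a running system the same choice can be made with the elevator=noop boot parameter or through /sys/block/<dev>/queue/scheduler.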

Feeling happy yet? For the noop elevator we picked, elevator_may_queue_fn is not even defined, so elv_may_queue() simply falls through and returns ELV_MQUEUE_MAY. Shameless, perhaps, but taking the lowest road here keeps the walkthrough a lot easier to follow.

Carrying the return value ELV_MQUEUE_MAY, we come back to get_request(). And what is rl? At line 2072 we pointed it at q->rq. At this critical juncture I have no choice but to haul out a big, complicated structure: request_queue, also known as request_queue_t, defined in include/linux/blkdev.h:

     38 struct request_queue;

     39 typedef struct request_queue request_queue_t;

    360 struct request_queue

    361 {

    362         /*

    363          * Together with queue_head for cacheline sharing

    364          */

    365         struct list_head        queue_head;

    366         struct request          *last_merge;

    367         elevator_t              *elevator;

    368

    369         /*

    370          * the queue request freelist, one for reads and one for writes

    371          */

    372         struct request_list     rq;

    373

    374         request_fn_proc         *request_fn;

    375         make_request_fn         *make_request_fn;

    376         prep_rq_fn              *prep_rq_fn;

    377         unplug_fn               *unplug_fn;

    378         merge_bvec_fn           *merge_bvec_fn;

    379         issue_flush_fn          *issue_flush_fn;

    380         prepare_flush_fn        *prepare_flush_fn;

    381         softirq_done_fn         *softirq_done_fn;

    382

    383         /*

    384          * Dispatch queue sorting

    385          */

    386         sector_t                end_sector;

    387         struct request          *boundary_rq;

    388

    389         /*

    390          * Auto-unplugging state

    391          */

    392         struct timer_list       unplug_timer;

    393         int                     unplug_thresh;  /* After this many requests */

    394         unsigned long           unplug_delay;   /* After this many jiffies */

    395         struct work_struct      unplug_work;

    396

    397         struct backing_dev_info backing_dev_info;

    398

    399         /*

    400          * The queue owner gets to use this for whatever they like.

    401          * ll_rw_blk doesn't touch it.

    402          */

    403         void                    *queuedata;

    404

    405         /*

    406          * queue needs bounce pages for pages above this limit

    407          */

    408         unsigned long           bounce_pfn;

    409         gfp_t                   bounce_gfp;

    410

    411         /*

    412          * various queue flags, see QUEUE_* below

    413          */

    414         unsigned long           queue_flags;

    415

    416         /*

    417          * protects queue structures from reentrancy. ->__queue_lock should

    418          * _never_ be used directly, it is queue private. always use

    419          * ->queue_lock.

    420          */

    421         spinlock_t              __queue_lock;

    422         spinlock_t              *queue_lock;

    423

    424         /*

    425          * queue kobject

    426          */

    427         struct kobject kobj;

    428

    429         /*

    430          * queue settings

    431          */

    432         unsigned long           nr_requests;    /* Max # of requests */

    433         unsigned int            nr_congestion_on;

    434         unsigned int            nr_congestion_off;

    435         unsigned int            nr_batching;

    436

    437         unsigned int            max_sectors;

    438         unsigned int            max_hw_sectors;

    439         unsigned short          max_phys_segments;

    440         unsigned short          max_hw_segments;

    441         unsigned short          hardsect_size;

    442         unsigned int            max_segment_size;

    443

    444         unsigned long           seg_boundary_mask;

    445         unsigned int            dma_alignment;

    446

    447         struct blk_queue_tag    *queue_tags;

    448

    449         unsigned int            nr_sorted;

    450         unsigned int            in_flight;

    451

    452         /*

    453          * sg stuff

    454          */

    455         unsigned int            sg_timeout;

    456         unsigned int            sg_reserved_size;

    457         int                     node;

    458 #ifdef CONFIG_BLK_DEV_IO_TRACE

    459         struct blk_trace        *blk_trace;

    460 #endif

    461         /*

    462          * reserved for flush operations

    463          */

    464         unsigned int            ordered, next_ordered, ordseq;

    465         int                     orderr, ordcolor;

    466         struct request          pre_flush_rq, bar_rq, post_flush_rq;

    467         struct request          *orig_bar_rq;

    468         unsigned int            bi_size;

    469

    470         struct mutex            sysfs_lock;

    471 };

Here we see that rq is actually a struct request_list. That structure is defined in the same file.

    131 struct request_list {

    132         int count[2];

    133         int starved[2];

    134         int elvpriv;

    135         mempool_t *rq_pool;

    136         wait_queue_head_t wait[2];

    137 };

We don't need to stare at all of this right now, though; only a couple of functions matter to us. (Note that count[], starved[] and wait[] each have one slot per direction: index 0 for reads, index 1 for writes.) The first function is blk_alloc_request(), called at line 2125, from ll_rw_blk.c:

   1970 static struct request *

   1971 blk_alloc_request(request_queue_t *q, int rw, int priv, gfp_t gfp_mask)

   1972 {

   1973         struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

   1974

   1975         if (!rq)

   1976                 return NULL;

   1977

   1978         /*

   1979          * first three bits are identical in rq->cmd_flags and bio->bi_rw,

   1980          * see bio.h and blkdev.h

   1981          */

   1982         rq->cmd_flags = rw | REQ_ALLOCED;

   1983

   1984         if (priv) {

   1985                 if (unlikely(elv_set_request(q, rq, gfp_mask))) {

   1986                         mempool_free(rq, q->rq.rq_pool);

   1987                         return NULL;

   1988                 }

   1989                 rq->cmd_flags |= REQ_ELVPRIV;

   1990         }

   1991

   1992         return rq;

   1993 }

Even if the rest looks obscure, line 1973 makes one thing clear: a struct request is allocated here. In other words, up to this point we had a request queue but nothing real in it; from this moment on we have an actual request. It hasn't joined the queue yet, but that is only a matter of time.
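
The rq_pool that the request comes out of is a mempool: a pool that keeps a minimum number of objects pre-allocated so that allocation can keep making progress even under memory pressure. Here is a hedged, self-contained sketch of the API; the real pool is built over a slab cache of struct request when the queue is initialized, so the kmalloc-backed pool and the numbers below are illustrative only.

    #include <linux/mempool.h>
    #include <linux/blkdev.h>

    static mempool_t *demo_rq_pool;

    static int demo_setup_pool(void)
    {
            /* keep at least 4 objects in reserve; the block layer really
             * builds its pool over a kmem_cache of struct request */
            demo_rq_pool = mempool_create_kmalloc_pool(4, sizeof(struct request));
            return demo_rq_pool ? 0 : -ENOMEM;
    }

    static struct request *demo_get_rq(gfp_t gfp_mask)
    {
            /* with __GFP_WAIT this may sleep, but the reserve makes an
             * outright failure much less likely than with a bare kmalloc() */
            return mempool_alloc(demo_rq_pool, gfp_mask);
    }

    static void demo_put_rq(struct request *rq)
    {
            mempool_free(rq, demo_rq_pool);
    }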

Next up: rq_init().

    238 static void rq_init(request_queue_t *q, struct request *rq)

    239 {

    240         INIT_LIST_HEAD(&rq->queuelist);

    241         INIT_LIST_HEAD(&rq->donelist);

    242

    243         rq->errors = 0;

    244         rq->bio = rq->biotail = NULL;

    245         INIT_HLIST_NODE(&rq->hash);

    246         RB_CLEAR_NODE(&rq->rb_node);

    247         rq->ioprio = 0;

    248         rq->buffer = NULL;

    249         rq->ref_count = 1;

    250         rq->q = q;

    251         rq->special = NULL;

    252         rq->data_len = 0;

    253         rq->data = NULL;

    254         rq->nr_phys_segments = 0;

    255         rq->sense = NULL;

    256         rq->end_io = NULL;

    257         rq->end_io_data = NULL;

    258         rq->completion_data = NULL;

    259 }

What this function does needs no explanation from me; even the auntie selling malatang in Jinyang Xincun, Pudong could tell you: it initializes the freshly allocated rq.

Then get_request() returns in good spirits; in the normal case get_request_wait() returns right after it, and blk_get_request() returns as well. We carry the freshly allocated and initialized req back into scsi_execute(), and the stretch of code that follows is exactly what we care about most: the real assignments to req, such as req->cmd_len and req->cmd. In other words, this is how the SCSI command gets dragged into the request's boat. The two are no longer like falling petals that the water cannot keep and water that the petals cannot enter; from now on they rise together, fall together, and conspire together.
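
One detail worth spelling out here: req->cmd_len is not passed in by the caller, it is derived from the opcode with COMMAND_SIZE(). The top three bits of a SCSI opcode encode the command group, and each group has a fixed CDB length. In include/scsi/scsi.h of this era the macro looks roughly like this:

    /* group 0 commands use 6-byte CDBs, groups 1 and 2 use 10 bytes,
     * group 5 uses 12 bytes, and so on */
    extern const unsigned char scsi_command_size[8];
    #define COMMAND_SIZE(opcode) scsi_command_size[((opcode) >> 5) & 7]

So for READ_10 (opcode 0x28), (0x28 >> 5) & 7 == 1 and the CDB is 10 bytes; TEST_UNIT_READY (0x00) falls into group 0 and gets a 6-byte CDB.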

And with that, the first transformation is complete: from SCSI command to request.

 