1. 错误打印log
硬盘跑IO的时候,经常会在内核日志中见到下面这种错误打印,它是由SCSI层输出的。这是什么错误?该怎么定位?
[ 6304.176111] sd 0:0:4:0: [sde] tag#7 UNKNOWN(0x2003) Result: hostbyte=0x07 driverbyte=0x00
[ 6304.184281] sd 0:0:4:0: [sde] tag#7 CDB: opcode=0x28 28 00 00 0a 88 00 00 04 00 00
[ 6304.191838] print_req_error: I/O error, dev sde, sector 690176
[ 6333.587832] sd 0:0:4:0: [sde] tag#0 UNKNOWN(0x2003) Result: hostbyte=0x05 driverbyte=0x00
[ 6333.596010] sd 0:0:4:0: [sde] tag#0 CDB: opcode=0x28 28 00 00 0a 8c 00 00 04 00 00
2. log打印位置
首先,我们看这个日志是在哪里打印的。打印位置在drivers/scsi/scsi_lib.c的scsi_io_completion()中。日志里的"Result:"行和"CDB:"行分别由scsi_print_result()和scsi_print_command()打印。
1012 switch (action) {
1013 case ACTION_FAIL:
1014 /* Give up and fail the remainder of the request */
1015 if (!(req->rq_flags & RQF_QUIET)) {
1016 static DEFINE_RATELIMIT_STATE(_rs,
1017 DEFAULT_RATELIMIT_INTERVAL,
1018 DEFAULT_RATELIMIT_BURST);
1019
1020 if (unlikely(scsi_logging_level))
1021 level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT,
1022 SCSI_LOG_MLCOMPLETE_BITS);
1023
1024 /*
1025 * if logging is enabled the failure will be printed
1026 * in scsi_log_completion(), so avoid duplicate messages
1027 */
1028 if (!level && __ratelimit(&_rs)) {
1029 scsi_print_result(cmd, NULL, FAILED);
1030 if (driver_byte(result) & DRIVER_SENSE)
1031 scsi_print_sense(cmd);
1032 scsi_print_command(cmd);
1033 }
1034 }
1035 if (!scsi_end_request(req, error, blk_rq_err_bytes(req), 0))
1036 return;
1037 /*FALLTHRU*/
1038 case ACTION_REPREP:
3. hostbyte和driverbyte含义
hostbyte和driverbyte是cmd->result中的两个域,各占一个字节,通过下面的宏从result中提取。
210 #define status_byte(result) (((result) >> 1) & 0x7f)
211 #define msg_byte(result) (((result) >> 8) & 0xff)
212 #define host_byte(result) (((result) >> 16) & 0xff)
213 #define driver_byte(result) (((result) >> 24) & 0xff)
hostbyte码值对应的含义如下(例如上面日志中的hostbyte=0x07对应DID_ERROR,hostbyte=0x05对应DID_ABORT):
132 /*
133 * Host byte codes
134 */
135
136 #define DID_OK 0x00 /* NO error */
137 #define DID_NO_CONNECT 0x01 /* Couldn't connect before timeout period */
138 #define DID_BUS_BUSY 0x02 /* BUS stayed busy through time out period */
139 #define DID_TIME_OUT 0x03 /* TIMED OUT for other reason */
140 #define DID_BAD_TARGET 0x04 /* BAD target. */
141 #define DID_ABORT 0x05 /* Told to abort for some other reason */
142 #define DID_PARITY 0x06 /* Parity error */
143 #define DID_ERROR 0x07 /* Internal error */
144 #define DID_RESET 0x08 /* Reset by somebody. */
145 #define DID_BAD_INTR 0x09 /* Got an interrupt we weren't expecting. */
146 #define DID_PASSTHROUGH 0x0a /* Force command past mid-layer */
147 #define DID_SOFT_ERROR 0x0b /* The low level driver just wish a retry */
148 #define DID_IMM_RETRY 0x0c /* Retry without decrementing retry count */
149 #define DID_REQUEUE 0x0d /* Requeue command (no immediate retry) also
150 * without decrementing the retry count */
151 #define DID_TRANSPORT_DISRUPTED 0x0e /* Transport error disrupted execution
152 * and the driver blocked the port to
153 * recover the link. Transport class will
154 * retry or fail IO */
155 #define DID_TRANSPORT_FAILFAST 0x0f /* Transport class fastfailed the io */
156 #define DID_TARGET_FAILURE 0x10 /* Permanent target failure, do not retry on
157 * other paths */
158 #define DID_NEXUS_FAILURE 0x11 /* Permanent nexus failure, retry on other
159 * paths might yield different results */
160 #define DID_ALLOC_FAILURE 0x12 /* Space allocation on the device failed */
161 #define DID_MEDIUM_ERROR 0x13 /* Medium error */
4. hostbyte跟底层的转换关系
SAS_DATA_UNDERRUN --> DID_ERROR
SAS_DATA_OVERRUN --> DID_ERROR
SAS_ABORTED_TASK --> DID_ABORT
上述转换关系的代码位于drivers/scsi/libsas/sas_scsi_host.c中的sas_end_task()。