linux 内核之block trim BLKDISCARD过程

对SSD进行trim,是通过ioctl发送BLKDISCARD命令完成的。

下面是block层ioctl的调用栈。

sys_ioctl->do_vfs_ioctl->block_ioctl->blkdev_ioctl->blk_ioctl_discard->blkdev_issue_discard->__blk_run_queue->scsi_request_fn->....

具体通过代码讲解BLKDISCARD执行流程。

/*
 * vfs_ioctl - forward an ioctl to the file's unlocked_ioctl handler.
 * @filp: open file the ioctl was issued on
 * @cmd:  ioctl command number
 * @arg:  command argument, passed through untouched
 *
 * Returns -ENOTTY when the file has no unlocked_ioctl handler or when the
 * handler answers -ENOIOCTLCMD; otherwise returns the handler's result.
 */
static long vfs_ioctl(struct file *filp, unsigned int cmd,
              unsigned long arg)
{
    int rc;

    if (!filp->f_op->unlocked_ioctl)
        return -ENOTTY;

    /*
     * Per the article, for a block device f_op is def_blk_fops, so this
     * call lands in block_ioctl().
     */
    rc = filp->f_op->unlocked_ioctl(filp, cmd, arg);

    /* "No such ioctl command" is reported to the caller as -ENOTTY. */
    return (rc == -ENOIOCTLCMD) ? -ENOTTY : rc;
}

/*
 * block_ioctl - unlocked_ioctl entry point for block device files.
 * @file: open block device file
 * @cmd:  ioctl command number
 * @arg:  command argument, passed through untouched
 *
 * Looks up the block_device behind @file, builds the access mode and
 * hands everything to blkdev_ioctl(), whose result is returned.
 */
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
    struct block_device *bdev = I_BDEV(file->f_mapping->host);
    /*
     * O_NDELAY can be changed at any time through fcntl(.., F_SETFL, ..),
     * so FMODE_NDELAY is recomputed from f_flags on every ioctl: start
     * with it cleared, then set it only if O_NDELAY is currently on.
     */
    fmode_t mode = file->f_mode & ~FMODE_NDELAY;

    if (file->f_flags & O_NDELAY)
        mode |= FMODE_NDELAY;

    return blkdev_ioctl(bdev, mode, cmd, arg);
}

/*
 * blkdev_ioctl - block device ioctl dispatcher (excerpt).
 * @bdev: target block device
 * @mode: access mode of the open file
 * @cmd:  ioctl command number
 * @arg:  user-space pointer to the command argument
 *
 * Only the BLKDISCARD / BLKSECDISCARD case is shown.  For these commands
 * @arg points to two uint64_t values which are copied from user space and
 * passed to blk_ioctl_discard() as start and length.
 *
 * Returns -EBADF if the device was not opened for writing, -EFAULT if the
 * range cannot be copied from user space, otherwise the result of
 * blk_ioctl_discard().
 */
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
            unsigned long arg)
{
    /* ... (start of the function, including the switch header, elided) ... */

    case BLKDISCARD: /* a cmd of BLKDISCARD takes this branch */
    case BLKSECDISCARD: {
        uint64_t range[2];

        /* Discarding destroys data, so write access is required. */
        if (!(mode & FMODE_WRITE))
            return -EBADF;

        if (copy_from_user(range, (void __user *)arg, sizeof(range)))
            return -EFAULT;

        /* Last argument selects the secure variant of the discard. */
        return blk_ioctl_discard(bdev, range[0], range[1],
                     cmd == BLKSECDISCARD);
    }

    /* ... (remaining cases and the end of the switch elided) ... */
}

/*
 * blk_ioctl_discard - validate a user discard range and issue it.
 * @bdev:   target block device
 * @start:  start offset in bytes; must be a multiple of 512
 * @len:    length in bytes; must be a multiple of 512
 * @secure: non-zero to request BLKDEV_DISCARD_SECURE
 *
 * Both values arrive in bytes and are converted to 512-byte sectors here
 * before being handed to blkdev_issue_discard().
 *
 * Returns -EINVAL if either value is not 512-byte aligned or if the range
 * extends past the device size; otherwise the result of
 * blkdev_issue_discard().
 */
static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
                 uint64_t len, int secure)
{
    uint64_t first_sect, sect_count;

    /* Any of the low nine bits set in either value means misalignment. */
    if ((start | len) & 511)
        return -EINVAL;

    /* Bytes -> 512-byte sectors. */
    first_sect = start >> 9;
    sect_count = len >> 9;

    /* The range must not extend past the end of the device. */
    if (first_sect + sect_count > (i_size_read(bdev->bd_inode) >> 9))
        return -EINVAL;

    return blkdev_issue_discard(bdev, first_sect, sect_count, GFP_KERNEL,
                    secure ? BLKDEV_DISCARD_SECURE : 0);
}

/*
 * blkdev_issue_discard - queue discard bios for a sector range and wait.
 * @bdev:     block device to discard on
 * @sector:   first sector of the range
 * @nr_sects: number of sectors to discard
 * @gfp_mask: allocation flags used for each bio
 * @flags:    BLKDEV_DISCARD_SECURE requests a secure discard
 *
 * The range is split into bios of at most max_discard_sectors each, with
 * split points kept on the queue's discard granularity/alignment.  After
 * submitting, the function waits for the outstanding bios.
 *
 * Returns 0 on success, -ENXIO if the device has no request queue,
 * -EOPNOTSUPP if the queue does not support (secure) discard or its usable
 * discard size is zero, -ENOMEM if a bio cannot be allocated, and -EIO if
 * BIO_UPTODATE is found cleared in the batch flags after completion (this
 * check runs last, so it overrides an earlier -ENOMEM).
 */
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,

        sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)

{

    DECLARE_COMPLETION_ONSTACK(wait);

    struct request_queue *q = bdev_get_queue(bdev);

    int type = REQ_WRITE | REQ_DISCARD; /* the bios are submitted as WRITE | DISCARD */

    unsigned int max_discard_sectors, granularity;

    int alignment;

    struct bio_batch bb;

    struct bio *bio;

    int ret = 0;

    struct blk_plug plug;

    if (!q)

        return -ENXIO;

    if (!blk_queue_discard(q))

        return -EOPNOTSUPP;

    /* Zero-sector (unknown) and one-sector granularities are the same.  */

    granularity = max(q->limits.discard_granularity >> 9, 1U);

    alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;

    /*
     * Ensure that max_discard_sectors is of the proper
     * granularity, so that requests stay aligned after a split.
     */

    max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);

    max_discard_sectors -= max_discard_sectors % granularity;

    if (unlikely(!max_discard_sectors)) {

        /* Avoid infinite loop below. Being cautious never hurts. */

        return -EOPNOTSUPP;

    }

    if (flags & BLKDEV_DISCARD_SECURE) {

        if (!blk_queue_secdiscard(q))

            return -EOPNOTSUPP;

        type |= REQ_SECURE;

    }

    /*
     * The counter starts at 1 and is bumped once per submitted bio; the
     * initial reference is dropped after the loop.  NOTE(review):
     * bio_batch_end_io is not shown here - presumably it decrements
     * bb.done and completes bb.wait when it reaches zero; confirm.
     */
    atomic_set(&bb.done, 1);

    bb.flags = 1 << BIO_UPTODATE;

    bb.wait = &wait;

    blk_start_plug(&plug);

    /* Keep submitting bios until the whole range has been covered. */
    while (nr_sects) { 

        unsigned int req_sects;

        sector_t end_sect, tmp;

        bio = bio_alloc(gfp_mask, 1); /* allocate one bio */

        if (!bio) {

            ret = -ENOMEM;

            break;

        }

        /* This bio covers what is left, capped at max_discard_sectors. */
        req_sects = min_t(sector_t, nr_sects, max_discard_sectors);

        /*
         * If splitting a request, and the next starting sector would be
         * misaligned, stop the discard at the previous aligned sector.
         */

        end_sect = sector + req_sects;

        tmp = end_sect;

        if (req_sects < nr_sects &&

            sector_div(tmp, granularity) != alignment) {

            end_sect = end_sect - alignment;

            sector_div(end_sect, granularity);

            end_sect = end_sect * granularity + alignment;

            req_sects = end_sect - sector;

        }

        /* Fill in the bio: start sector, completion hook, device, size. */
        bio->bi_iter.bi_sector = sector;

        bio->bi_end_io = bio_batch_end_io;

        bio->bi_bdev = bdev;

        bio->bi_private = &bb;

        bio->bi_iter.bi_size = req_sects << 9;

        /* Advance past the sectors covered by this bio. */
        nr_sects -= req_sects;

        sector = end_sect;

        atomic_inc(&bb.done);

        submit_bio(type, bio); /* hand the bio to the block layer */

        /*
         * We can loop for a long time in here, if someone does
         * full device discards (like mkfs). Be nice and allow
         * us to schedule out to avoid softlocking if preempt
         * is disabled.
         */

        cond_resched();

    }

    blk_finish_plug(&plug);

    /* Wait for bios in-flight */

    if (!atomic_dec_and_test(&bb.done))

        wait_for_completion_io(&wait);

    /* A cleared BIO_UPTODATE bit in the batch flags is reported as -EIO. */
    if (!test_bit(BIO_UPTODATE, &bb.flags))

        ret = -EIO;

    return ret;

}

发送bio的调用栈:

submit_bio->generic_make_request->blk_queue_bio{这里将bio转换成request,并插入到request_queue中}->__blk_run_queue->scsi_request_fn.


在scsi_request_fn中,首先通过blk_peek_request从request_queue中取出一个req,并通过scsi_prep_fn,对req做一些准备工作。

static int scsi_prep_fn(struct request_queue *q, struct request *req)

{

    struct scsi_device *sdev = q->queuedata;

    struct scsi_cmnd *cmd;

    int ret;

    ret = scsi_prep_state_check(sdev, req);

    if (ret != BLKPREP_OK)

        goto out;

    cmd = scsi_get_cmd_from_req(sdev, req);//通过req构建scsi_cmd结构体,完成block向scsi层转换

    if (unlikely(!cmd)) {

        ret = BLKPREP_DEFER;

        goto out;

    }

    if (req->cmd_type == REQ_TYPE_FS)

        ret = scsi_cmd_to_driver(cmd)->init_command(cmd);//对于req属于fs的类型,调用sd_init_command;

    else if (req->cmd_type == REQ_TYPE_BLOCK_PC)

        ret = scsi_setup_blk_pc_cmnd(sdev, req);

    else

        ret = BLKPREP_KILL;

out:

    return scsi_prep_return(q, req, ret);

}

对于BLKDISCARD的情形,发送的为FS TYPE的cmd,因此,这里走sd_init_command;

static int sd_init_command(struct scsi_cmnd *SCpnt)

{

    struct request *rq = SCpnt->request;

    struct scsi_device *sdp = SCpnt->device;

    struct gendisk *disk = rq->rq_disk;

    struct scsi_disk *sdkp;

    sector_t block = blk_rq_pos(rq);

    sector_t threshold;

    unsigned int this_count = blk_rq_sectors(rq);

    int ret, host_dif;

    unsigned char protect;

    /*

     * Discard request come in as REQ_TYPE_FS but we turn them into

     * block PC requests to make life easier.

     */

    if (rq->cmd_flags & REQ_DISCARD) { //由于cmd_flags包含有REQ_DISCARD flag,因此走该分支;

        ret = sd_setup_discard_cmnd(sdp, rq);

        goto out;

    } else if (rq->cmd_flags & REQ_WRITE_SAME) {

        ret = sd_setup_write_same_cmnd(sdp, rq);

        goto out;

    } else if (rq->cmd_flags & REQ_FLUSH) {

        ret = scsi_setup_flush_cmnd(sdp, rq);

        goto out;

    }

     ...

     正常的IO请求初始化走下面分支,这里不再介绍。

     ...

}

/*
 * sd_setup_discard_cmnd - turn a discard request into a SCSI command.
 * @sdp: SCSI device the request is for
 * @rq:  discard request
 *
 * Fills rq->cmd with an UNMAP, WRITE SAME(16) or WRITE SAME(10) CDB,
 * depending on the disk's provisioning_mode, attaches one zeroed page as
 * the data payload and finishes the setup through
 * scsi_setup_blk_pc_cmnd().
 *
 * Returns BLKPREP_DEFER if the page cannot be allocated, BLKPREP_KILL for
 * an unhandled provisioning mode, otherwise the result of
 * scsi_setup_blk_pc_cmnd().  The page is freed here on any result other
 * than BLKPREP_OK.
 */
static int sd_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)

{

    struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);

    sector_t sector = blk_rq_pos(rq);

    unsigned int nr_sectors = blk_rq_sectors(rq);

    unsigned int nr_bytes = blk_rq_bytes(rq);

    unsigned int len;

    int ret;

    char *buf;

    struct page *page;

    /* Scale start and count from 512-byte units to the device's sector size. */
    sector >>= ilog2(sdp->sector_size) - 9;

    nr_sectors >>= ilog2(sdp->sector_size) - 9;

    rq->timeout = SD_TIMEOUT;

    memset(rq->cmd, 0, rq->cmd_len);

    page = alloc_page(GFP_ATOMIC | __GFP_ZERO); /* one zero-filled page for the payload */

    if (!page)

        return BLKPREP_DEFER;

    /*
     * Pick the discard command from the disk's provisioning mode.  In the
     * setup described by the article the mode is SD_LBP_WS16.
     */
    switch (sdkp->provisioning_mode) {

    case SD_LBP_UNMAP:

        /* 10-byte UNMAP CDB; the range goes into a 24-byte data buffer. */
        buf = page_address(page);

        rq->cmd_len = 10;

        rq->cmd[0] = UNMAP;

        rq->cmd[8] = 24;

        /* Buffer layout: 22 at offset 0, 16 at offset 2, start at 8, count at 16. */
        put_unaligned_be16(6 + 16, &buf[0]);

        put_unaligned_be16(16, &buf[2]);

        put_unaligned_be64(sector, &buf[8]);

        put_unaligned_be32(nr_sectors, &buf[16]);

        len = 24;

        break;

    case SD_LBP_WS16:

        /* WRITE SAME(16): 64-bit start at cmd[2], 32-bit count at cmd[10]. */
        rq->cmd_len = 16;

        rq->cmd[0] = WRITE_SAME_16;

        rq->cmd[1] = 0x8; /* UNMAP */

        put_unaligned_be64(sector, &rq->cmd[2]);

        put_unaligned_be32(nr_sectors, &rq->cmd[10]);

        /* The payload is one device sector. */
        len = sdkp->device->sector_size;

        break;

    case SD_LBP_WS10:

    case SD_LBP_ZERO:

        /* WRITE SAME(10): 32-bit start at cmd[2], 16-bit count at cmd[7]. */
        rq->cmd_len = 10;

        rq->cmd[0] = WRITE_SAME;

        /* Only WS10 sets the UNMAP bit; SD_LBP_ZERO leaves it clear. */
        if (sdkp->provisioning_mode == SD_LBP_WS10)

            rq->cmd[1] = 0x8; /* UNMAP */

        put_unaligned_be32(sector, &rq->cmd[2]);

        put_unaligned_be16(nr_sectors, &rq->cmd[7]);

        len = sdkp->device->sector_size;

        break;

    default:

        ret = BLKPREP_KILL;

        goto out;

    }

    /*
     * Remember the page in rq->completion_data so that it can be released
     * later: sd_uninit_command() frees it through this field (per the
     * article it is reached from scsi_unprep_fn at the end).
     */
    rq->completion_data = page;

    blk_add_request_payload(rq, page, len);

    ret = scsi_setup_blk_pc_cmnd(sdp, rq);

    /* Set the request's data length to the originally requested byte count. */
    rq->__data_len = nr_bytes;

out:

    /* On any failure the page is released right here. */
    if (ret != BLKPREP_OK)

        __free_page(page);

    return ret;

}

/*
 * sd_uninit_command - release per-command resources for a disk request.
 * @SCpnt: SCSI command being torn down
 *
 * For discard requests the page stored in rq->completion_data is freed.
 * If the command's CDB buffer is not the request's own cmd buffer, it is
 * returned to sd_cdb_pool and the command's CDB pointer and length are
 * reset.
 */
static void sd_uninit_command(struct scsi_cmnd *SCpnt)
{
    struct request *req = SCpnt->request;

    if (req->cmd_flags & REQ_DISCARD)
        __free_page(req->completion_data);

    /* The CDB lives in the request itself: nothing else to release. */
    if (SCpnt->cmnd == req->cmd)
        return;

    mempool_free(SCpnt->cmnd, sd_cdb_pool);
    SCpnt->cmd_len = 0;
    SCpnt->cmnd = NULL;
}

接下来,将初始化的scsi_cmd dispatch出去。

scsi_request_fn->scsi_dispatch_cmd->ata_scsi_queuecmd->__ata_scsi_queuecmd.

static inline int __ata_scsi_queuecmd(struct scsi_cmnd *scmd,

                      struct ata_device *dev)

{

    u8 scsi_op = scmd->cmnd[0];

    ata_xlat_func_t xlat_func;

    int rc = 0;

    if (dev->class == ATA_DEV_ATA) {//针对ATA device,走该分支

        if (unlikely(!scmd->cmd_len || scmd->cmd_len > dev->cdb_len))

            goto bad_cdb_len;

        xlat_func = ata_get_xlat_func(dev, scsi_op);

    } else {

        if (unlikely(!scmd->cmd_len))

            goto bad_cdb_len;

        xlat_func = NULL;

        if (likely((scsi_op != ATA_16) || !atapi_passthru16)) {

            /* relay SCSI command to ATAPI device */

            int len = COMMAND_SIZE(scsi_op);

            if (unlikely(len > scmd->cmd_len || len > dev->cdb_len))

                goto bad_cdb_len;

            xlat_func = atapi_xlat;

        } else {

            /* ATA_16 passthru, treat as an ATA command */

            if (unlikely(scmd->cmd_len > 16))

                goto bad_cdb_len;

            xlat_func = ata_get_xlat_func(dev, scsi_op);

        }

    }

    if (xlat_func)

        rc = ata_scsi_translate(dev, scmd, xlat_func);

    else

        ata_scsi_simulate(dev, scmd);

    return rc;

 bad_cdb_len:

    DPRINTK("bad CDB len=%u, scsi_op=0x%02x, max=%u\n",

        scmd->cmd_len, scsi_op, dev->cdb_len);

    scmd->result = DID_ERROR << 16;

    scmd->scsi_done(scmd);

    return 0;

}

/*
 * ata_get_xlat_func - select the SCSI-to-ATA translation function.
 * @dev: target ATA device
 * @cmd: SCSI opcode
 *
 * Returns the translation function for @cmd, or NULL when the opcode has
 * none (including SYNCHRONIZE_CACHE when ata_try_flush_cache() returns
 * zero for @dev).
 */
static inline ata_xlat_func_t ata_get_xlat_func(struct ata_device *dev, u8 cmd)
{
    switch (cmd) {
    case READ_6:
    case READ_10:
    case READ_16:
    case WRITE_6:
    case WRITE_10:
    case WRITE_16:
        return ata_scsi_rw_xlat;

    case WRITE_SAME_16:
        /*
         * Per the article, discard is carried by WRITE SAME(16), so the
         * discard path resolves to ata_scsi_write_same_xlat.
         */
        return ata_scsi_write_same_xlat;

    case SYNCHRONIZE_CACHE:
        if (ata_try_flush_cache(dev))
            return ata_scsi_flush_xlat;
        break;

    case VERIFY:
    case VERIFY_16:
        return ata_scsi_verify_xlat;

    case ATA_12:
    case ATA_16:
        return ata_scsi_pass_thru;

    case MODE_SELECT:
    case MODE_SELECT_10:
        return ata_scsi_mode_select_xlat;

    case START_STOP:
        return ata_scsi_start_stop_xlat;

    default:
        break;
    }

    /* No translation available for this opcode. */
    return NULL;
}

下面通过ata_scsi_translate把scsi_cmd转换成具体的ata_command发送出去。

/*
 * ata_scsi_translate - translate a SCSI command and issue it (excerpt).
 * @dev:       target ATA device
 * @cmd:       SCSI command to translate
 * @xlat_func: translation function chosen for this command
 *
 * Allocates a queued command, attaches the scatter/gather list when the
 * command transfers data, runs @xlat_func on it and issues the result.
 * Returns 0 on the path shown here.
 *
 * NOTE: the labels targeted by the gotos below (err_mem, err_did,
 * early_finish, defer) are not part of this excerpt.
 */
static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,

                  ata_xlat_func_t xlat_func)

{

    struct ata_port *ap = dev->link->ap;

    struct ata_queued_cmd *qc;

    int rc;

    VPRINTK("ENTER\n");

    qc = ata_scsi_qc_new(dev, cmd);

    if (!qc)

        goto err_mem;

    /* data is present; dma-map it */

    if (cmd->sc_data_direction == DMA_FROM_DEVICE ||

        cmd->sc_data_direction == DMA_TO_DEVICE) {

        /* A data command with an empty buffer is rejected. */
        if (unlikely(scsi_bufflen(cmd) < 1)) {

            ata_dev_warn(dev, "WARNING: zero len r/w req\n");

            goto err_did;

        }

        ata_sg_init(qc, scsi_sglist(cmd), scsi_sg_count(cmd));

        qc->dma_dir = cmd->sc_data_direction;

    }

    qc->complete_fn = ata_scsi_qc_complete;

    /* Run the opcode-specific translation; non-zero ends the command early. */
    if (xlat_func(qc))

        goto early_finish;

    /* Let the port driver defer the command if it provides a qc_defer hook. */
    if (ap->ops->qc_defer) {

        if ((rc = ap->ops->qc_defer(qc)))

            goto defer;

    }

    /* select device, send command to hardware */

    ata_qc_issue(qc); /* per the article, this calls into the SATA driver to send the command to the controller */

    VPRINTK("EXIT\n");

    return 0;

}

命令完成过程,跟正常command 命令完成流程一样,这里不再列出。

  • 3
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
Linux的SSD TRIM是一种用于优化固态硬盘(SSD)性能和寿命的技术。TRIM是一项操作系统级别的指令,用于通知SSD哪些数据块不再被使用,可以被擦除和重写。这有助于提高SSD的写入性能,并减少因为重写未使用数据块而引起的性能下降。 要启用SSD TRIM功能,您需要进行以下步骤: 1. 确保您的SSD支持TRIM。大多数现代SSD都支持这个功能,但一些较旧或低端的SSD可能不支持。您可以查看SSD制造商的文档或技术规格来确认它是否支持TRIM。 2. 检查您的Linux发行版是否已启用TRIM。大多数现代Linux发行版默认情况下已启用TRIM。您可以运行以下命令来检查: ``` sudo systemctl status fstrim.timer ``` 如果输出显示"active"或"enabled",则表示TRIM已启用。 3. 确保您的文件系统支持TRIM。大多数常见的文件系统,如ext4和XFS,都支持TRIM。您可以通过检查`/etc/fstab`文件中的文件系统挂载选项来确认是否启用了TRIM。 例如,对于ext4文件系统,您应该看到类似于以下内容的挂载选项: ``` UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx / ext4 discard,noatime,errors=remount-ro 0 1 ``` 请注意,`discard`选项用于启用TRIM。 4. 手动运行TRIM命令。如果TRIM没有自动运行,您可以手动运行TRIM命令来清理未使用的数据块。使用以下命令: ``` sudo fstrim -av ``` 这将触发对所有已挂载文件系统的TRIM操作。 请谨慎使用TRIM命令,因为它会触发SSD上的擦除操作,可能会导致数据丢失。确保在运行TRIM之前备份重要数据。 希望这些信息对您有所帮助!

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值