Linux 块设备驱动

Linux 块设备驱动

块设备

块设备是针对存储设备的,比如 SD 卡、 EMMC、 NAND Flash、 Nor Flash、 SPI Flash、机械硬盘、固态硬盘等

所以 块设备驱动其实就是这些存储设备驱动

与字符设备相比 :

块设备只能以块为单位进行读写访问,块是 Linux 虚拟文件系统(VFS)基本的数据传输单位。

字符设备是以字节为单位进行数据传输的,不需要缓冲。

块设备在结构上是可以进行随机访问的,对于这些设备的读写都是按块进行的。块设备一般都是使用缓冲区来暂时存放数据,等到条件成熟后,再一次性将缓冲区中的数据写入块设备中。这么做的目的是为了减少对块设备的擦除次数,提高块设备寿命。

字符设备是顺序的数据流设备,字符设备是按照字节进行读写访问的。字符设备不需要缓冲区,对于字符设备的访问都是实时的,而且也不需要按照固定的块大小进行访问。


块设备驱动框架

block_device 表示 块设备

// include/linux/fs.h

/* Represents a block device (definition quoted from include/linux/fs.h). */
struct block_device 
{
	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
	int			bd_openers;
	struct inode *		bd_inode;	/* will die */
	struct super_block *	bd_super;
	struct mutex		bd_mutex;	/* open/close mutex */
	void *			bd_claiming;
	void *			bd_holder;
	int			bd_holders;
	bool			bd_write_holder;
#ifdef CONFIG_SYSFS
	struct list_head	bd_holder_disks;
#endif
	struct block_device *	bd_contains;
	unsigned		bd_block_size;
	u8			bd_partno;
	struct hd_struct *	bd_part;
	/* number of times partitions within this device have been opened. */
	unsigned		bd_part_count;
	int			bd_invalidated;
	struct gendisk *	bd_disk;	/* pointer to a struct gendisk */
	struct request_queue *  bd_queue;
	struct backing_dev_info *bd_bdi;
	struct list_head	bd_list;
	/*
	 * Private data.  You must have bd_claim'ed the block_device
	 * to use this.  NOTE:  bd_claim allows an owner to claim
	 * the same device multiple times, the owner must take special
	 * care to not mess up bd_private for that case.
	 */
	unsigned long		bd_private;

	/* The counter of freeze processes */
	int			bd_fsfreeze_count;
	/* Mutex for freeze */
	struct mutex		bd_fsfreeze_mutex;
} __randomize_layout;

注册块设备

// block/genhd.c

/**
 * @function: Register a block device.
 * @parameter: 
 * 		major: major device number; a value in 1~255 requests that specific
 * 		       number, 0 asks the system to allocate one automatically
 * 		name: name of the block device
 * @return: 
 *     success: 0 when major was given as 1~255;
 * 			the allocated major number when major was 0
 *     error: a negative value
 * @note: 
 */
int register_blkdev(unsigned int major, const char *name)

注销块设备

// block/genhd.c

/**
 * @function: Unregister a block device.
 * @parameter: 
 * 		major: major device number of the block device to unregister
 * 		name: name of the block device to unregister
 * @return: none (the function is declared void)
 * @note: 
 */
void unregister_blkdev(unsigned int major, const char *name)

gendisk 结构体

gendisk 描述一个磁盘设备

// include/linux/genhd.h
/* Describes one disk device. */
struct gendisk 
{
	/* major, first_minor and minors are input parameters only,
	 * don't use directly.  Use disk_devt() and disk_max_parts().
	 */
	int major;			/* major device number of the disk */ /* major number of driver */
	int first_minor;	/* first minor device number of the disk */
	int minors;   	 	/* number of partitions of the disk */  /* maximum number of minors, =1 for
                                         * disks that can't be partitioned. */

	char disk_name[DISK_NAME_LEN];	/* name of major driver */
	char *(*devnode)(struct gendisk *gd, umode_t *mode);

	unsigned short events;		/* supported events */
	unsigned short event_flags;	/* flags related to event processing */

	/* Array of pointers to partitions indexed by partno.
	 * Protected with matching bdev lock but stat and other
	 * non-critical accesses use RCU.  Always access through
	 * helpers.
	 */
	struct disk_part_tbl __rcu *part_tbl;	/* partition table of the disk */
	struct hd_struct part0;

	const struct block_device_operations *fops;	/* block device operations */
	struct request_queue *queue;	/* request queue */
	void *private_data;

	int flags;
	struct rw_semaphore lookup_sem;
	struct kobject *slave_dir;

	struct timer_rand_state *random;
	atomic_t sync_io;		/* RAID */
	struct disk_events *ev;
#ifdef  CONFIG_BLK_DEV_INTEGRITY
	struct kobject integrity_kobj;
#endif	/* CONFIG_BLK_DEV_INTEGRITY */
	int node_id;
	struct badblocks *bb;
	struct lockdep_map lockdep_map;
};

申请 gendisk

// include/linux/genhd.h

/**
 * @function: Allocate a gendisk.
 * @parameter: 
 * 		minors: number of minor device numbers
 * @return: 
 *     success: the allocated gendisk
 *     error: NULL
 * @note: expands to alloc_disk_node() with NUMA_NO_NODE as the node argument
 */
#define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE)

删除 gendisk

// block/genhd.c
/**
 * @function: Delete a gendisk.
 * @parameter: 
 * 		disk : the gendisk to delete
 * @return: none (the function is declared void)
 * @note: 
 */
void del_gendisk(struct gendisk *disk)

将 gendisk 添加到内核

// include/linux/genhd.h

/**
 * @function: Add a gendisk to the kernel.
 * @parameter: 
 * 		disk: the gendisk to add to the kernel
 * @return: none (the function is declared void)
 * @note: 
 */
static inline void add_disk(struct gendisk *disk)

设置 gendisk 容量

// include/linux/genhd.h

/**
 * @function: Set the capacity of a gendisk.
 * @parameter: 
 * 		disk: the gendisk whose capacity is being set
 * 		size: disk capacity, of type sector_t (the example below expresses
 * 		      it as a count of 512-byte sectors)
 * @return: none (the function is declared void)
 * @note: 
 *		for a 2MB disk, sector count = (2*1024*1024)/512 = 4096
 */
static inline void set_capacity(struct gendisk *disk, sector_t size)

调整 gendisk 引用计数

//block/genhd.c

// Increase the reference count of a gendisk.
struct kobject *get_disk(struct gendisk *disk)

/**
 * @function: Decrease the reference count of a gendisk.
 * @parameter: 
 * 		disk: the gendisk whose reference count is decreased
 * @return: none (the function is declared void)
 * @note: 
 */
void put_disk(struct gendisk *disk)

block_device_operations 结构体

// include/linux/blkdev.h

/* Operation set (callback table) of a block device. */
struct block_device_operations 
{
	int (*open) (struct block_device *, fmode_t);	/* open the given block device */
	void (*release) (struct gendisk *, fmode_t);	/* close (release) the given block device */
	int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);	/* read or write the given page */
	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);		/* I/O control of the block device */
	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);	/* I/O control of the block device; NOTE(review): presumably the compat variant for 32-bit callers on a 64-bit kernel - confirm */
	unsigned int (*check_events) (struct gendisk *disk, 
				      unsigned int clearing);
	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
	int (*media_changed) (struct gendisk *);
	void (*unlock_native_capacity) (struct gendisk *);
	int (*revalidate_disk) (struct gendisk *);
	int (*getgeo)(struct block_device *, struct hd_geometry *);	/* get disk (geometry) information */
	/* this callback is with swap_lock and sometimes page table lock held */
	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
	int (*report_zones)(struct gendisk *, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data);
	struct module *owner;	/* module that owns this structure */
	const struct pr_ops *pr_ops;
};

块设备 I/O 请求过程

请求队列 request_queue

//include/linux/blkdev.h

/*
 * Request queue of a block device.
 * This listing is an excerpt: the leading "//..." line marks members that
 * were left out of the quoted definition.
 */
struct request_queue 
{
	//...
	/* sw queues */
	struct blk_mq_ctx __percpu	*queue_ctx;

	unsigned int		queue_depth;

	/* hw dispatch queues */
	struct blk_mq_hw_ctx	**queue_hw_ctx;
	unsigned int		nr_hw_queues;

	struct backing_dev_info	*backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;
	/*
	 * Number of contexts that have called blk_set_pm_only(). If this
	 * counter is above zero then only RQF_PM and RQF_PREEMPT requests are
	 * processed.
	 */
	atomic_t		pm_only;

	/*
	 * ida allocated id for this queue.  Used to index queues from
	 * ioctx.
	 */
	int			id;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	gfp_t			bounce_gfp;

	spinlock_t		queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject kobj;

	/*
	 * mq queue kobject
	 */
	struct kobject *mq_kobj;

#ifdef  CONFIG_BLK_DEV_INTEGRITY
	struct blk_integrity integrity;
#endif	/* CONFIG_BLK_DEV_INTEGRITY */

#ifdef CONFIG_PM
	struct device		*dev;
	int			rpm_status;
	unsigned int		nr_pending;
#endif

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */


    

#ifdef CONFIG_BLK_DEV_ZONED
	/*
	 * Zoned block device information for request dispatch control.
	 * nr_zones is the total number of zones of the device. This is always
	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
	 * bits which indicates if a zone is conventional (bit set) or
	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
	 * bits which indicates if a zone is write locked, that is, if a write
	 * request targeting the zone was dispatched. All three fields are
	 * initialized by the low level device driver (e.g. scsi/sd.c).
	 * Stacking drivers (device mappers) may or may not initialize
	 * these fields.
	 *
	 * Reads of this information must be protected with blk_queue_enter() /
	 * blk_queue_exit(). Modifying this information is only allowed while
	 * no requests are being processed. See also blk_mq_freeze_queue() and
	 * blk_mq_unfreeze_queue().
	 */
	unsigned int		nr_zones;
	unsigned long		*conv_zones_bitmap;
	unsigned long		*seq_zones_wlock;
#endif /* CONFIG_BLK_DEV_ZONED */

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
	struct mutex		blk_trace_mutex;
#endif
	/*
	 * for flush operations
	 */
	struct blk_flush_queue	*fq;

	struct list_head	requeue_list;
	spinlock_t		requeue_lock;
	struct delayed_work	requeue_work;

	struct mutex		sysfs_lock;
	struct mutex		sysfs_dir_lock;

	/*
	 * for reusing dead hctx instance in case of updating
	 * nr_hw_queues
	 */
	struct list_head	unused_hctx_list;
	spinlock_t		unused_hctx_lock;

	int			mq_freeze_depth;

#if defined(CONFIG_BLK_DEV_BSG)
	struct bsg_class_device bsg_dev;
#endif

#ifdef CONFIG_BLK_DEV_THROTTLING
	/* Throttle data */
	struct throtl_data *td;
#endif
	struct rcu_head		rcu_head;
	wait_queue_head_t	mq_freeze_wq;
	/*
	 * Protect concurrent access to q_usage_counter by
	 * percpu_ref_kill() and percpu_ref_reinit().
	 */
	struct mutex		mq_freeze_lock;
	struct percpu_ref	q_usage_counter;

	struct blk_mq_tag_set	*tag_set;
	struct list_head	tag_set_list;
	struct bio_set		bio_split;

#ifdef CONFIG_BLK_DEBUG_FS
	struct dentry		*debugfs_dir;
	struct dentry		*sched_debugfs_dir;
	struct dentry		*rqos_debugfs_dir;
#endif

	bool			mq_sysfs_init_done;

	size_t			cmd_size;

	struct work_struct	release_work;

	/* number of entries in the write_hints[] array below */
#define BLK_MAX_WRITE_HINTS	5
	u64			write_hints[BLK_MAX_WRITE_HINTS];
};
初始化请求队列
/**
 * @function: Initialize a request queue.
 * @parameter: 
 * 		rfn: pointer to the request handling function
 * 		lock: pointer to a spinlock
 * @return: 
 *     success: address of the allocated request_queue
 *     error: NULL
 * @note: Assigns an I/O scheduler to the request queue; intended for
 *        mechanical storage devices such as hard disk drives.
 *        NOTE(review): this is part of the legacy single-queue request API,
 *        which appears to have been removed from mainline in Linux 5.0 -
 *        confirm against the target kernel version.
 */
struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
删除请求队列
// block\blk-core.c
/**
 * @function: Delete a request queue.
 * @parameter: 
 * 		q: the request queue to delete
 * @return: none (the function is declared void)
 * @note: 
 */
void blk_cleanup_queue(struct request_queue *q)
分配请求队列并绑定制造请求函数
// block\blk-core.c
/**
 * @function: Allocate a request_queue.
 * @parameter: 
 * 		gfp_mask: memory allocation mask, see the related macro definitions
 * 		          in include/linux/gfp.h
 * @return: 
 *     success: a request_queue without I/O scheduling
 *     error: 
 * @note: Intended for non-mechanical storage devices that need no I/O
 *        scheduler, such as EMMC and SD cards.
 */
struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
// block\blk-settings.c

/**
 * @function: Bind a "make request" function to a request queue.
 * @parameter: 
 * 		q: the request queue to bind to
 * 		mfn: the "make request" function to bind
 * @return: none (the function is declared void)
 * @note: Intended for non-mechanical storage devices that need no I/O
 *        scheduler, such as EMMC and SD cards.
 */
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)

请求 request

// include/linux/blkdev.h
/*
 * A single request placed on a request queue.
 * This listing is an excerpt: each "//..." line marks members that were
 * left out of the quoted definition.
 */
struct request 
{
	//...
	unsigned int cmd_flags;		/* op and common flags */
	req_flags_t rq_flags;

	int tag;
	int internal_tag;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;
	struct bio *biotail;

	struct list_head queuelist;

	/*
	 * The hash is used inside the scheduler, and killed once the
	 * request reaches the dispatch list. The ipi_list is only used
	 * to queue the request for softirq completion, which is long
	 * after the request has been unhashed (and even removed from
	 * the dispatch list).
	 */
	union {
		struct hlist_node hash;	/* merge hash */
		struct list_head ipi_list;
	};

	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		struct bio_vec special_vec;
		void *completion_data;
		int error_count; /* for legacy drivers, don't use */
	};

	/*
	 * Three pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.  Flush requests are
	 * never put on the IO scheduler. So let the flush fields share
	 * space with the elevator data.
	 */
	union {
		struct {
			struct io_cq		*icq;
			void			*priv[2];
		} elv;

		struct {
			unsigned int		seq;
			struct list_head	list;
			rq_end_io_fn		*saved_end_io;
		} flush;
	};

	struct gendisk *rq_disk;
	struct hd_struct *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
	/* Time that the first bio started allocating this request. */
	u64 alloc_time_ns;
#endif
	/* Time that this request was allocated for this IO. */
	u64 start_time_ns;
	/* Time that I/O was submitted to the device. */
	u64 io_start_time_ns;

	/*
	 * rq sectors used for blk stats. It has the same value
	 * with blk_rq_sectors(rq), except that it never be zeroed
	 * by completion.
	 */
	unsigned short stats_sectors;

	/*
	 * Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;
	//...

	unsigned int extra_len;	/* length of alignment and padding */

	//...

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;
};
获取请求
/**
 * @function: Fetch the requests of a request_queue one at a time.
 * @parameter: 
 * 		q: the request_queue to fetch from
 * @return: 
 *     success: the next request to be processed in the request_queue
 *     error: NULL
 * @note: NOTE(review): this is part of the legacy single-queue request API,
 *        which appears to have been removed from mainline in Linux 5.0 -
 *        confirm against the target kernel version.
 */
struct request *blk_peek_request(struct request_queue *q)
开启请求
/**
 * @function: Start processing a request; called once the next request to
 *            be processed has been obtained.
 * @parameter: 
 * 		req: the request to start processing
 * @return: none (the function is declared void)
 * @note: 
 */
void blk_start_request(struct request *req)
一步到位处理请求
/*
 * Obtain the next request of @q and start it in a single call.
 * Returns the request that was started, or the NULL result of
 * blk_peek_request() unchanged when there is nothing to process.
 */
struct request *blk_fetch_request(struct request_queue *q)
{
	struct request *next = blk_peek_request(q);

	/* Nothing pending: hand the empty result straight back. */
	if (!next)
		return next;

	blk_start_request(next);
	return next;
}
其他和请求有关的函数
// Completes the given number of bytes of a request
blk_end_request();
// Completes all of the data of a request
blk_end_request_all();
// Completes the current chunk of a request
blk_end_request_cur();
// Completes a request up to the point where the next error occurs
blk_end_request_err();
// Same as blk_end_request(), but the queue lock must be held
__blk_end_request();
// Same as blk_end_request_all(), but the queue lock must be held
__blk_end_request_all();
// Same as blk_end_request_cur(), but the queue lock must be held
__blk_end_request_cur();
// Same as blk_end_request_err(), but the queue lock must be held
__blk_end_request_err();

bio 结构

// include/linux/blk_types.h


request_queue、 request 和 bio 之间的关系 :

外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传

  • 6
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

极客程序设计

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值