Linux read系统调用

1 read系统调用流程

本文内核版本:4.1.15
文件系统:ext3

read()
	vfs_read()
		rw_verify_area()
		__vfs_read()
			new_sync_read()
				generic_file_read_iter()	
					do_generic_file_read()

2 调用函数分析

2.1 read()

系统调用 read() :SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)

/*
 * read(2) system call entry point.
 *
 * Resolves fd, reads through vfs_read() starting at the file's current
 * position, and stores the updated position back when the read did not fail.
 * Returns vfs_read()'s result, or -EBADF when fd does not resolve to a file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);						// resolve the descriptor number to a struct fd
	ssize_t ret = -EBADF;								// returned as-is when f.file is NULL

	if (f.file) {
		loff_t pos = file_pos_read(f.file);				// work on a local copy of the file position
		ret = vfs_read(f.file, buf, count, &pos);		// may update pos through the pointer
		if (ret >= 0)
			file_pos_write(f.file, pos);				// publish the new position only if the read did not fail
		fdput_pos(f);									// pairs with fdget_pos(); NOTE(review): presumably releases what fdget_pos() acquired rather than writing anything back - confirm in fs/file.c
	}
	return ret;
}

每个进程通过 files_struct 中的文件描述符表(fdtable)保存一个 struct file 指针数组,代表该进程打开的文件,所以以 fd 为下标就能找到对应的 struct file。fdget_pos() 把这个 struct file 指针和若干标志位一起封装成 struct fd 返回。

2.2 vfs_read()
/*
 * Validate a read request and hand it to __vfs_read().
 *
 * Returns the value of __vfs_read() on the success path, or a negative errno
 * when a check fails: -EBADF (FMODE_READ not set), -EINVAL (FMODE_CAN_READ
 * not set), -EFAULT (access_ok() rejected buf), or whatever negative value
 * rw_verify_area() returned.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))				// file was not opened for reading
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))			// file is not flagged as readable
		return -EINVAL;
	/* VERIFY_WRITE: a read stores data into the user buffer, so buf must be writable. */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);		// negative = error, otherwise the byte count to use
	if (ret >= 0) {
		count = ret;
		ret = __vfs_read(file, buf, count, pos);		// dispatch to the file's read method
		if (ret > 0) {
			/* Notification and byte accounting happen only when data was actually read. */
			fsnotify_access(file);
			add_rchar(current, ret);
		}
		inc_syscr(current);								// runs for every request that passed the checks, even if 0 bytes or an error came back
	}

	return ret;
}
2.3 __vfs_read()
/*
 * Dispatch a read to whichever read method the file's file_operations
 * provides. f_op->read takes precedence; otherwise f_op->read_iter is driven
 * through new_sync_read(). Returns -EINVAL when neither method is set.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	if (file->f_op->read)
		return file->f_op->read(file, buf, count, pos);	// e.g. tty_fops sets .read
	else if (file->f_op->read_iter)
		return new_sync_read(file, buf, count, pos);	// e.g. ext3_file_operations sets only .read_iter
	else
		return -EINVAL;
}

根据文件注册的 file_operations 调用相应的 read 方法:

  • 注册的 file_operations 有 read 方法,调用 read 方法,一般的设备文件会注册此类接口
  • 若 file_operations 有 read_iter 方法,调用 new_sync_read()。一般普通文件注册此类方法。

下面分别是 tty 设备与 ext3 文件系统注册的 file_operations:

/*
 * file_operations table for tty devices. It sets .read (tty_read) and no
 * .read_iter, so __vfs_read() takes its f_op->read branch for these files.
 */
static const struct file_operations tty_fops = {
	.llseek		= no_llseek,
	.read		= tty_read,
	.write		= tty_write,
	.poll		= tty_poll,
	.unlocked_ioctl	= tty_ioctl,
	.compat_ioctl	= tty_compat_ioctl,
	.open		= tty_open,
	.release	= tty_release,
	.fasync		= tty_fasync,
};

/*
 * file_operations table for ext3 regular files. It sets .read_iter
 * (generic_file_read_iter) but no .read, so __vfs_read() reaches it through
 * new_sync_read().
 */
const struct file_operations ext3_file_operations = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.unlocked_ioctl	= ext3_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext3_compat_ioctl,	/* only built when CONFIG_COMPAT is defined */
#endif
	.mmap		= generic_file_mmap,
	.open		= dquot_file_open,
	.release	= ext3_release_file,
	.fsync		= ext3_sync_file,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};
2.4 new_sync_read()
/*
 * Adapt a plain (buf, len, *ppos) read to the file's ->read_iter method.
 *
 * Wraps the user buffer in a single-element iovec, builds a kiocb positioned
 * at *ppos and an iov_iter over the iovec, calls f_op->read_iter, then copies
 * the kiocb's position back to *ppos. Returns what ->read_iter returned.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };	// one segment covering the whole user buffer
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;								// start reading at the caller's position
	iov_iter_init(&iter, READ, &iov, 1, len);			// 1 segment, len bytes in total

	ret = filp->f_op->read_iter(&kiocb, &iter);			// for ext3 this is generic_file_read_iter (see ext3_file_operations)
	BUG_ON(ret == -EIOCBQUEUED);						// this path never expects the request to come back as queued
	*ppos = kiocb.ki_pos;								// hand the position from the kiocb back to the caller
	return ret;
}

这里用 struct iovec 和 iov_iter 来描述用户缓冲区,这与散布读(scatter read)和聚集写(gather write)使用的是同一套数据结构,只不过此处的 iovec 数组只有一个元素。具体参见《UNIX 环境高级编程》14.6 节。

2.5 generic_file_read_iter()
/*
 * ->read_iter implementation used by ext3_file_operations.
 *
 * When IOCB_DIRECT is set in iocb->ki_flags, the read is first attempted via
 * the mapping's direct_IO operation. Otherwise, or when the direct read left
 * part of the request unserved without error or EOF, the remaining work goes
 * to do_generic_file_read().
 *
 * Returns the direct-I/O result when that path finishes the request, else the
 * value returned by do_generic_file_read(); 0 for a zero-length direct read.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;
	loff_t pos = *ppos;

	if (iocb->ki_flags & IOCB_DIRECT) {		// direct I/O requested: transfer via a_ops->direct_IO instead of the buffered path below
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		size_t count = iov_iter_count(iter);
		loff_t size;

		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		/* Write out and wait on the byte range [pos, pos + count - 1] before reading it directly. */
		retval = filemap_write_and_wait_range(mapping, pos,
					pos + count - 1);
		if (!retval) {
			/* direct_IO gets a copy; the caller's iter is advanced below by the bytes actually read. */
			struct iov_iter data = *iter;
			retval = mapping->a_ops->direct_IO(iocb, &data, pos);
		}

		if (retval > 0) {
			*ppos = pos + retval;
			iov_iter_advance(iter, retval);
		}

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
		    IS_DAX(inode)) {
			file_accessed(file);
			goto out;
		}
	}

	/* Buffered path; retval carries over whatever the direct read already returned (0 if none). */
	retval = do_generic_file_read(file, ppos, iter, retval);
out:
	return retval;
}

该函数分为两个部分:

  • 若是在直接 I/O 模式下打开,任何读写操作都将数据在用户态地址与磁盘间直接传送而不通过页高速缓存。这种情况暂时不分析。
  • 经过高速缓存读取文件,调用 do_generic_file_read()

do_generic_file_read() 函数是读文件的核心,同时函数也比较长,我们下篇文章再来分析。

  • 3
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值