以Linux内核3.13版本为例,首先用户进程通过系统调用read()
进入内核,执行sys_read()
函数,该函数定义在文件linux/fs/read_write.c
中:
//linux/fs/read_write.c
/*
 * read() system call entry point.
 *
 * Resolves @fd to its file object, reads through vfs_read() starting at
 * the file's current position, and stores the updated position back when
 * the read did not fail.
 *
 * Returns the value produced by vfs_read(), or -EBADF when @fd does not
 * resolve to a file object.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd handle = fdget(fd);	/* file object behind the descriptor */
	ssize_t nread;
	loff_t offset;

	/* Nothing was looked up, so there is nothing to put back either. */
	if (!handle.file)
		return -EBADF;

	/* Work on a local copy of the file's current position. */
	offset = file_pos_read(handle.file);
	nread = vfs_read(handle.file, buf, count, &offset);

	/* Only a non-failing read is allowed to move the file position. */
	if (nread >= 0)
		file_pos_write(handle.file, offset);

	fdput(handle);	/* pairs with the fdget() above */
	return nread;
}
每个进程的进程控制块task_struct
中都有一个files_struct
结构体,它保存了进程所有打开的文件,以文件描述符fd为索引即可找到对应的file对象,file对象中也包含了文件当前位置的信息。
再来看vfs_read
函数,同样在文件linux/fs/read_write.c
中:
//linux/fs/read_write.c
/*
 * Read from @file into the user buffer @buf at position *@pos.
 *
 * Validation performed before any data is read:
 *   - FMODE_READ must be set in file->f_mode, else -EBADF;
 *   - the file must provide a read or an aio_read operation, else -EINVAL;
 *   - @buf must pass access_ok(VERIFY_WRITE, ...), else -EFAULT;
 *   - rw_verify_area() must not fail (its negative result is returned).
 *
 * Returns the result of the underlying read operation, or one of the
 * negative error codes above.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t res;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_op->read || file->f_op->aio_read))
		return -EINVAL;
	/* The read stores into the user buffer, so it must be writable. */
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	/* Per the original note, this validates the range against file locks. */
	res = rw_verify_area(READ, file, pos, count);
	if (res < 0)
		return res;

	/* A non-negative result replaces the requested byte count. */
	count = res;

	/* Prefer the file's own read; otherwise go through do_sync_read(). */
	res = file->f_op->read ? file->f_op->read(file, buf, count, pos)
			       : do_sync_read(file, buf, count, pos);

	if (res > 0) {
		/*
		 * Data was transferred: going by the helper names, raise the
		 * access notification and account the bytes to the current task.
		 */
		fsnotify_access(file);
		add_rchar(current, res);
	}
	/* Called for every attempted read, whether or not it transferred data. */
	inc_syscr(current);

	return res;
}
如果文件定义了read
函数,则调用文件自身的read
函数,否则调用do_sync_read()
函数。file->f_op
是从对应的inode->i_fop
而来,而inode->i_fop
是由对应的文件系统类型在生成这个inode时赋予的。file->f_op->read
对于磁盘文件系统来说通常就等同于do_sync_read()
,比如ext2文件系统。
来看一下do_sync_read()
函数:
//linux/fs/read_write.c
/*
 * Synchronous read implemented on top of the file's aio_read operation.
 *
 * Wraps the single user buffer (@buf, @len) in one iovec, issues the read
 * through filp->f_op->aio_read() starting at *@ppos, waits if the request
 * was only queued, and stores the resulting position back into *@ppos.
 *
 * Returns the result of aio_read(), or of wait_on_sync_kiocb() when the
 * request had been queued.
 */
ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	/*
	 * The kiocb describes the request: file object, position and byte
	 * count.  The iovec describes one user buffer segment: iov_base is
	 * its start and iov_len its length.  A read request may carry several
	 * non-contiguous segments, one iovec each; this path always passes
	 * exactly one (per the original note, sys_readv() may pass several).
	 */
	struct kiocb req;
	struct iovec seg = { .iov_base = buf, .iov_len = len };
	ssize_t res;

	/* Set up the synchronous control block for this file. */
	init_sync_kiocb(&req, filp);
	req.ki_pos = *ppos;
	req.ki_nbytes = len;

	/* Hand the request to the filesystem's asynchronous read operation. */
	res = filp->f_op->aio_read(&req, &seg, 1, req.ki_pos);

	/*
	 * -EIOCBQUEUED means the request is still queued, so wait for it.
	 * NOTE(review): the original note says the waiter sleeps in
	 * TASK_UNINTERRUPTIBLE until req.ki_ctx becomes valid -- confirm
	 * against wait_on_sync_kiocb().
	 */
	if (res == -EIOCBQUEUED)
		res = wait_on_sync_kiocb(&req);

	/* Report the position recorded in the control block to the caller. */
	*ppos = req.ki_pos;
	return res;
}
do_sync_read()
函数里继续调用了该文件的f_op->aio_read()
函数进行异步读操作;如果返回值为-EIOCBQUEUED(请求尚在队列中),还需要调用wait_on_sync_kiocb()
函数进行同步(即wait_on_sync_kiocb()
函数返回时数据已经准备好)。对于ext2文件系统,其f_op->aio_read()
函数指向通用的generic_file_aio_read()
。
来看一下generic_file_aio_read()
函数:
//linux/mm/filemap.c
ssize_t
generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *filp = iocb->ki_filp;
ssize_t retval;
unsigned long seg = 0;
size_t count;
loff_t *ppos = &iocb->ki_pos;
count = 0;
//逐段进行用户缓冲区的可写检查并返回iovec的数目nr_segs
retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
if (retval)
return retval;
/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
//direct IO,不经过页缓存。将页缓存中的数据与设备同步之后清除页缓存中的内容,然后再调用文件系统提供的address_space->direct_IO方法从设备读取数据。
if (filp->f_flags & O_DIRECT) {
loff_t size;
struct address_space *mapping;
struct inode *inode;
mapping = filp->f_mapping;
inode = mapping->host;
if (!count)
goto out; /* skip atime */
size = i_size_read(inode);
if (pos < size) {
//将缓存内容写入设备
retval = filemap_write_and_wait_range(mapping, pos,
pos + iov_length(iov, nr_segs) - 1);
if (!retval) {
//调用文件系统提供的address_space->direct_IO方法从设备读取数据
retval = mapping->a_ops->direct_IO(READ, iocb,
iov, pos, nr_segs);
}
if (retval > 0) {
*ppos = pos + retval;
count -= retval;
}
/*
* Btrfs can have a short DIO read if we encounter
* compressed extents, so if there was an error, or if
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
* the rest of the read.
*/
if (retval < 0 || !count || *ppos >= size) {
file_accessed(filp);
goto out;
}
}
}
count = retval;
//对于每个iovec数组都转化为一个read_descriptor_t对象并调用do_generic_file_read函数进行处理
for (seg = 0; seg < nr_segs; seg++) {
read_descriptor_t desc;
loff_t offset = 0;
/*
* If we did a short DIO read we need to skip the section of the
* iov that we've already read data into.
*/
if (count) {
if (count > iov[seg].iov_len) {
count -= iov[seg].iov_len;
continue;
}
offset = count;
count = 0;
}