generic_file_buffered_read: the core function of the Linux read path

Kernel: 5.9.0

Call flow

One call path into generic_file_buffered_read (triggered by cat-ing a file):

#0  ondemand_readahead (mapping=0xffff888005c61340, ra=0xffff888005927598, filp=0xffff888005927500, hit_readahead_marker=false, index=0, req_size=16) at mm/readahead.c:445
#1  0xffffffff812eeea1 in page_cache_sync_readahead (req_count=<optimized out>, index=<optimized out>, filp=<optimized out>, ra=<optimized out>, mapping=<optimized out>) at mm/readahead.c:585
#2  page_cache_sync_readahead (mapping=<optimized out>, ra=0xffff888005927598, filp=0xffff888005927500, index=<optimized out>, req_count=16) at mm/readahead.c:567
#3  0xffffffff812dcae7 in generic_file_buffered_read (iocb=0xffffc90000033cc0, iter=<optimized out>, written=0) at mm/filemap.c:2199
#4  0xffffffff812dd8ed in generic_file_read_iter (iocb=0xffffc90000033cc0, iter=0xffffc90000033c98) at mm/filemap.c:2507
#5  0xffffffff814c7fc9 in ext4_file_read_iter (to=<optimized out>, iocb=<optimized out>) at fs/ext4/file.c:131
#6  ext4_file_read_iter (iocb=0xffffc90000033cc0, to=0xffffc90000033c98) at fs/ext4/file.c:114
#7  0xffffffff81405c0f in call_read_iter (file=<optimized out>, iter=<optimized out>, kio=<optimized out>) at ./include/linux/fs.h:1876
#8  generic_file_splice_read (in=0xffff888005927500, ppos=0xffffc90000033da8, pipe=<optimized out>, len=<optimized out>, flags=<optimized out>) at fs/splice.c:312
#9  0xffffffff81407b51 in do_splice_to (in=0xffff888005927500, ppos=0xffffc90000033da8, pipe=0xffff8880058fb6c0, len=65536, flags=<optimized out>) at fs/splice.c:890
#10 0xffffffff81407cab in splice_direct_to_actor (in=<optimized out>, sd=0xffffc90000033e00, actor=<optimized out>) at fs/splice.c:970
#11 0xffffffff81408012 in do_splice_direct (in=<optimized out>, ppos=0xffffc90000033ea8, out=0xffff888005927400, opos=0xffffc90000033eb0, len=16777216, flags=<optimized out>) at fs/splice.c:1079
#12 0xffffffff813ae9b1 in do_sendfile (out_fd=<optimized out>, in_fd=<optimized out>, ppos=0x0 <fixed_percpu_data>, count=<optimized out>, max=<optimized out>) at fs/read_write.c:1548
#13 0xffffffff813af30b in __do_sys_sendfile64 (count=<optimized out>, offset=<optimized out>, in_fd=<optimized out>, out_fd=<optimized out>) at fs/read_write.c:1609
#14 __se_sys_sendfile64 (count=<optimized out>, offset=<optimized out>, in_fd=<optimized out>, out_fd=<optimized out>) at fs/read_write.c:1595
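
The path above can be reproduced from user space with sendfile(2). A minimal sketch (file names are placeholders; it assumes both files are regular files on a local filesystem such as ext4):

#include <fcntl.h>
#include <stdio.h>
#include <sys/sendfile.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
	int in = open("input.dat", O_RDONLY);                 /* read side: goes through the page cache */
	int out = open("output.dat", O_WRONLY | O_CREAT, 0644);
	off_t off = 0;

	if (in < 0 || out < 0) {
		perror("open");
		return 1;
	}

	/* sendfile() enters __do_sys_sendfile64 -> do_splice_direct -> generic_file_splice_read
	 * -> generic_file_read_iter -> generic_file_buffered_read, as in the backtrace above. */
	ssize_t n = sendfile(out, in, &off, 16 * 1024 * 1024);
	if (n < 0)
		perror("sendfile");
	else
		printf("copied %zd bytes\n", n);

	close(in);
	close(out);
	return 0;
}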

Core logic of generic_file_buffered_read:

  • Try to find the page in the page cache (the address_space structure); on a hit, return the data.
  • If the page is not in the page cache, readpage reads the data from disk into the page cache.
  • Perform readahead on demand (not every read triggers it) to improve I/O performance.
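
To make the index/offset arithmetic near the top of the function concrete, a small worked example (assuming 4 KiB pages, i.e. PAGE_SHIFT = 12): a read of 10000 bytes starting at file offset 5000 gives

    /* *ppos = 5000, iter->count = 10000, PAGE_SIZE = 4096 */
    index      = 5000 >> 12;                       /* = 1,   first page touched      */
    offset     = 5000 & ~PAGE_MASK;                /* = 904, offset inside that page */
    last_index = (5000 + 10000 + 4096 - 1) >> 12;  /* = 4,   one past the last page  */

so pages 1, 2 and 3 are read, and copying starts 904 bytes into page 1.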

/**
 * generic_file_buffered_read - generic file read routine
 * @iocb:	the iocb to read
 * @iter:	data destination
 * @written:	already copied
 *
 * This is a generic file read routine, and uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * (As the next two lines admit, the kernel's own comment calls this logic really "ugly"...)
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 *
 * Return:
 * * total number of bytes copied, including those that were already @written
 * * negative error code if nothing was copied
 */

ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
    // readahead state; readahead will be covered in a separate article
	struct file_ra_state *ra = &filp->f_ra;

    // current read position, i.e. the byte offset within the file
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

    // the file is split into 4K pages; index is the page index
    //|----index 0----|----index 1----|----index 2----|
    //       4K             4K              4K
	index = *ppos >> PAGE_SHIFT;

    // position of the previous read, cached in the readahead state
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);

    // iter->count is the number of bytes to read; last_index is the page index just past the requested range
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
	offset = *ppos & ~PAGE_MASK;

    // loop until iter->count bytes (the amount the application asked for) have been read
	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		cond_resched();
find_page:
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}
        // look up index in the address_space pointed to by mapping to see whether the data is already cached
		page = find_get_page(mapping, index);
		if (!page) {
            // IOCB_NOIO means no disk I/O may be issued (the data must already be cached), so bail out
			if (iocb->ki_flags & IOCB_NOIO)
				goto would_block;
            // "synchronous" readahead; note it is not a true synchronous wait:
            // it essentially just submits an I/O request to the block layer and returns
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);

            // if there is still no page (sync readahead allocates pages without entering the
            // slow path, so it can fail), goto no_cached_page: allocate a page and issue readpage
			if (unlikely(page == NULL))
				goto no_cached_page;
		}

        // getting here means readahead worked: the page we want has already been read ahead
        // and carries the PG_readahead marker; see the readahead machinery for how PageReadahead gets set
		if (PageReadahead(page)) {
			if (iocb->ki_flags & IOCB_NOIO) {
				put_page(page);
				goto out;
			}
            // trigger one round of asynchronous readahead: the readahead marker was hit,
            // so read further ahead and update the ra state as well
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}

        // if the page is not yet valid (e.g. the system is I/O-bound and the read issued above has not completed), we have to wait here
		if (!PageUptodate(page)) {
			/*
			 * See comment in do_read_cache_page on why
			 * wait_on_page_locked is used to avoid unnecessarily
			 * serialisations and why it's safe.
			 */
            // IOCB_WAITQ indicates async I/O, e.g. io_uring
			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = wait_on_page_locked_async(page,
								iocb->ki_waitq);
			} else {
                // the caller asked not to wait, so return an error
				if (iocb->ki_flags & IOCB_NOWAIT) {
					put_page(page);
					goto would_block;
				}
                // block until the I/O completes; the "blocked on I/O" state shown in systrace is the process waiting here
				error = wait_on_page_locked_killable(page);
			}
			if (unlikely(error))
				goto readpage_error;

            // the wait above succeeded
			if (PageUptodate(page))
				goto page_ok;

			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iov_iter_is_pipe(iter)))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);
		end_index = (isize - 1) >> PAGE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_SIZE;
		if (index == end_index) {
			nr = ((isize - 1) & ~PAGE_MASK) + 1;
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}
		nr = nr - offset;

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */

        // copy the data to user space; iter->count is decreased by the number of bytes copied
		ret = copy_page_to_iter(page, offset, nr, iter);
		offset += ret;
		index += offset >> PAGE_SHIFT;
		offset &= ~PAGE_MASK;
		prev_offset = offset;

		put_page(page);
		written += ret;
        // if iter->count == 0 all requested data has been read; goto out to leave the loop
		if (!iov_iter_count(iter))
			goto out;
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		if (iocb->ki_flags & IOCB_WAITQ)
			error = lock_page_async(page, iocb->ki_waitq);
		else
			error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
        // low-level page read: call readpage to issue the disk I/O
		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
			unlock_page(page);
			put_page(page);
			goto would_block;
		}
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);

		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			if (iocb->ki_flags & IOCB_WAITQ)
				error = lock_page_async(page, iocb->ki_waitq);
			else
				error = lock_page_killable(page);

			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					put_page(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;

no_cached_page:
        // the address_space has no cached page: allocate one and insert it into the cache
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				error = 0;
				goto find_page;
			}
			goto out;
		}
		goto readpage;
	}

would_block:
	error = -EAGAIN;
out:
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	file_accessed(filp);
	return written ? written : error;
}

Key point: why does the code trigger sync_readahead first and async_readahead afterwards?

  • As the kernel comments themselves suggest, the two function names are easy to misread. From the caller's perspective both are "asynchronous": each merely submits I/O to the block layer and returns. The surrounding code confirms this: right after page_cache_sync_readahead the function calls find_get_page again and still handles the page == NULL case, which would be pointless if the "sync" variant actually waited for the data.
  • Under normal I/O conditions and a purely sequential read, page_cache_sync_readahead fires once to start readahead; subsequent sequential reads no longer trigger it and instead trigger page_cache_async_readahead. The condition for the async variant is that the page being read satisfies PageReadahead(page), i.e. the readahead marker page has been hit. It works like the sliding window in a network protocol stack: once the request reaches the readahead window, a new round of readahead is triggered to bring in more pages ahead of it.

If the above is still unclear, refer to my later article on the readahead algorithm; the condensed sketch below summarizes the pattern.
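
The caller-side pattern condensed from generic_file_buffered_read above, followed by an illustrative sequence (the window sizes are made up for the example and are not the kernel's actual defaults):

	page = find_get_page(mapping, index);
	if (!page) {
		/* miss: "sync" readahead only submits I/O to the block layer and returns */
		page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
		page = find_get_page(mapping, index);
	}
	if (page && PageReadahead(page)) {
		/* we hit the PG_readahead marker page: extend the window asynchronously */
		page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index);
	}

	/*
	 * Illustrative sequential read with an 8-page window whose marker sits on
	 * the 5th page of each new window:
	 *
	 *   read page 0     -> miss -> sync readahead submits pages 0..7, marks page 4
	 *   read pages 1..3 -> plain cache hits, no readahead work
	 *   read page 4     -> hit + PageReadahead -> async readahead submits 8..15, marks page 12
	 *   read page 12    -> hit + PageReadahead -> async readahead submits 16..23, and so on
	 */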

When does wait_on_page_locked_killable return?

It keeps waiting until the I/O completes. When the I/O finishes, the block device notifies the CPU with an interrupt; the interrupt handler raises BLOCK_SOFTIRQ, and from the softirq handler the completion callback bio->bi_end_io (mpage_end_io for ext4) eventually runs. It unlocks the previously locked page (unlock_page) and calls SetPageUptodate to mark it up to date. The call stack:

#0  SetPageUptodate (page=<optimized out>) at ./include/linux/page-flags.h:542
#1  __read_end_io (bio=0xffff88800607be40) at fs/ext4/readpage.c:85
#2  0xffffffff8150ed00 in mpage_end_io (bio=0xffff88800607be40) at fs/ext4/readpage.c:183
#3  0xffffffff8168859f in bio_endio (bio=0xffff88800607be40) at block/bio.c:1449
#4  0xffffffff8168f9d7 in req_bio_endio (error=<optimized out>, nbytes=<optimized out>, bio=<optimized out>, rq=<optimized out>) at block/blk-core.c:259
#5  blk_update_request (req=0xffff8880062fe040, error=0 '\000', nr_bytes=131072) at block/blk-core.c:1577
#6  0xffffffff816a2d6a in blk_mq_end_request (rq=0xffff8880062fe040, error=<optimized out>) at ./include/linux/blkdev.h:976
#7  0xffffffff81b780c9 in virtblk_request_done (req=0xffff8880062fe040) at drivers/block/virtio_blk.c:171
#8  0xffffffff8169e6fb in blk_done_softirq (h=<optimized out>) at block/blk-mq.c:586
#9  0xffffffff826000d1 in __do_softirq () at kernel/softirq.c:298
#10 0xffffffff82400f82 in asm_call_on_stack () at arch/x86/entry/entry_64.S:708
#11 0xffffffff810ea498 in __run_on_irqstack (func=<optimized out>) at ./arch/x86/include/asm/irq_stack.h:26

The wait_on_page_locked_killable code:

static inline int wait_on_page_locked_killable(struct page *page)                                                                                                        
{
    if (!PageLocked(page))
        return 0;
    return wait_on_page_bit_killable(compound_head(page), PG_locked);
}

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)


int wait_on_page_bit_killable(struct page *page, int bit_nr)                                                                                                             
{
    wait_queue_head_t *q = page_waitqueue(page);
    return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
}

static inline int wait_on_page_bit_common(wait_queue_head_t *q,
        struct page *page, int bit_nr, int state, bool lock)
{
    struct wait_page_queue wait_page;
    wait_queue_entry_t *wait = &wait_page.wait;
    int ret = 0;

    init_wait(wait);
    wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
    wait->func = wake_page_function;
    wait_page.page = page;
    wait_page.bit_nr = bit_nr;

    for (;;) {
        spin_lock_irq(&q->lock);

        if (likely(list_empty(&wait->entry))) {
            __add_wait_queue_entry_tail(q, wait);
            SetPageWaiters(page);
        }

        set_current_state(state);

        spin_unlock_irq(&q->lock);

        if (likely(test_bit(bit_nr, &page->flags))) {
            io_schedule();
        }

        if (lock) {
            if (!test_and_set_bit_lock(bit_nr, &page->flags))
                break;
        } else {
            if (!test_bit(bit_nr, &page->flags))
                break;
        }

        if (unlikely(signal_pending_state(state, current))) {
            ret = -EINTR;
            break;
        }
    }

    finish_wait(q, wait);

    /*
     * A signal could leave PageWaiters set. Clearing it here if
     * !waitqueue_active would be possible (by open-coding finish_wait),
     * but still fail to catch it in the case of wait hash collision. We
     * already can fail to clear wait hash collision cases, so don't
     * bother with signals either.
     */

    return ret;
}

Before the I/O completes, PG_locked stays set, so the code above enters io_schedule: the process voluntarily yields the CPU and its state becomes TASK_KILLABLE (which also includes TASK_UNINTERRUPTIBLE). io_schedule sets current->in_iowait = 1; while tasks are waiting on I/O this way, CPU idle time is accounted as iowait.
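
For reference, a condensed sketch of io_schedule and its iowait bookkeeping, paraphrased from kernel/sched/core.c (not a verbatim 5.9 excerpt; details vary slightly between versions):

int io_schedule_prepare(void)
{
	int old_iowait = current->in_iowait;

	current->in_iowait = 1;              /* this task now counts toward iowait */
	blk_schedule_flush_plug(current);    /* push any plugged block I/O to the device */

	return old_iowait;
}

void io_schedule_finish(int token)
{
	current->in_iowait = token;
}

void __sched io_schedule(void)
{
	int token;

	token = io_schedule_prepare();
	schedule();                          /* sleep until unlock_page() wakes us */
	io_schedule_finish(token);
}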

I/O completion unlocks the page and wakes the process

static void __read_end_io(struct bio *bio)
{
    struct page *page;
    struct bio_vec *bv;
    struct bvec_iter_all iter_all;

    bio_for_each_segment_all(bv, bio, iter_all) {
        page = bv->bv_page;

        /* PG_error was set if any post_read step failed */
        if (bio->bi_status || PageError(page)) {
            ClearPageUptodate(page);
            /* will re-read again later */
            ClearPageError(page);
        } else {
            // mark the page up to date
            SetPageUptodate(page);
        }
        // unlock the page; this wakes up any waiting threads
        unlock_page(page);                                                                                                                                               
    }
    if (bio->bi_private)
        mempool_free(bio->bi_private, bio_post_read_ctx_pool);
    bio_put(bio);
}

/**
 * unlock_page - unlock a locked page
 * @page: the page
 *
 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
 * Also wakes sleepers in wait_on_page_writeback() because the wakeup
 * mechanism between PageLocked pages and PageWriteback pages is shared.
 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
 *
 * Note that this depends on PG_waiters being the sign bit in the byte
 * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
 * clear the PG_locked bit and test PG_waiters at the same time fairly
 * portably (architectures that do LL/SC can test any bit, while x86 can
 * test the sign bit).
 */
void unlock_page(struct page *page)
{
	BUILD_BUG_ON(PG_waiters != 7);
	page = compound_head(page);
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
		wake_up_page_bit(page, PG_locked);
}
EXPORT_SYMBOL(unlock_page);

Where is the mpage_end_io callback assigned? Taking an ext4 file read as an example:

fs/ext4/readpage.c : ext4_mpage_readpages

int ext4_mpage_readpages(struct address_space *mapping,
			 struct list_head *pages, struct page *page,
			 unsigned nr_pages)
{
    ...
	for (; nr_pages; nr_pages--) {
        ...
		bio_set_dev(bio, bdev);
		bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
		bio->bi_end_io = mpage_end_io;
		bio->bi_private = ctx;
		ext4_set_bio_ctx(inode, bio);
		bio_set_op_attrs(bio, REQ_OP_READ, 0);
        ...
	}
	BUG_ON(pages && !list_empty(pages));
	if (bio)
		ext4_submit_bio_read(bio);
	return 0;
}
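
The general shape of this wiring, as a minimal sketch with hypothetical my_end_io()/my_submit_read() helpers (a simplified pattern, not the real ext4 code):

/* completion callback: runs from the BLOCK_SOFTIRQ path once the device finishes */
static void my_end_io(struct bio *bio)
{
	struct bio_vec *bv;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bv, bio, iter_all) {
		struct page *page = bv->bv_page;

		if (!bio->bi_status)
			SetPageUptodate(page);
		unlock_page(page);	/* wakes readers blocked in wait_on_page_bit_common() */
	}
	bio_put(bio);
}

static void my_submit_read(struct block_device *bdev, struct page *page,
			   sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio->bi_end_io = my_end_io;		/* called when the read completes */
	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	submit_bio(bio);
}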

