/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
* the page is written to, set_page_dirty (or set_page_dirty_lock, as
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
* If @nonblocking != NULL, __get_user_pages will not wait for disk IO
* or mmap_sem contention, and if waiting is needed to pin all pages,
* *@nonblocking will be set to 0. Further, if @gup_flags does not
* include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
* this case.
*
* A caller using such a combination of @nonblocking and @gup_flags
* must therefore hold the mmap_sem for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
long i = 0;
unsigned int page_mask;
struct vm_area_struct *vma = NULL;
if (!nr_pages)
return 0;
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE))
gup_flags |= FOLL_NUMA;
do {
struct page *page;
unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(mm, start)) {
int ret;
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
return i ? : ret;
page_mask = 0;
goto next_page;
}
if (!vma || check_vma_flags(vma, gup_flags))
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags);
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
cond_resched();
page = follow_page_mask(vma, start, foll_flags, &page_mask);
if (!page) {
int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking);
switch (ret) {
case 0:
goto retry;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
return i ? i : ret;
case -EBUSY:
return i;
case -ENOENT:
goto next_page;
}
BUG();
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
* struct page.
*/
goto next_page;
} else if (IS_ERR(page)) {
return i ? i : PTR_ERR(page);
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
return i;
}
The kernel-doc summary "__get_user_pages() - pin user pages in memory" says it directly: the function pins the physical pages so they can be neither swapped out nor migrated. The pin is simply an elevated reference count, taken with get_page():
static inline void get_page(struct page *page)
{
page = compound_head(page);
/*
* Getting a normal page or the head of a compound page
* requires to already have an elevated page->_refcount.
*/
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
page_ref_inc(page);
if (unlikely(is_zone_device_page(page)))
get_zone_device_page(page);
}
Reference: https://zhuanlan.zhihu.com/p/28742793
The pinning mechanism the Linux kernel provides is gup (see mm/gup.c), short for the get_user_pages family of functions. It pre-faults the requested range of a process's virtual address space, so that physical pages are allocated and the page tables populated, then calls get_page() on each page to raise its reference count. With the count elevated the page will not be freed, so a device can keep accessing the physical page.
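To make that contract concrete, here is a minimal sketch of the classic pin/use/release pattern. It is not taken from the article: pin_user_buffer and its parameters are hypothetical, and the five-argument get_user_pages() signature of roughly v4.9+ kernels is assumed.

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Hypothetical helper: pin the user range [uaddr, uaddr + len) for IO. */
static long pin_user_buffer(unsigned long uaddr, size_t len, bool write)
{
	unsigned long first = uaddr >> PAGE_SHIFT;
	unsigned long last = (uaddr + len - 1) >> PAGE_SHIFT;
	unsigned long nr = last - first + 1;
	unsigned int gup_flags = write ? FOLL_WRITE : 0;
	struct page **pages;
	long pinned, i;

	pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/*
	 * get_user_pages() operates on the current mm and expects the
	 * caller to hold mmap_sem for reading, as documented above.
	 */
	down_read(&current->mm->mmap_sem);
	pinned = get_user_pages(uaddr & PAGE_MASK, nr, gup_flags,
				pages, NULL);
	up_read(&current->mm->mmap_sem);
	if (pinned < 0)
		goto out_free;

	/* ... hand "pages" to the device and run the IO here ... */

	for (i = 0; i < pinned; i++) {
		if (write)
			set_page_dirty_lock(pages[i]); /* before put_page() */
		put_page(pages[i]);
	}
out_free:
	kfree(pages);
	return pinned;
}

Note the ordering the kernel-doc insists on: when the pages were written to, set_page_dirty_lock() must be called before the final put_page().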
Take V4L2 as a usage example:
/**
* vb2_create_framevec() - map virtual addresses to pfns
* @start: Virtual user address where we start mapping
* @length: Length of a range to map
* @write: Should we map for writing into the area
*
* This function allocates and fills in a vector with pfns corresponding to
* virtual address range passed in arguments. If pfns have corresponding pages,
* page references are also grabbed to pin pages in memory. The function
* returns pointer to the vector on success and error pointer in case of
* failure. Returned vector needs to be freed via vb2_destroy_pfnvec().
*/
struct frame_vector *vb2_create_framevec(unsigned long start,
unsigned long length,
bool write)
{
int ret;
unsigned long first, last;
unsigned long nr;
struct frame_vector *vec;
unsigned int flags = FOLL_FORCE;
if (write)
flags |= FOLL_WRITE;
first = start >> PAGE_SHIFT;
last = (start + length - 1) >> PAGE_SHIFT;
nr = last - first + 1;
vec = frame_vector_create(nr);
if (!vec)
return ERR_PTR(-ENOMEM);
ret = get_vaddr_frames(start & PAGE_MASK, nr, flags, vec);
if (ret < 0)
goto out_destroy;
/* We accept only complete set of PFNs */
if (ret != nr) {
ret = -EFAULT;
goto out_release;
}
return vec;
out_release:
put_vaddr_frames(vec);
out_destroy:
frame_vector_destroy(vec);
return ERR_PTR(ret);
}
EXPORT_SYMBOL(vb2_create_framevec);
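As a usage note, here is a hedged sketch of the pin/unpin pairing a videobuf2 memory backend (e.g. videobuf2-dma-sg) builds on top of this helper; the wrapper names are hypothetical:

#include <linux/err.h>
#include <media/videobuf2-memops.h>

/* Hypothetical wrappers around the videobuf2 userptr flow. */
static struct frame_vector *userptr_pin(unsigned long vaddr,
					unsigned long size, bool write)
{
	/* Takes page references through the GUP path shown above. */
	return vb2_create_framevec(vaddr, size, write);
}

static void userptr_unpin(struct frame_vector *vec)
{
	/* put_vaddr_frames() + frame_vector_destroy() under the hood. */
	vb2_destroy_framevec(vec);
}

For a normal vma the pinned struct page pointers are then available via frame_vector_pages(vec); for VM_IO/VM_PFNMAP vmas only pfns are recorded and no references are held.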
/**
* get_vaddr_frames() - map virtual addresses to pfns
* @start: starting user address
* @nr_frames: number of pages / pfns from start to map
* @gup_flags: flags modifying lookup behaviour
* @vec: structure which receives pages / pfns of the addresses mapped.
* It should have space for at least nr_frames entries.
*
* This function maps virtual addresses from @start and fills @vec structure
* with page frame numbers or page pointers to corresponding pages (choice
* depends on the type of the vma underlying the virtual address). If @start
* belongs to a normal vma, the function grabs reference to each of the pages
* to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
* touch page structures and the caller must make sure pfns aren't reused for
* anything else while he is using them.
*
* The function returns number of pages mapped which may be less than
* @nr_frames. In particular we stop mapping if there are more vmas of
* different type underlying the specified range of virtual addresses.
* When the function isn't able to map a single page, it returns error.
*
* This function takes care of grabbing mmap_sem as necessary.
*/
int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
unsigned int gup_flags, struct frame_vector *vec)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int ret = 0;
int err;
int locked;
if (nr_frames == 0)
return 0;
if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
nr_frames = vec->nr_allocated;
down_read(&mm->mmap_sem);
locked = 1;
vma = find_vma_intersection(mm, start, start + 1);
if (!vma) {
ret = -EFAULT;
goto out;
}
/*
* While get_vaddr_frames() could be used for transient (kernel
* controlled lifetime) pinning of memory pages all current
* users establish long term (userspace controlled lifetime)
* page pinning. Treat get_vaddr_frames() like
* get_user_pages_longterm() and disallow it for filesystem-dax
* mappings.
*/
if (vma_is_fsdax(vma)) {
ret = -EOPNOTSUPP;
goto out;
}
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
ret = get_user_pages_locked(start, nr_frames,
gup_flags, (struct page **)(vec->ptrs), &locked);
goto out;
}
vec->got_ref = false;
vec->is_pfns = true;
do {
unsigned long *nums = frame_vector_pfns(vec);
while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
err = follow_pfn(vma, start, &nums[ret]);
if (err) {
if (ret == 0)
ret = err;
goto out;
}
start += PAGE_SIZE;
ret++;
}
/*
* We stop if we have enough pages or if VMA doesn't completely
* cover the tail page.
*/
if (ret >= nr_frames || start < vma->vm_end)
break;
vma = find_vma_intersection(mm, start, start + 1);
} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
out:
if (locked)
up_read(&mm->mmap_sem);
if (!ret)
ret = -EFAULT;
if (ret > 0)
vec->nr_frames = ret;
return ret;
}
EXPORT_SYMBOL(get_vaddr_frames);
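For completeness, a hedged sketch of driving get_vaddr_frames() directly; walk_user_frames is made up, and FOLL_FORCE | FOLL_WRITE mirrors the flags vb2_create_framevec() passes for a writable mapping:

#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/mm.h>

static int walk_user_frames(unsigned long start, unsigned int nr)
{
	struct frame_vector *vec;
	struct page **pages;
	unsigned int i;
	int ret;

	vec = frame_vector_create(nr);
	if (!vec)
		return -ENOMEM;

	ret = get_vaddr_frames(start & PAGE_MASK, nr,
			       FOLL_FORCE | FOLL_WRITE, vec);
	if (ret < 0)
		goto out_destroy;

	/*
	 * Converts stored pfns to struct page pointers if needed;
	 * returns ERR_PTR when the pfns have no backing struct page.
	 */
	pages = frame_vector_pages(vec);
	if (!IS_ERR(pages)) {
		for (i = 0; i < frame_vector_count(vec); i++)
			flush_dcache_page(pages[i]); /* stand-in for real use */
	}

	put_vaddr_frames(vec);	/* drop the references GUP took */
	ret = 0;
out_destroy:
	frame_vector_destroy(vec);
	return ret;
}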
/*
* We can leverage the VM_FAULT_RETRY functionality in the page fault
* paths better by using either get_user_pages_locked() or
* get_user_pages_unlocked().
*
* get_user_pages_locked() is suitable to replace the form:
*
* down_read(&mm->mmap_sem);
* do_something()
* get_user_pages(tsk, mm, ..., pages, NULL);
* up_read(&mm->mmap_sem);
*
* to:
*
* int locked = 1;
* down_read(&mm->mmap_sem);
* do_something()
* get_user_pages_locked(tsk, mm, ..., pages, &locked);
* if (locked)
* up_read(&mm->mmap_sem);
*/
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
int *locked)
{
return __get_user_pages_locked(current, current->mm, start, nr_pages,
pages, NULL, locked, true,
gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages_locked);
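Restating that comment as a compilable sketch (pin_with_retry is hypothetical; like get_user_pages_locked() itself it operates on the current process's mm):

#include <linux/mm.h>
#include <linux/sched.h>

static long pin_with_retry(unsigned long start, unsigned long nr,
			   struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	ret = get_user_pages_locked(start, nr, FOLL_WRITE, pages, &locked);
	if (locked)	/* the fault path may have dropped mmap_sem */
		up_read(&mm->mmap_sem);
	return ret;
}

The benefit is that when faultin_page() must sleep for disk IO, mmap_sem is released (VM_FAULT_RETRY) instead of being held across the IO, reducing contention on the semaphore.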
__get_user_pages_locked eventually calls down into __get_user_pages, which pins the pages V4L2 uses in the kernel. Pinned pages cannot be migrated, so DMA will not end up targeting a buffer whose backing pages have moved.