透明大页(transparent hugepages)和普通大页的区别是,透明大页 可以实时配置,不需要重启配置就可以生效,第二个是,利用khugepage thread 分配,不用在开机的时候预留,第三是透明大页 不再依赖hugetlbfs。透明大页映射的时机有两个。
要使用透明大页,就需要想之前一样通过通过 mount hugetlbfs。来实现,而是通过user space使用的时候调用fadvise()/madvise()的时候设定advise/within_size
第一个:handle_mm_fault->__handle_mm_fault
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
struct fault_env fe = {
.vma = vma,
.address = address,
.flags = flags,
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
fe.pmd = pmd_alloc(mm, pud, address);
if (!fe.pmd)
return VM_FAULT_OOM;
if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
int ret = create_huge_pmd(&fe);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *fe.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&fe, orig_pmd);
if ((fe.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&fe, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(&fe, orig_pmd);
return 0;
}
}
}
return handle_pte_fault(&fe);
}
在这个函数中就通过transparent_hugepage_enabled 来判断是否是能了透明大页,如果使能了透明大页,就调用create_huge_pmd 来申请pmd。
第二个:相对第一个是在缺页fault的时候发生,第二个则是有一个专门的守护进程在将4k page转成2M等大页.
这个thread执行的主体如下:
static int khugepaged(void *none)
{
struct mm_slot *mm_slot;
set_freezable();
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
khugepaged_do_scan();
khugepaged_wait_work();
}
spin_lock(&khugepaged_mm_lock);
mm_slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
if (mm_slot)
collect_mm_slot(mm_slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
khugepaged_do_scan->khugepaged_scan_mm_slot
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
VM_BUG_ON(!pages);
VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
else {
mm_slot = list_entry(khugepaged_scan.mm_head.next,
struct mm_slot, mm_node);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
mm = mm_slot->mm;
down_read(&mm->mmap_sem);
if (unlikely(khugepaged_test_exit(mm)))
vma = NULL;
else
vma = find_vma(mm, khugepaged_scan.address);
progress++;
for (; vma; vma = vma->vm_next) {
unsigned long hstart, hend;
cond_resched();
if (unlikely(khugepaged_test_exit(mm))) {
progress++;
break;
}
if (!hugepage_vma_check(vma)) {
skip:
progress++;
continue;
}
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart >= hend)
goto skip;
if (khugepaged_scan.address > hend)
goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
int ret;
cond_resched();
if (unlikely(khugepaged_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (shmem_file(vma->vm_file)) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
if (!shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
khugepaged_scan_shmem(mm, file->f_mapping,
pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
khugepaged_scan.address,
hpage);
}
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
if (ret)
/* we released mmap_sem so break loop */
goto breakouterloop_mmap_sem;
if (progress >= pages)
goto breakouterloop;
}
}
}
在这个函数中通过for循环通过hugepage_vma_check来检查是否要将4Kpage的页转成大页.
要使用透明大页,就需要想之前一样通过通过 mount hugetlbfs。来实现,而是通过user space使用的时候调用fadvise()/madvise()的时候设定advise/within_size
第一个:handle_mm_fault->__handle_mm_fault
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags)
{
struct fault_env fe = {
.vma = vma,
.address = address,
.flags = flags,
};
struct mm_struct *mm = vma->vm_mm;
pgd_t *pgd;
pud_t *pud;
pgd = pgd_offset(mm, address);
pud = pud_alloc(mm, pgd, address);
if (!pud)
return VM_FAULT_OOM;
fe.pmd = pmd_alloc(mm, pud, address);
if (!fe.pmd)
return VM_FAULT_OOM;
if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
int ret = create_huge_pmd(&fe);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
pmd_t orig_pmd = *fe.pmd;
int ret;
barrier();
if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
return do_huge_pmd_numa_page(&fe, orig_pmd);
if ((fe.flags & FAULT_FLAG_WRITE) &&
!pmd_write(orig_pmd)) {
ret = wp_huge_pmd(&fe, orig_pmd);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
huge_pmd_set_accessed(&fe, orig_pmd);
return 0;
}
}
}
return handle_pte_fault(&fe);
}
在这个函数中就通过transparent_hugepage_enabled 来判断是否是能了透明大页,如果使能了透明大页,就调用create_huge_pmd 来申请pmd。
第二个:相对第一个是在缺页fault的时候发生,第二个则是有一个专门的守护进程在将4k page转成2M等大页.
这个thread执行的主体如下:
static int khugepaged(void *none)
{
struct mm_slot *mm_slot;
set_freezable();
set_user_nice(current, MAX_NICE);
while (!kthread_should_stop()) {
khugepaged_do_scan();
khugepaged_wait_work();
}
spin_lock(&khugepaged_mm_lock);
mm_slot = khugepaged_scan.mm_slot;
khugepaged_scan.mm_slot = NULL;
if (mm_slot)
collect_mm_slot(mm_slot);
spin_unlock(&khugepaged_mm_lock);
return 0;
}
khugepaged_do_scan->khugepaged_scan_mm_slot
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
__releases(&khugepaged_mm_lock)
__acquires(&khugepaged_mm_lock)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
struct vm_area_struct *vma;
int progress = 0;
VM_BUG_ON(!pages);
VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
else {
mm_slot = list_entry(khugepaged_scan.mm_head.next,
struct mm_slot, mm_node);
khugepaged_scan.address = 0;
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
mm = mm_slot->mm;
down_read(&mm->mmap_sem);
if (unlikely(khugepaged_test_exit(mm)))
vma = NULL;
else
vma = find_vma(mm, khugepaged_scan.address);
progress++;
for (; vma; vma = vma->vm_next) {
unsigned long hstart, hend;
cond_resched();
if (unlikely(khugepaged_test_exit(mm))) {
progress++;
break;
}
if (!hugepage_vma_check(vma)) {
skip:
progress++;
continue;
}
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart >= hend)
goto skip;
if (khugepaged_scan.address > hend)
goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
while (khugepaged_scan.address < hend) {
int ret;
cond_resched();
if (unlikely(khugepaged_test_exit(mm)))
goto breakouterloop;
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
if (shmem_file(vma->vm_file)) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
if (!shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
khugepaged_scan_shmem(mm, file->f_mapping,
pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
khugepaged_scan.address,
hpage);
}
/* move to next address */
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
if (ret)
/* we released mmap_sem so break loop */
goto breakouterloop_mmap_sem;
if (progress >= pages)
goto breakouterloop;
}
}
}
在这个函数中通过for循环通过hugepage_vma_check来检查是否要将4Kpage的页转成大页.