How a userspace virtual address is translated to a physical address within a process

An interesting function turned up in drivers/staging/tidspbridge/core/tiomap3430.c in the 3.9 kernel:

/*
 *  ======== user_va2_pa ========
 *  Purpose:
 *      This function walks through the page tables to convert a userland
 *      virtual address to physical address
 */
static u32 user_va2_pa(struct mm_struct *mm, u32 address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;

        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || pgd_bad(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || pud_bad(*pud))
                return 0;

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || pmd_bad(*pmd))
                return 0;

        ptep = pte_offset_map(pmd, address);
        if (ptep) {
                pte = *ptep;
                if (pte_present(pte))
                        return pte & PAGE_MASK;
        }

        return 0;
}

This function starts at the process's first-level page table (the PGD), walks down through each level of the page tables, and translates the given userspace virtual address into a physical address.
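Note that it returns the physical base address of the page (pte & PAGE_MASK), so a caller has to add back the offset within the page. Here is a minimal sketch of such a caller, assuming kernel context in the same file (the wrapper name current_user_pa is made up for illustration):

/* Hypothetical caller, for illustration only.  user_va2_pa() returns the
 * physical base of the page, so the in-page offset must be added back.
 * A real caller should hold mm->mmap_sem (or pin the pages with
 * get_user_pages()) while the walk is performed. */
static u32 current_user_pa(u32 uaddr)
{
        u32 pa = user_va2_pa(current->mm, uaddr);

        if (!pa)
                return 0;               /* not mapped or not present */

        return pa | (uaddr & ~PAGE_MASK);
}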

Analysis:

For 32-bit ARM:

arch/arm/include/asm/pgtable.h defines:

/* to find an entry in a page-table-directory */
#define pgd_index(addr)  ((addr) >> PGDIR_SHIFT)

#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))

while arch/arm/include/asm/pgtable-2level.h defines:

/*
 * PMD_SHIFT determines the size of the area a second-level page table can map
 * PGDIR_SHIFT determines what a third-level page table entry can map
 */
#define PMD_SHIFT  21
#define PGDIR_SHIFT  21

The ARM first-level page table has been reworked here: instead of the 20-bit shift the hardware mandates (one first-level entry per 1MB), a 21-bit shift is used, so each first-level entry actually covers 2MB.
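To make the 2MB granularity concrete, here is a small stand-alone sketch of the index arithmetic (the example address is arbitrary, chosen only for illustration):

#include <stdio.h>

/* Stand-alone illustration of pgd_index() with the ARM 2-level values
 * quoted above (PGDIR_SHIFT = 21, PTRS_PER_PGD = 2048). */
#define PGDIR_SHIFT     21
#define PTRS_PER_PGD    2048

int main(void)
{
        unsigned long addr = 0x40201234UL;              /* arbitrary user address */

        /* pgd_index(addr) == addr >> PGDIR_SHIFT */
        printf("pgd index      : %lu\n", addr >> PGDIR_SHIFT);          /* 513 */
        /* each first-level entry covers 1 << 21 bytes = 2MB ... */
        printf("entry coverage : %lu MB\n", (1UL << PGDIR_SHIFT) >> 20);        /* 2 */
        /* ... and 2048 entries * 2MB cover the whole 4GB address space */
        printf("table coverage : %lu MB\n",
               PTRS_PER_PGD * ((1UL << PGDIR_SHIFT) >> 20));                    /* 4096 */
        return 0;
}

The same header then sets the entry counts accordingly: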

#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048

Correspondingly, the first- and second-level tables now have 2048 and 512 entries instead of the original 4096 and 256. Judging from the header comment, the reason is that the ARM hardware page tables have no "dirty" or "young" (accessed) bits, so Linux-maintained page tables are kept alongside the hardware ones to give Linux full support. With a 4KB page size, one page holds exactly 1024 entries; it is split into two "half" pages of 512 entries each: the first half holds the Linux page-table entries and the second half holds the hardware page-table entries. That is why, when Linux populates an ARM page table, it first fills in the Linux entry, then adds a 2048-byte offset (half a page) and fills in the hardware entry. See also the comment in arch/arm/include/asm/pgtable-2level.h, quoted below, whose diagram shows the layout quite clearly (a small sketch of the resulting offset follows the excerpt):

/*
 * Hardware-wise, we have a two level page table structure, where the first
 * level has 4096 entries, and the second level has 256 entries.  Each entry
 * is one 32-bit word.  Most of the bits in the second level entry are used
 * by hardware, and there aren't any "accessed" and "dirty" bits.
 *
 * Linux on the other hand has a three level page table structure, which can
 * be wrapped to fit a two level page table structure easily - using the PGD
 * and PTE only.  However, Linux also expects one "PTE" table per page, and
 * at least a "dirty" bit.
 *
 * Therefore, we tweak the implementation slightly - we tell Linux that we
 * have 2048 entries in the first level, each of which is 8 bytes (iow, two
 * hardware pointers to the second level.)  The second level contains two
 * hardware PTE tables arranged contiguously, preceded by Linux versions
 * which contain the state information Linux needs.  We, therefore, end up
 * with 512 entries in the "PTE" level.
 *
 * This leads to the page tables having the following layout:
 *
 *    pgd             pte
 * |        |
 * +--------+
 * |        |       +------------+ +0
 * +- - - - +       | Linux pt 0 |
 * |        |       +------------+ +1024
 * +--------+ +0    | Linux pt 1 |
 * |        |-----> +------------+ +2048
 * +- - - - + +4    |  h/w pt 0  |
 * |        |-----> +------------+ +3072
 * +--------+ +8    |  h/w pt 1  |
 * |        |       +------------+ +4096
 *
 * See L_PTE_xxx below for definitions of bits in the "Linux pt", and
 * PTE_xxx for definitions of bits appearing in the "h/w pt".
 *
 * PMD_xxx definitions refer to bits in the first level page table.
 *
 * The "dirty" bit is emulated by only granting hardware write permission
 * iff the page is marked "writable" and "dirty" in the Linux PTE.  This
 * means that a write to a clean page will cause a permission fault, and
 * the Linux MM layer will mark the page dirty via handle_pte_fault().
 * For the hardware to notice the permission change, the TLB entry must
 * be flushed, and ptep_set_access_flags() does that for us.
 *
 * The "accessed" or "young" bit is emulated by a similar method; we only
 * allow accesses to the page if the "young" bit is set.  Accesses to the
 * page will cause a fault, and handle_pte_fault() will set the young bit
 * for us as long as the page is marked present in the corresponding Linux
 * PTE entry.  Again, ptep_set_access_flags() will ensure that the TLB is
 * up to date.
 *
 * However, when the "young" bit is cleared, we deny access to the page
 * by clearing the hardware PTE.  Currently Linux does not flush the TLB
 * for us in this case, which means the TLB will retain the transation
 * until either the TLB entry is evicted under pressure, or a context
 * switch which changes the user space mapping occurs.
 */
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048
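The offset this layout implies can be written down directly. The helper below is purely illustrative (it is not a kernel API, and it assumes 32-bit entries): given a pointer to a Linux PTE in the first half of the page, the matching hardware PTE sits exactly half a page later, which is what the cpu_v7_set_pte_ext routine quoted next relies on.

/* Illustration only -- not a real kernel helper.  With the 3.x layout above,
 * the hardware PTE backing a Linux PTE lives half a page (2048 bytes, i.e.
 * PTRS_PER_PTE 32-bit entries) after it. */
static inline u32 *hw_pte_of(u32 *linux_ptep)
{
        return linux_ptep + PTRS_PER_PTE;       /* + 512 * 4 bytes = + 2048 bytes */
}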

 

Taking the Cortex-A9 as an example and tracing through create_mapping() (arch/arm/mm/mmu.c), the routine that ultimately writes the page-table entries is cpu_v7_set_pte_ext (a simplified C rendering of its logic follows the listing):

/*
 * cpu_v7_set_pte_ext(ptep, pte)
 *
 * Set a level 2 translation table entry.
 *
 * - ptep  - pointer to level 2 translation table entry
 *    (hardware version is stored at +2048 bytes)
 * - pte   - PTE value to store
 * - ext - value for extended PTE bits
 */
ENTRY(cpu_v7_set_pte_ext)
#ifdef CONFIG_MMU
 str r1, [r0]   @ linux version

 bic r3, r1, #0x000003f0
 bic r3, r3, #PTE_TYPE_MASK
 orr r3, r3, r2
 orr r3, r3, #PTE_EXT_AP0 | 2

 tst r1, #1 << 4
 orrne r3, r3, #PTE_EXT_TEX(1)

 eor r1, r1, #L_PTE_DIRTY
 tst r1, #L_PTE_RDONLY | L_PTE_DIRTY
 orrne r3, r3, #PTE_EXT_APX

 tst r1, #L_PTE_USER
 orrne r3, r3, #PTE_EXT_AP1
#ifdef CONFIG_CPU_USE_DOMAINS
 @ allow kernel read/write access to read-only user pages
 tstne r3, #PTE_EXT_APX
 bicne r3, r3, #PTE_EXT_APX | PTE_EXT_AP0
#endif

 tst r1, #L_PTE_XN
 orrne r3, r3, #PTE_EXT_XN

 tst r1, #L_PTE_YOUNG
 tstne r1, #L_PTE_PRESENT
 moveq r3, #0

 ARM( str r3, [r0, #2048]! )
 mcr p15, 0, r0, c7, c10, 1  @ flush_pte
#endif
 mov pc, lr
ENDPROC(cpu_v7_set_pte_ext)
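The interesting part is how the "dirty" and "young" emulation described in the header comment shows up here. Below is a rough, simplified C rendering of the decisions the assembly makes. It is illustration only: the variable names (linux_ptep, linux_pte, ext, hw_pte) are made up, it reuses the L_PTE_*/PTE_EXT_* macro names from the listing, and it omits the TEX and domain handling, so it is not compilable as-is.

/* Pseudo-C rendering of cpu_v7_set_pte_ext -- illustration only. */
linux_ptep[0] = linux_pte;                      /* str r1, [r0]: Linux view first */

hw_pte = (linux_pte & ~0x3f0 & ~PTE_TYPE_MASK)  /* clear the AP/TEX/APX and type fields */
         | ext | PTE_EXT_AP0 | 2;               /* AP0 plus the small-page type bits */

/* dirty emulation: grant hardware write permission only if the page is
 * writable AND already dirty; otherwise APX makes it read-only, so the
 * first write faults and handle_pte_fault() marks the page dirty */
if ((linux_pte & L_PTE_RDONLY) || !(linux_pte & L_PTE_DIRTY))
        hw_pte |= PTE_EXT_APX;

if (linux_pte & L_PTE_USER)
        hw_pte |= PTE_EXT_AP1;                  /* user-accessible */
if (linux_pte & L_PTE_XN)
        hw_pte |= PTE_EXT_XN;                   /* no-execute */

/* young emulation: a page that is not young (or not present) gets an empty
 * hardware PTE, so the first access faults and sets the young bit */
if (!(linux_pte & L_PTE_YOUNG) || !(linux_pte & L_PTE_PRESENT))
        hw_pte = 0;

linux_ptep[512] = hw_pte;                       /* str r3, [r0, #2048]! */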

The code above is from the linux-3.7 kernel. If you have the 2.6.11 sources at hand (note that this kernel version does not support the Cortex-A9), you will find that the Linux and hardware tables within a second-level page are placed the other way round compared with 3.7: the first half of the page holds the hardware tables and the second half the Linux tables. See pgtable.h (include/asm-arm/); a small sketch of the reversed offset follows the excerpt:

/*
 * Hardware-wise, we have a two level page table structure, where the first
 * level has 4096 entries, and the second level has 256 entries.  Each entry
 * is one 32-bit word.  Most of the bits in the second level entry are used
 * by hardware, and there aren't any "accessed" and "dirty" bits.
 *
 * Linux on the other hand has a three level page table structure, which can
 * be wrapped to fit a two level page table structure easily - using the PGD
 * and PTE only.  However, Linux also expects one "PTE" table per page, and
 * at least a "dirty" bit.
 *
 * Therefore, we tweak the implementation slightly - we tell Linux that we
 * have 2048 entries in the first level, each of which is 8 bytes (iow, two
 * hardware pointers to the second level.)  The second level contains two
 * hardware PTE tables arranged contiguously, followed by Linux versions
 * which contain the state information Linux needs.  We, therefore, end up
 * with 512 entries in the "PTE" level.
 *
 * This leads to the page tables having the following layout:
 *
 *    pgd             pte
 * |        |
 * +--------+ +0
 * |        |-----> +------------+ +0
 * +- - - - + +4    |  h/w pt 0  |
 * |        |-----> +------------+ +1024
 * +--------+ +8    |  h/w pt 1  |
 * |        |       +------------+ +2048
 * +- - - - +       | Linux pt 0 |
 * |        |       +------------+ +3072
 * +--------+       | Linux pt 1 |
 * |        |       +------------+ +4096
 *
 * See L_PTE_xxx below for definitions of bits in the "Linux pt", and
 * PTE_xxx for definitions of bits appearing in the "h/w pt".
 *
 * PMD_xxx definitions refer to bits in the first level page table.
 *
 * The "dirty" bit is emulated by only granting hardware write permission
 * iff the page is marked "writable" and "dirty" in the Linux PTE.  This
 * means that a write to a clean page will cause a permission fault, and
 * the Linux MM layer will mark the page dirty via handle_pte_fault().
 * For the hardware to notice the permission change, the TLB entry must
 * be flushed, and ptep_establish() does that for us.
 *
 * The "accessed" or "young" bit is emulated by a similar method; we only
 * allow accesses to the page if the "young" bit is set.  Accesses to the
 * page will cause a fault, and handle_pte_fault() will set the young bit
 * for us as long as the page is marked present in the corresponding Linux
 * PTE entry.  Again, ptep_establish() will ensure that the TLB is up to
 * date.
 *
 * However, when the "young" bit is cleared, we deny access to the page
 * by clearing the hardware PTE.  Currently Linux does not flush the TLB
 * for us in this case, which means the TLB will retain the transation
 * until either the TLB entry is evicted under pressure, or a context
 * switch which changes the user space mapping occurs.
 */
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048
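In this layout a Linux PTE pointer sits in the second half of the page, so the hardware entry lives half a page before it, the mirror image of the 3.x case. Again a purely illustrative helper, not a kernel API, assuming 32-bit entries:

/* Illustration only -- with the 2.6.11 layout above, the hardware PTE lives
 * half a page *before* the Linux PTE, which is why cpu_arm920_set_pte below
 * uses the post-indexed "str r1, [r0], #-2048". */
static inline u32 *hw_pte_of_2_6_11(u32 *linux_ptep)
{
        return linux_ptep - PTRS_PER_PTE;       /* - 512 * 4 bytes = - 2048 bytes */
}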

Taking the ARM920T (s3c2410) as an example, here is the cpu_arm920_set_pte function in proc-arm920.S (arch/arm/mm):

/*
 * cpu_arm920_set_pte(ptep, pte)
 *
 * Set a PTE and flush it out
 */
 .align 5
ENTRY(cpu_arm920_set_pte)
 str r1, [r0], #-2048  @ linux version

 eor r1, r1, #L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_WRITE | L_PTE_DIRTY

 bic r2, r1, #PTE_SMALL_AP_MASK
 bic r2, r2, #PTE_TYPE_MASK
 orr r2, r2, #PTE_TYPE_SMALL

 tst r1, #L_PTE_USER   @ User?
 orrne r2, r2, #PTE_SMALL_AP_URO_SRW

 tst r1, #L_PTE_WRITE | L_PTE_DIRTY @ Write and Dirty?
 orreq r2, r2, #PTE_SMALL_AP_UNO_SRW

 tst r1, #L_PTE_PRESENT | L_PTE_YOUNG @ Present and Young?
 movne r2, #0

#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
 eor r3, r2, #0x0a   @ C & small page?
 tst r3, #0x0b
 biceq r2, r2, #4
#endif
 str r2, [r0]   @ hardware version
 mov r0, r0
 mcr p15, 0, r0, c7, c10, 1  @ clean D entry
 mcr p15, 0, r0, c7, c10, 4  @ drain WB
 mov pc, lr

Quite interesting: the Linux entry is written first and the hardware entry lands 2048 bytes below it, exactly the opposite of cpu_v7_set_pte_ext's "str r3, [r0, #2048]!".

Further analysis to come...
