How a userspace virtual address is translated to a physical address within a process

An interesting function turned up in drivers/staging/tidspbridge/core/tiomap3430.c in the 3.9 kernel:

/*
 *  ======== user_va2_pa ========
 *  Purpose:
 *      This function walks through the page tables to convert a userland
 *      virtual address to physical address
 */
static u32 user_va2_pa(struct mm_struct *mm, u32 address)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *ptep, pte;

        pgd = pgd_offset(mm, address);
        if (pgd_none(*pgd) || pgd_bad(*pgd))
                return 0;

        pud = pud_offset(pgd, address);
        if (pud_none(*pud) || pud_bad(*pud))
                return 0;

        pmd = pmd_offset(pud, address);
        if (pmd_none(*pmd) || pmd_bad(*pmd))
                return 0;

        ptep = pte_offset_map(pmd, address);
        if (ptep) {
                pte = *ptep;
                if (pte_present(pte))
                        return pte & PAGE_MASK;
        }

        return 0;
}

This function starts at the process's first-level page table (the PGD), walks down through each level of the page tables, and translates the given userspace virtual address into a physical address.
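Note that it returns the physical base address of the page (pte & PAGE_MASK), so a caller has to add back the offset within the page. Here is a minimal sketch of such a caller, assuming kernel context in the same file (the wrapper name current_user_pa is made up for illustration):

/* Hypothetical caller, for illustration only.  user_va2_pa() returns the
 * physical base of the page, so the in-page offset must be added back.
 * A real caller should hold mm->mmap_sem (or pin the pages with
 * get_user_pages()) while the walk is performed. */
static u32 current_user_pa(u32 uaddr)
{
        u32 pa = user_va2_pa(current->mm, uaddr);

        if (!pa)
                return 0;               /* not mapped or not present */

        return pa | (uaddr & ~PAGE_MASK);
}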

Analysis:

For 32-bit ARM:

arch/arm/include/asm/pgtable.h defines:

/* to find an entry in a page-table-directory */
#define pgd_index(addr)  ((addr) >> PGDIR_SHIFT)

#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))

while arch/arm/include/asm/pgtable-2level.h defines:

/*
 * PMD_SHIFT determines the size of the area a second-level page table can map
 * PGDIR_SHIFT determines what a third-level page table entry can map
 */
#define PMD_SHIFT  21
#define PGDIR_SHIFT  21

The ARM first-level page table has been reworked here: instead of the 20-bit shift the hardware mandates (one first-level entry per 1MB), a 21-bit shift is used, so each first-level entry actually covers 2MB.
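To make the 2MB granularity concrete, here is a small stand-alone sketch of the index arithmetic (the example address is arbitrary, chosen only for illustration):

#include <stdio.h>

/* Stand-alone illustration of pgd_index() with the ARM 2-level values
 * quoted above (PGDIR_SHIFT = 21, PTRS_PER_PGD = 2048). */
#define PGDIR_SHIFT     21
#define PTRS_PER_PGD    2048

int main(void)
{
        unsigned long addr = 0x40201234UL;              /* arbitrary user address */

        /* pgd_index(addr) == addr >> PGDIR_SHIFT */
        printf("pgd index      : %lu\n", addr >> PGDIR_SHIFT);          /* 513 */
        /* each first-level entry covers 1 << 21 bytes = 2MB ... */
        printf("entry coverage : %lu MB\n", (1UL << PGDIR_SHIFT) >> 20);        /* 2 */
        /* ... and 2048 entries * 2MB cover the whole 4GB address space */
        printf("table coverage : %lu MB\n",
               PTRS_PER_PGD * ((1UL << PGDIR_SHIFT) >> 20));                    /* 4096 */
        return 0;
}

The same header then sets the entry counts accordingly: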

#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048

Correspondingly, the first- and second-level tables now have 2048 and 512 entries instead of the original 4096 and 256. Judging from the header comment, the reason is that the ARM hardware page tables have no "dirty" or "young" (accessed) bits, so Linux-maintained page tables are kept alongside the hardware ones to give Linux full support. With a 4KB page size, one page holds exactly 1024 entries; it is split into two "half" pages of 512 entries each: the first half holds the Linux page-table entries and the second half holds the hardware page-table entries. That is why, when Linux populates an ARM page table, it first fills in the Linux entry, then adds a 2048-byte offset (half a page) and fills in the hardware entry. See also the comment in arch/arm/include/asm/pgtable-2level.h, quoted below, whose diagram shows the layout quite clearly (a small sketch of the resulting offset follows the excerpt):

/*
 * Hardware-wise, we have a two level page table structure, where the first
 * level has 4096 entries, and the second level has 256 entries.  Each entry
 * is one 32-bit word.  Most of the bits in the second level entry are used
 * by hardware, and there aren't any "accessed" and "dirty" bits.
 *
 * Linux on the other hand has a three level page table structure, which can
 * be wrapped to fit a two level page table structure easily - using the PGD
 * and PTE only.  However, Linux also expects one "PTE" table per page, and
 * at least a "dirty" bit.
 *
 * Therefore, we tweak the implementation slightly - we tell Linux that we
 * have 2048 entries in the first level, each of which is 8 bytes (iow, two
 * hardware pointers to the second level.)  The second level contains two
 * hardware PTE tables arranged contiguously, preceded by Linux versions
 * which contain the state information Linux needs.  We, therefore, end up
 * with 512 entries in the "PTE" level.
 *
 * This leads to the page tables having the following layout:
 *
 *    pgd             pte
 * |        |
 * +--------+
 * |        |       +------------+ +0
 * +- - - - +       | Linux pt 0 |
 * |        |       +------------+ +1024
 * +--------+ +0    | Linux pt 1 |
 * |        |-----> +------------+ +2048
 * +- - - - + +4    |  h/w pt 0  |
 * |        |-----> +------------+ +3072
 * +--------+ +8    |  h/w pt 1  |
 * |        |       +------------+ +4096
 *
 * See L_PTE_xxx below for definitions of bits in the "Linux pt", and
 * PTE_xxx for definitions of bits appearing in the "h/w pt".
 *
 * PMD_xxx definitions refer to bits in the first level page table.
 *
 * The "dirty" bit is emulated by only granting hardware write permission
 * iff the page is marked "writable" and "dirty" in the Linux PTE.  This
 * means that a write to a clean page will cause a permission fault, and
 * the Linux MM layer will mark the page dirty via handle_pte_fault().
 * For the hardware to notice the permission change, the TLB entry must
 * be flushed, and ptep_set_access_flags() does that for us.
 *
 * The "accessed" or "young" bit is emulated by a similar method; we only
 * allow accesses to the page if the "young" bit is set.  Accesses to the
 * page will cause a fault, and handle_pte_fault() will set the young bit
 * for us as long as the page is marked present in the corresponding Linux
 * PTE entry.  Again, ptep_set_access_flags() will ensure that the TLB is
 * up to date.
 *
 * However, when the "young" bit is cleared, we deny access to the page
 * by clearing the hardware PTE.  Currently Linux does not flush the TLB
 * for us in this case, which means the TLB will retain the transation
 * until either the TLB entry is evicted under pressure, or a context
 * switch which changes the user space mapping occurs.
 */
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048
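The offset this layout implies can be written down directly. The helper below is purely illustrative (it is not a kernel API, and it assumes 32-bit entries): given a pointer to a Linux PTE in the first half of the page, the matching hardware PTE sits exactly half a page later, which is what the cpu_v7_set_pte_ext routine quoted next relies on.

/* Illustration only -- not a real kernel helper.  With the 3.x layout above,
 * the hardware PTE backing a Linux PTE lives half a page (2048 bytes, i.e.
 * PTRS_PER_PTE 32-bit entries) after it. */
static inline u32 *hw_pte_of(u32 *linux_ptep)
{
        return linux_ptep + PTRS_PER_PTE;       /* + 512 * 4 bytes = + 2048 bytes */
}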

 

Taking the Cortex-A9 as an example and tracing through create_mapping() (arch/arm/mm/mmu.c), the routine that ultimately writes the page-table entries is cpu_v7_set_pte_ext (a simplified C rendering of its logic follows the listing):

/*
 * cpu_v7_set_pte_ext(ptep, pte)
 *
 * Set a level 2 translation table entry.
 *
 * - ptep  - pointer to level 2 translation table entry
 *    (hardware version is stored at +2048 bytes)
 * - pte   - PTE value to store
 * - ext - value for extended PTE bits
 */
ENTRY(cpu_v7_set_pte_ext)
#ifdef CONFIG_MMU
 str r1, [r0]   @ linux version

 bic r3, r1, #0x000003f0
 bic r3, r3, #PTE_TYPE_MASK
 orr r3, r3, r2
 orr r3, r3, #PTE_EXT_AP0 | 2

 tst r1, #1 << 4
 orrne r3, r3, #PTE_EXT_TEX(1)

 eor r1, r1, #L_PTE_DIRTY
 tst r1, #L_PTE_RDONLY | L_PTE_DIRTY
 orrne r3, r3, #PTE_EXT_APX

 tst r1, #L_PTE_USER
 orrne r3, r3, #PTE_EXT_AP1
#ifdef CONFIG_CPU_USE_DOMAINS
 @ allow kernel read/write access to read-only user pages
 tstne r3, #PTE_EXT_APX
 bicne r3, r3, #PTE_EXT_APX | PTE_EXT_AP0
#endif

 tst r1, #L_PTE_XN
 orrne r3, r3, #PTE_EXT_XN

 tst r1, #L_PTE_YOUNG
 tstne r1, #L_PTE_PRESENT
 moveq r3, #0

 ARM( str r3, [r0, #2048]! )
 mcr p15, 0, r0, c7, c10, 1  @ flush_pte
#endif
 mov pc, lr
ENDPROC(cpu_v7_set_pte_ext)
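The interesting part is how the "dirty" and "young" emulation described in the header comment shows up here. Below is a rough, simplified C rendering of the decisions the assembly makes. It is illustration only: the variable names (linux_ptep, linux_pte, ext, hw_pte) are made up, it reuses the L_PTE_*/PTE_EXT_* macro names from the listing, and it omits the TEX and domain handling, so it is not compilable as-is.

/* Pseudo-C rendering of cpu_v7_set_pte_ext -- illustration only. */
linux_ptep[0] = linux_pte;                      /* str r1, [r0]: Linux view first */

hw_pte = (linux_pte & ~0x3f0 & ~PTE_TYPE_MASK)  /* clear the AP/TEX/APX and type fields */
         | ext | PTE_EXT_AP0 | 2;               /* AP0 plus the small-page type bits */

/* dirty emulation: grant hardware write permission only if the page is
 * writable AND already dirty; otherwise APX makes it read-only, so the
 * first write faults and handle_pte_fault() marks the page dirty */
if ((linux_pte & L_PTE_RDONLY) || !(linux_pte & L_PTE_DIRTY))
        hw_pte |= PTE_EXT_APX;

if (linux_pte & L_PTE_USER)
        hw_pte |= PTE_EXT_AP1;                  /* user-accessible */
if (linux_pte & L_PTE_XN)
        hw_pte |= PTE_EXT_XN;                   /* no-execute */

/* young emulation: a page that is not young (or not present) gets an empty
 * hardware PTE, so the first access faults and sets the young bit */
if (!(linux_pte & L_PTE_YOUNG) || !(linux_pte & L_PTE_PRESENT))
        hw_pte = 0;

linux_ptep[512] = hw_pte;                       /* str r3, [r0, #2048]! */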

The code above is from the linux-3.7 kernel. If you have the 2.6.11 sources at hand (note that this kernel version does not support the Cortex-A9), you will find that the Linux and hardware tables within a second-level page are placed the other way round compared with 3.7: the first half of the page holds the hardware tables and the second half the Linux tables. See pgtable.h (include/asm-arm/); a small sketch of the reversed offset follows the excerpt:

/*
 * Hardware-wise, we have a two level page table structure, where the first
 * level has 4096 entries, and the second level has 256 entries.  Each entry
 * is one 32-bit word.  Most of the bits in the second level entry are used
 * by hardware, and there aren't any "accessed" and "dirty" bits.
 *
 * Linux on the other hand has a three level page table structure, which can
 * be wrapped to fit a two level page table structure easily - using the PGD
 * and PTE only.  However, Linux also expects one "PTE" table per page, and
 * at least a "dirty" bit.
 *
 * Therefore, we tweak the implementation slightly - we tell Linux that we
 * have 2048 entries in the first level, each of which is 8 bytes (iow, two
 * hardware pointers to the second level.)  The second level contains two
 * hardware PTE tables arranged contiguously, followed by Linux versions
 * which contain the state information Linux needs.  We, therefore, end up
 * with 512 entries in the "PTE" level.
 *
 * This leads to the page tables having the following layout:
 *
 *    pgd             pte
 * |        |
 * +--------+ +0
 * |        |-----> +------------+ +0
 * +- - - - + +4    |  h/w pt 0  |
 * |        |-----> +------------+ +1024
 * +--------+ +8    |  h/w pt 1  |
 * |        |       +------------+ +2048
 * +- - - - +       | Linux pt 0 |
 * |        |       +------------+ +3072
 * +--------+       | Linux pt 1 |
 * |        |       +------------+ +4096
 *
 * See L_PTE_xxx below for definitions of bits in the "Linux pt", and
 * PTE_xxx for definitions of bits appearing in the "h/w pt".
 *
 * PMD_xxx definitions refer to bits in the first level page table.
 *
 * The "dirty" bit is emulated by only granting hardware write permission
 * iff the page is marked "writable" and "dirty" in the Linux PTE.  This
 * means that a write to a clean page will cause a permission fault, and
 * the Linux MM layer will mark the page dirty via handle_pte_fault().
 * For the hardware to notice the permission change, the TLB entry must
 * be flushed, and ptep_establish() does that for us.
 *
 * The "accessed" or "young" bit is emulated by a similar method; we only
 * allow accesses to the page if the "young" bit is set.  Accesses to the
 * page will cause a fault, and handle_pte_fault() will set the young bit
 * for us as long as the page is marked present in the corresponding Linux
 * PTE entry.  Again, ptep_establish() will ensure that the TLB is up to
 * date.
 *
 * However, when the "young" bit is cleared, we deny access to the page
 * by clearing the hardware PTE.  Currently Linux does not flush the TLB
 * for us in this case, which means the TLB will retain the transation
 * until either the TLB entry is evicted under pressure, or a context
 * switch which changes the user space mapping occurs.
 */
#define PTRS_PER_PTE  512
#define PTRS_PER_PMD  1
#define PTRS_PER_PGD  2048
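In this layout a Linux PTE pointer sits in the second half of the page, so the hardware entry lives half a page before it, the mirror image of the 3.x case. Again a purely illustrative helper, not a kernel API, assuming 32-bit entries:

/* Illustration only -- with the 2.6.11 layout above, the hardware PTE lives
 * half a page *before* the Linux PTE, which is why cpu_arm920_set_pte below
 * uses the post-indexed "str r1, [r0], #-2048". */
static inline u32 *hw_pte_of_2_6_11(u32 *linux_ptep)
{
        return linux_ptep - PTRS_PER_PTE;       /* - 512 * 4 bytes = - 2048 bytes */
}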

Taking the ARM920T (s3c2410) as an example, here is the cpu_arm920_set_pte function in proc-arm920.S (arch/arm/mm):

/*
 * cpu_arm920_set_pte(ptep, pte)
 *
 * Set a PTE and flush it out
 */
 .align 5
ENTRY(cpu_arm920_set_pte)
 str r1, [r0], #-2048  @ linux version

 eor r1, r1, #L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_WRITE | L_PTE_DIRTY

 bic r2, r1, #PTE_SMALL_AP_MASK
 bic r2, r2, #PTE_TYPE_MASK
 orr r2, r2, #PTE_TYPE_SMALL

 tst r1, #L_PTE_USER   @ User?
 orrne r2, r2, #PTE_SMALL_AP_URO_SRW

 tst r1, #L_PTE_WRITE | L_PTE_DIRTY @ Write and Dirty?
 orreq r2, r2, #PTE_SMALL_AP_UNO_SRW

 tst r1, #L_PTE_PRESENT | L_PTE_YOUNG @ Present and Young?
 movne r2, #0

#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
 eor r3, r2, #0x0a   @ C & small page?
 tst r3, #0x0b
 biceq r2, r2, #4
#endif
 str r2, [r0]   @ hardware version
 mov r0, r0
 mcr p15, 0, r0, c7, c10, 1  @ clean D entry
 mcr p15, 0, r0, c7, c10, 4  @ drain WB
 mov pc, lr

Quite interesting: the Linux entry is written first and the hardware entry lands 2048 bytes below it, exactly the opposite of cpu_v7_set_pte_ext's "str r3, [r0, #2048]!".

Further analysis to come...
