Virtual memory protection for user-space & kernel-space on ARMv7-a Architecture
Bit-dumping of ARMv7-a core-registers and control registers
Thanks to the great emulation software QEMU, we can now easily debug a running Linux kernel with gdb, stopping anywhere we desire and inspecting whatever confuses us. The GNU debugger has a command to fetch the contents of a given register, info register REGNAME
(see the GDB manual for more information), which is very handy for architectural kernel-hacking. However, getting a 32-bit integer from a register is sometimes not enough: we need to further inspect certain bits of the register and, to my limited knowledge, GDB does not have such functionality; what it does have is the ability to be extended with Python scripts. I have composed a simple Python script, adding a dump-reg
command for GDB, to bit-dump a given register:
When the Linux kernel has nothing important to do, it reschedules and runs the idle task, which eventually puts the processor into a low-power state. For ARMv7-a, a
WFI
instruction stops the processor from fetching new instructions to execute: the processor simply stops running. But by bit-dumping the CPSR register via the dump-reg
command, we find that bit[7] is set, which means IRQ interrupts are masked. How, then, can the processor wake up from the halt caused by the WFI
instruction? By searching the ARM Architecture Reference Manual
, we can find the answer:
Aha, thanks to the dump-reg
command provided by a simple Python script, we’ve now learned more about ARM. The complete content of the Python script is listed as follows:
#!/usr/bin/env python3
# For ARMv7-a platform, define a register dumping class
class dumpRegister(gdb.Command):
    """Fetch a general or special control register and bit-dump it

    Usage: dump-reg REGNAME [REGNAME ...]

    Every named register is read from the newest (innermost) frame, reduced
    to 32 bits and printed as eight nibbles between two bit-index rulers.
    """

    # The bit line is one blank followed by eight 4-character nibbles that
    # are separated by single blanks.  Both rulers are generated with the
    # same layout so that every label lines up with the bit it names:
    # the top ruler marks the lowest bit of each nibble (28, 24, ... 0),
    # the bottom ruler marks the highest bit of each nibble (31, 27, ... 3).
    _TOP_RULER = " " + " ".join(
        "{0:>4d}".format(bit) for bit in range(28, -1, -4))
    _BOTTOM_RULER = (" " + " ".join(
        "{0:<4d}".format(bit) for bit in range(31, 0, -4))).rstrip()

    def __init__(self):
        super().__init__("dump-reg", gdb.COMMAND_USER)

    def dumpReg(self, regname, regval, last):
        """Print one register as hex, decimal and a nibble-grouped bit string.

        regname -- name echoed in the heading line
        regval  -- register content as a Python integer (may be negative)
        last    -- when false, a dashed separator line follows the dump

        Always returns True.
        """
        # convert register value to 32-bit unsigned value
        regvalu = regval & 0xFFFFFFFF
        # most significant bit first, split into eight groups of four bits
        bits = "{0:032b}".format(regvalu)
        nibbles = [bits[pos:pos + 4] for pos in range(0, 32, 4)]
        print("Register {0}, {1:#010x} ({2:d}, {3:d}):".format(regname, regvalu, regvalu, regval))
        print(self._TOP_RULER)
        print(" " + " ".join(nibbles))
        print(self._BOTTOM_RULER)
        if not last:
            print("---------------------------------------------------")
        return True

    def invoke(self, args, from_tty):
        """Entry point called by GDB with the raw argument string.

        Returns False when no register name was given, True otherwise.
        """
        # split the argument string on whitespace; never yields empty items
        argv = args.split()
        if not argv:
            print("Usage: dump-reg REGNAME [REGNAME ...]")
            return False
        # Read from frame #0 without touching the frame the user selected.
        curFrame = gdb.newest_frame()
        # signed integer type; dumpReg() masks the value down to 32 bits
        intType = gdb.lookup_type('int')
        # Resolve every register first, so that the separator logic only
        # counts registers that are really going to be dumped.
        found = []
        for arg in argv:
            try:
                regval = curFrame.read_register(arg)
            except (ValueError, gdb.error) as errv:
                # unknown register name: report it and go on with the rest
                print("Error, cannot read register {0}: {1}".format(arg, errv))
                continue
            found.append((arg, int(regval.cast(intType))))
        for idx, (regname, regint) in enumerate(found, start=1):
            self.dumpReg(regname, regint, idx >= len(found))
        return True
# Instantiate the class once; its constructor registers the "dump-reg" command.
dumpRegister()
Simple user-space application to aid debugging of virtual memory access
As a naive embedded software developer, I’m very curious about how the Linux kernel keeps user-space and kernel-space memory separated and isolated. This mechanism is fundamental to constructing a modern OS kernel. I’ve written a simple application in C to help us debug the Linux kernel with QEMU
:
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
/* File descriptor number passed to close() in main() purely as a marker
 * that a kernel breakpoint can match on. */
#define MM_INVALID_FD 0x79656a71
/* Page size assumed by all address arithmetic below, in bytes. */
#define MY_PAGE_SIZE 4096
/* Number of pages in the read-only, read-write and no-access regions. */
#define RO_PAGE_NUM 1
#define RW_PAGE_NUM 16382
#define NA_PAGE_NUM 1
/* Total size: 16384 pages * 4096 bytes = 64MB */
#define PAGES_NUM (RO_PAGE_NUM + RW_PAGE_NUM + NA_PAGE_NUM)
/*
 * Copy the whole content of `mapFile` to standard output.
 *
 * The file is opened read-only, its descriptor is moved to the fixed
 * number 768 with dup2(), and the data is streamed to STDOUT_FILENO in
 * chunks of at most sizeof(mapBuffer) bytes.  Every failure is reported
 * on stderr; the function returns nothing.
 */
static void dump_map(const char * mapFile)
{
    int mfd, nfd;
    char mapBuffer[1024];

    mfd = open(mapFile, O_RDONLY | O_CLOEXEC);
    if (mfd < 0) {
        fprintf(stderr, "Error, failed to open %s: %s\n",
            mapFile, strerror(errno));
        fflush(stderr);
        return;
    }

    nfd = 768;
    if (nfd != dup2(mfd, nfd)) {
        fprintf(stderr, "Error, failed to duplicate file descriptor %d: %s\n",
            mfd, strerror(errno));
        fflush(stderr);
        close(mfd);
        return;
    }
    /* only the duplicate is needed from here on */
    if (mfd != nfd)
        close(mfd);

    for (;;) {
        ssize_t rl1, wl1;
        size_t off;

        rl1 = read(nfd, mapBuffer, sizeof(mapBuffer));
        if (rl1 < 0) {
            /* an interrupted read is not an error, simply retry */
            if (errno == EINTR)
                continue;
            fprintf(stderr, "Error, failed to read %s: %s\n",
                mapFile, strerror(errno));
            fflush(stderr);
            break;
        }
        if (rl1 == 0)
            break; /* end of file */

        /* write() may accept fewer bytes than requested: loop until the
         * whole chunk has been emitted */
        for (off = 0; off < (size_t) rl1; off += (size_t) wl1) {
            wl1 = write(STDOUT_FILENO, mapBuffer + off, (size_t) rl1 - off);
            if (wl1 < 0) {
                if (errno == EINTR) {
                    wl1 = 0; /* nothing written, retry the same offset */
                    continue;
                }
                fprintf(stderr, "Error, failed to write to stdout: %s\n",
                    strerror(errno));
                fflush(stderr);
                close(nfd);
                return;
            }
        }
    }
    close(nfd);
}
/*
 * Demo program: allocate a large heap block, carve page-aligned
 * read-only / read-write / no-access regions out of it, fill them, change
 * their protection with mprotect(), and dump /proc/<pid>/maps before and
 * after.  close(MM_INVALID_FD) is called at two points as a marker for a
 * kernel breakpoint.
 *
 * argc and argv are not used.
 * Exit status: 8 if malloc() fails, 9 if mprotect() fails, otherwise 10.
 */
int main(int argc, char *argv[])
{
    int ret;
    char mapfile[64];
    unsigned long memaddr;
    unsigned char * mptr, * mro, * mrw, * mna, * mend;
    /* allocate memory, 64MB + 8192 bytes: two spare pages leave room for
     * rounding the start up to a page boundary and for one page at mend */
    mptr = (unsigned char *) malloc((PAGES_NUM + 2) * MY_PAGE_SIZE);
    if (mptr == NULL)
        exit(8);
    snprintf(mapfile, sizeof(mapfile), "/proc/%ld/maps", (long) getpid());
    fprintf(stdout, "Allocated memory pointer: %p, PID: %ld\n", mptr, (long) getpid());
    /* determine the page-aligned address: round up to the next page
     * boundary unless mptr is already aligned */
    memaddr = (unsigned long) mptr;
    if (memaddr & (MY_PAGE_SIZE - 1))
        memaddr = MY_PAGE_SIZE + (memaddr & ~(MY_PAGE_SIZE - 1));
    /* set the read-only/read-write/no-access pointers; the regions are
     * laid out back to back and mend is the first byte after the last one */
    mro = (unsigned char *) memaddr;
    mrw = (unsigned char *) (memaddr + RO_PAGE_NUM * MY_PAGE_SIZE);
    mna = (unsigned char *) (memaddr + (RO_PAGE_NUM + RW_PAGE_NUM) * MY_PAGE_SIZE);
    mend = (unsigned char *) (memaddr + PAGES_NUM * MY_PAGE_SIZE);
    fprintf(stdout, "User-Space pointers, RO: %p, RW: %p, NA: %p, END: %p\n",
        mro, mrw, mna, mend);
    fflush(stdout);
    dump_map(mapfile);
    usleep(500000); /* delay 0.5 second to ensure that above message has been emitted */
    close(MM_INVALID_FD); /* close a specific "invalid" file-descriptor to aid kernel breakpoint */
    /* fill the memory regions, starting with the page at mend, which is
     * still inside the malloc-ed block */
    memset(mend, 0xee, MY_PAGE_SIZE); /* will this trigger a page-fault ? */
    memset(mro, 0x5a, RO_PAGE_NUM * MY_PAGE_SIZE);
    memset(mrw, 0xa5, RW_PAGE_NUM * MY_PAGE_SIZE);
    memset(mna, 0x11, NA_PAGE_NUM * MY_PAGE_SIZE);
    /* set the memory attributes: mro becomes read-only, mna inaccessible;
     * mrw keeps the protection it already has */
    ret = mprotect(mro, RO_PAGE_NUM * MY_PAGE_SIZE, PROT_READ);
    if (ret == 0)
        ret = mprotect(mna, NA_PAGE_NUM * MY_PAGE_SIZE, PROT_NONE);
    if (ret != 0)
        exit(9);
    fputs("INFO: memory attributes have been set.\n", stdout);
    fflush(stdout);
    dump_map(mapfile);
    usleep(500000); /* same 0.5 second delay as above */
    close(MM_INVALID_FD); /* trigger a kernel breakpoint */
    _exit(10); /* terminate process unexpectedly */
}
When debugging the Linux kernel with QEMU
, we cannot easily insert a breakpoint into user-space applications, so I decided to insert a breakpoint in the system call close
implementation, which can only be triggered by close(MM_INVALID_FD)
:
Upon the breakpoint’s triggering, the user-space application has following output:
The user-space application has malloc-ed more than 64MB of memory from the kernel, and printed the pointers to the regions that are going to be set as read-only, read-write and no-access. Let’s take the END
pointer, 0x76eab000, for instance. I’ve composed another simple Python script, which provides a dump-virt
command to inspect a given virtual address; from its output we can only infer that neither the kernel nor the application has access to 0x76eab000:
How so? I think Linux kernel only gives the application a piece of phony memory, which is actually not allocated. By adding a watchpoint and then continuing to run, we know that the application triggered a Page-Fault, proving that our conjecture is fairly correct. We can further justify our conjecture by disassembling user-space code memset
:
We need to continue until the application has finished setting the memory attributes of mro
/mna
. Dumping the user-space virtual addresses, we can verify the access permissions. Note that user-space applications run at the unprivileged level, PL0:
Kernel also has access to the user-space memory, but via different virtual addresses (not the virtual addresses used by applications), we can further dump the content of memset
-ed user-space memory:
How Linux kernel denies access of kernel memory from applications
Access permissions are usually 3 bits, indicating the accessibility of a piece of memory from different privilege levels. Applications run at PL0, the unprivileged level:
The above picture tells how the kernel keeps user-space applications from reading/writing kernel memory. There is another question to be answered. According to the AP bits, the user-space pointers should be accessible from the Linux kernel, but in fact they are not; why? Well, I think it is because the pointers are too low to be accessed; for example, pointer mro
is 0x72eab000, which is lower than 0x80000000, so GDB considers it an invalid kernel-space pointer, and does not allow us to access it:
Translation between virtual address and physical address
During the debugging session, there are many places where adding 0x20000000 to a physical address produces a virtual address. This is how the kernel transforms some addresses, but it is not always valid. For example, the last statement in function void check_and_switch_context
is cpu_switch_mm
, which transforms the pgd
virtual address to its corresponding physical address before calling a function pointer, actually by adding 0xe0000000 to the virtual address (which, modulo 2^32, is the same as subtracting 0x20000000):
After kernel compilation, the assembly code at offset 328 bytes is actually add r0, r0, #0x81000000
; but during kernel booting, the instruction gets modified as add r0, r0, #0xe0000000
. It is an ingenious design, and doubtless a brilliant kernel-hacking trick: indeed, the Linux kernel is a wonderland for software developers! During a kernel context switch, the new task’s PGD is written to the TTBR0
register, causing the kernel memory view to change to the new task’s memory Layout. All tasks, when running kernel code, share the same kernel memory.
Last but not least, here is the full listing of dumpVirtualMemory.py script, which defines dump-virt
command:
#!/usr/bin/env python3
# virtual memory dumper for ARMv7-a/Linux
# Created by xiaoqzye@qq,com
# 2020/09/05
import sys
class dumpVirtualMemory(gdb.Command):
    """Dump user-space & kernel-space virtual address for ARMv7-a/Linux on QEMU

    Usage: dump-virt ADDRESS [ADDRESS ...]

    Each ADDRESS is parsed with int(text, 0), so decimal, 0x.., 0o.. and
    0b.. notations are accepted.  For every address the short-descriptor
    translation table rooted at TTBR0 is walked and the descriptors, the
    resulting physical address and the AP/XN/PXN attributes are printed.
    """

    # Constant added to a physical address to obtain the kernel virtual
    # address through which the translation tables are read.
    # NOTE(review): presumably specific to the kernel/board this script was
    # written for -- confirm before using it on another configuration.
    PHYS_TO_VIRT_OFFSET = 0x20000000

    def __init__(self):
        super().__init__("dump-virt", gdb.COMMAND_USER)
        # privileged execute-never support; None until probed in invoke()
        self.PXN = None

    @staticmethod
    def _apBits(desc, ap2Bit, ap1Bit, ap0Bit):
        """Return AP[2], AP[1], AP[0] of `desc` as a string such as "011"."""
        return "".join(
            "1" if (desc & (0x1 << bit)) != 0 else "0"
            for bit in (ap2Bit, ap1Bit, ap0Bit))

    def dumpVirtPage(self, desc0, virtAddr, uintPtr):
        """Follow a first-level page-table descriptor to the second level.

        desc0    -- first-level descriptor whose low two bits are 0b01
        virtAddr -- virtual address being translated
        uintPtr  -- gdb pointer-to-unsigned type used to read the entry

        Returns False when the second-level descriptor is invalid,
        True after the translation has been printed.
        """
        index = (virtAddr >> 12) & 0xFF # the second-level table index
        pte = desc0 & 0xFFFFFC00 # physical address of PTE
        pte = pte + self.PHYS_TO_VIRT_OFFSET # transform the physical address to virtual address
        pte = pte & 0xFFFFFFFF # avoid 32-bit unsigned integer overflow
        pte += index * 0x4 # address of the indexed page table entry
        pte = pte.cast(uintPtr) # cast it as a pointer
        desc1 = pte.dereference() # get the second-level descriptor
        print("2nd descriptor for {0:#010x}: {1:#x}".format(int(virtAddr), int(desc1)))
        if (desc1 & 0x3) == 0:
            print("Error, virtual address is not accessible: {0:#x}".format(int(virtAddr)), file=sys.stderr)
            return False
        # for pages, PXN lives in bit[2] of the FIRST-level descriptor
        pxn = (desc0 & 0x4) != 0 if self.PXN else False
        # low bits 0b01 select a large (64KB) page, 0b1x a small (4KB) page
        isLargePage = (desc1 & 0x2) == 0
        if isLargePage:
            xn = (desc1 & (0x1 << 15)) != 0
            phyAddr = (desc1 & 0xFFFF0000) + (virtAddr & 0xFFFF)
        else:
            xn = (desc1 & 0x1) != 0
            phyAddr = (desc1 & 0xFFFFF000) + (virtAddr & 0xFFF)
        # extract the Access permission: AP[2] is bit 9, AP[1:0] are bits 5:4
        ap210 = self._apBits(desc1, 9, 5, 4)
        print("Physical--ADDR for {0:#010x}: {1:#x}, AP: {2}, XN: {3}, PXN: {4}".format(
            int(virtAddr), int(phyAddr), ap210, 0x1 if xn else 0x0, 0x1 if pxn else 0x0))
        return True

    def dumpVirtSection(self, desc1, virtAddr, uintPtr):
        """Decode a first-level section or supersection descriptor.

        desc1    -- first-level descriptor with bit[1] set
        virtAddr -- virtual address being translated
        uintPtr  -- unused, kept so both decoders share one signature

        Always returns True.
        """
        # determine whether the section is super or not
        isSuperSection = (desc1 & (0x1 << 18)) != 0
        # In a section descriptor XN is bit[4]; bit[0] is PXN.
        xn = (desc1 & (0x1 << 4)) != 0
        pxn = (desc1 & 0x1) != 0 if self.PXN else False
        if isSuperSection:
            phyAddr = (desc1 & 0xFF000000) + (virtAddr & 0xFFFFFF)
        else:
            phyAddr = (desc1 & 0xFFF00000) + (virtAddr & 0xFFFFF)
        # get the access permission: AP[2] is bit 15, AP[1:0] are bits 11:10
        ap210 = self._apBits(desc1, 15, 11, 10)
        print("Physical--ADDR for {0:#010x}: {1:#x}, AP: {2}, XN: {3}, PXN: {4}".format(
            int(virtAddr), int(phyAddr), ap210, 1 if xn else 0, 1 if pxn else 0))
        return True

    def dumpVirt(self, ttbr0, virtaddr, uinttype, uintptr):
        """Translate one virtual address starting from the first-level table.

        ttbr0    -- translation table base (TTBR0 with the low 14 bits cleared)
        virtaddr -- virtual address as a Python integer
        uinttype -- gdb unsigned integer type
        uintptr  -- gdb pointer to that type

        Returns False when the address is not mapped, True otherwise.
        """
        # cast virtual address to an unsigned 32-bit integer
        virtAddr = gdb.Value(virtaddr).cast(uinttype)
        # refer to Section B3.5 in the ARM Architecture Reference Manual
        index = (virtAddr >> 20) & 0xFFF # get the first-level table index
        pgdAddr = ttbr0 + (index * 0x4) # physical address of PGD
        pgdAddr = pgdAddr + self.PHYS_TO_VIRT_OFFSET # transform the physical address to virtual address
        pgdAddr = pgdAddr & 0xFFFFFFFF # avoid 32-bit unsigned integer overflow
        pgdAddr = pgdAddr.cast(uintptr) # cast it to a pointer
        desc0 = pgdAddr.dereference() # get the first-level descriptor
        print("1st descriptor for {0:#010x}: {1:#x}".format(int(virtaddr), int(desc0)))
        kind = desc0 & 0x3
        if kind == 0:
            print("Error, virtual address is not accessible: {0:#x}".format(int(virtaddr)), file=sys.stderr)
            return False
        if kind == 1:
            return self.dumpVirtPage(desc0, virtaddr, uintptr)
        return self.dumpVirtSection(desc0, virtaddr, uintptr)

    def invoke(self, args, from_tty):
        """Entry point called by GDB with the raw argument string.

        Returns False when no argument was given or TTBCR is non-zero,
        True otherwise.
        """
        argv = args.split()
        if not argv:
            # No argument(s) given
            return False
        # create an unsigned integer type; 'uint' is not a C keyword, so it
        # has to be provided by the loaded symbols
        uintType = gdb.lookup_type('uint')
        # create a pointer to an unsigned integer
        uintPtr = uintType.pointer()
        # Fetch current frame, any frame is okay
        curFrm = gdb.newest_frame()
        # TTBCR register should be 0, because we can now only handle zero value
        regval = curFrm.read_register('TTBCR_S').cast(uintType)
        if regval != 0x0:
            print("Error, unsupported TTBCR register: {0:x}".format(int(regval)), file=sys.stderr)
            return False
        if self.PXN is None:
            # does the ARMv7-a SoC support PXN ?
            regval = curFrm.read_register('ID_MMFR0_S').cast(uintType)
            self.PXN = (regval & 0xF) >= 0x4
        # Get TTBR0 register
        regval = curFrm.read_register('TTBR0_EL1_S').cast(uintType)
        # According to ARM Architecture Reference Manual, lower 14 bits should be cleared
        TTBR0 = regval & 0xFFFFC000
        for arg in argv:
            try:
                virtAddr = int(arg, 0)
            except ValueError as errv:
                print(errv, file=sys.stderr)
                continue
            try:
                self.dumpVirt(TTBR0, virtAddr, uintType, uintPtr)
            except gdb.MemoryError as errm:
                # a table that cannot be read must not abort the other addresses
                print("Error, cannot read translation table for {0:#x}: {1}".format(
                    virtAddr, errm), file=sys.stderr)
        return True
# Instantiate the class once; its constructor registers the "dump-virt" command.
dumpVirtualMemory()
Conclusion
I’ve successfully avoided discussing the MMU functions of ARMv7-a CPUs. The MMU is a piece of hardware, and there are interesting things to know about it that are difficult to talk about, so please refer to the ARM Architecture Reference Manual
available here if you want to know more.
- We can write Python scripts to enable GNU debugger to perform complex operations to aid our understanding of ARMv7-a and Linux kernel;
- User-space and kernel-space memory are in fact held together by the MMU, with Access Permissions set differently for different Privilege Levels; user-space applications run at PL0, the unprivileged level;
- The Linux kernel has a fast mechanism to transform some virtual addresses to physical addresses, simply by adding or subtracting a fixed value whose lower 24 bits are zero. The value is initially 0x81000000, but gets modified during kernel booting;
- We can now disassemble user-space code from the Linux kernel, by finding the corresponding kernel-space virtual addresses of code sections: this will make debugging the Linux kernel easier, Haha!