用户态程序经常需要动态分配一块内存,比如使用glibc的malloc。linux下malloc底层会调用操作系统提供的系统调用,如brk, mmap来跟操作系统动态申请内存。而brk, mmap只是获得了一块虚拟内存空间,当真正访问这些内存时,由于没有物理页,会发生缺页中断,此时操作系统负责寻找一个空闲的物理页面,并跟虚拟地址建立映射关系。
babyos2目前已经拥有了物理内存管理系统(buddy),一套中断异常处理流程,实现缺页处理的时机已经成熟。
我们不妨动态获取一个虚拟内存地址,并在用户态访问看看会发生什么。
/* Touch an address that has no physical page behind it, on purpose,
 * to trigger a page fault from user mode. */
void test_fault()
{
    unsigned char* fault_addr = (unsigned char*) 0x40000000;
    *fault_addr = 0;
}
/*
 * User-mode init process:
 *  - read %cs and append its hex representation to `buffer`, then print it
 *  - fork: the child prints "IC," forever; the parent first triggers a
 *    page fault via test_fault() and then prints "I," forever
 */
int main()
{
    /* read %cs so we can show which segment user code runs in */
    uint32 cs = 0xffffffff;
    __asm__ volatile("movl %%cs, %%eax" : "=a" (cs));

    /* find the end of the current contents of buffer */
    int i = 0;
    while (buffer[i] != '\0') {
        i++;
    }

    /* convert cs to hex; do-while guarantees at least one digit,
     * so cs == 0 prints "0" instead of nothing */
    char num[9] = {0};
    int j = 0;
    do {
        num[j++] = digits[cs % 16];
        cs /= 16;
    } while (cs != 0);

    /* digits were produced least-significant first: emit them reversed */
    while (j) {
        buffer[i++] = num[--j];
    }
    buffer[i++] = '\n';
    buffer[i] = '\0';   /* terminate explicitly instead of relying on zeroed BSS */
    print(buffer);

    // fork
    int32 ret = fork();
    if (ret == 0) {
        /* child: print "IC," forever */
        buffer2[0] = 'I';
        buffer2[1] = 'C';
        buffer2[2] = ',';
        buffer2[3] = '\0';
        while (1) {
            for (int i = 0; i < 100000000; i++) ;   /* crude busy-wait delay */
            print(buffer2);
        }
    }
    else {
        /* parent: test page fault, then print "I," forever */
        test_fault();
        buffer[0] = 'I';
        buffer[1] = ',';
        buffer[2] = '\0';
        while (1) {
            for (int i = 0; i < 100000000; i++) ;   /* crude busy-wait delay */
            print(buffer);
        }
    }
    return 0;
}
因为还没有malloc之类的内存分配函数,我们先指定一个特定地址,然后做访问试试看。我们让用户态进程init fork一个子进程,子进程一直打印IC,父进程调用test_fault去访问地址0x40000000.
可以发现,系统检测到发生了PAGE FAULT,并且cr2寄存器里存放的是发生缺页的地址0x40000000。由于尚未实现信号、exit之类让父进程退出的机制,父进程暂时一直halt(),而内核态进程idle一直在打印P,子进程一直在打印IC。
然后尝试去处理缺页。当发生缺页异常时,分配一个物理页,并建立映射关系:
/*
 * First, naive page fault handler: read the faulting address from cr2,
 * allocate one physical page and map it at the page-aligned address.
 * Returns 0 on success, (uint32)-1 when no physical page is available.
 */
uint32 vmm_t::do_page_fault(trap_frame_t* frame)
{
    /* cr2 holds the linear address that caused the fault */
    uint32 addr = 0xffffffff;
    __asm__ volatile("movl %%cr2, %%eax" : "=a" (addr));
    console()->kprintf(RED, "do_page_fault, addr: %x\n", addr);

    /* map the whole page containing the faulting address */
    addr = (addr & PAGE_MASK);

    void* mem = os()->get_mm()->alloc_pages(0);
    if (mem == NULL) {
        /* out of physical memory: report failure, caller will halt */
        return -1;
    }

    pde_t* pg_dir = os()->get_mm()->get_pg_dir();
    os()->get_mm()->map_pages(pg_dir, (void*) addr, VA2PA(mem), PAGE_SIZE, PTE_W | PTE_U);
    console()->kprintf(GREEN, "alloc and map pages\n");

    return 0;
}
/*
 * Common exception entry: dump diagnostic state, give page faults a
 * chance to be resolved, and halt the CPU for anything unrecoverable.
 */
void cpu_t::do_exception(trap_frame_t* frame)
{
    const uint32 trapno = frame->trapno;

    if (trapno > 0x10) {
        console()->kprintf(RED, "Error Interrupt: %x, RESERVED!\n", trapno);
    }
    else {
        console()->kprintf(RED, "Exception: %s\n", exception_msg[trapno]);
        console()->kprintf(RED, "current: %p, pid: %p\n", current, current->m_pid);
        console()->kprintf(RED, "errno: %x, eip: %x, cs: %x, esp: %x\n",
                           frame->err, frame->eip, frame->cs, frame->esp);

        /* a page fault may be resolvable: if the handler succeeds, the
         * faulting instruction is simply restarted on return */
        if (trapno == INT_PF && current->m_vmm.do_page_fault(frame) == 0) {
            return;
        }
    }

    /* unrecoverable: stop this CPU forever */
    while (1) {
        halt();
    }
}
可以发现,当我们在缺页异常处理中分配一个页,并与该虚拟地址建立映射后,进程能够继续执行,一直打印I。
但是这个处理方法过于粗暴,总不能随便来个地址发生缺页就分配物理页,并建立映射关系吧?于是我们引入vm_area,用来管理虚拟内存区间。
/*
* 2017-11-27
* guzhoudiaoke@126.com
*/
#ifndef _VM_H_
#define _VM_H_

#include "types.h"

/* size of the user part of the address space */
#define USER_VM_SIZE        (0xc0000000)
/* mmap without MAP_FIXED starts searching from here */
#define VM_UNMAPPED_BASE    (USER_VM_SIZE / 3)

/* protect flags */
#define PROT_NONE   0x0     /* page can not be accessed */
#define PROT_READ   0x1     /* page can be read */
#define PROT_WRITE  0x2     /* page can be written */
#define PROT_EXEC   0x4     /* page can be executed */

/* map flags */
#define MAP_FIXED   0x10    /* Interpret addr exactly */

class vmm_t;

/* one contiguous virtual memory region [m_start, m_end), kept in a
 * singly linked list sorted by address */
typedef struct vm_area_s {
    vmm_t*              m_vm_mm;        /* owning address space */
    uint32              m_start;        /* first address of the region */
    uint32              m_end;          /* one past the last address */
    uint32              m_page_prot;    /* PROT_* flags */
    uint32              m_flags;        /* MAP_* flags */
    struct vm_area_s*   m_next;         /* next region, by address */
} vm_area_t;

/* per-process virtual memory manager: tracks vm_area regions and
 * services page faults against them */
class vmm_t {
public:
    void init();

    /* create / destroy a mapping of len bytes at (or near) addr */
    uint32 do_mmap(uint32 addr, uint32 len, uint32 prot, uint32 flags);
    uint32 do_munmap(uint32 addr, uint32 len);

    /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
    vm_area_t* find_vma(uint32 addr);
    vm_area_t* find_vma(uint32 addr, vm_area_t*& prev);

    uint32 insert_vma(vm_area_t* vma);
    void remove_vma(vm_area_t* vma, vm_area_t* prev);
    uint32 get_unmapped_area(uint32 len);

    uint32 do_page_fault(trap_frame_t* frame);

private:
    vm_area_t* m_mmap;      /* head of the address-sorted vm_area list */

    /* region boundaries reserved for exec/brk/stack support (unused yet) */
    uint32 m_code_start, m_code_end;
    uint32 m_data_start, m_data_end;
    uint32 m_brk_start, m_brk;
    uint32 m_stack_start;
    uint32 m_arg_start, m_arg_end;
    uint32 m_env_start, m_env_end;
};

#endif
vm_area_t 有一个start,一个end用于表示一个虚拟内存范围,及prot,flags用于标记,next用于建立链接。
vmm_t 中m_mmap是一个头指针,是一个vm_area_t链表的头。后面一堆start, end暂时先不用。
/*
* 2017-11-27
* guzhoudiaoke@126.com
*/
#include "babyos.h"
#include "vm.h"
#include "mm.h"
#include "x86.h"
void vmm_t::init()
{
m_mmap = NULL;
}
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
vm_area_t* vmm_t::find_vma(uint32 addr)
{
    for (vm_area_t* area = m_mmap; area != NULL; area = area->m_next) {
        if (addr < area->m_end) {
            return area;
        }
    }
    return NULL;
}
/* Same lookup, but also reports the VMA immediately before the match
 * through `prev` (NULL when the match is the head or nothing matches). */
vm_area_t* vmm_t::find_vma(uint32 addr, vm_area_t*& prev)
{
    prev = NULL;
    for (vm_area_t* area = m_mmap; area != NULL; prev = area, area = area->m_next) {
        if (addr < area->m_end) {
            return area;
        }
    }
    return NULL;
}
目前babyos2只用链表维护vm_area_t,所以查找比较简单,就是线性查找。
/*
* for now, the mmap should:
* 1) if MAP_FIXED, addr should align with PAGE_SIZE
* 2) [addr, addr+len] not intersect with a vma already exist
*/
/*
 * for now, the mmap should:
 * 1) if MAP_FIXED, addr should align with PAGE_SIZE
 * 2) [addr, addr+len] not intersect with a vma already exist
 *
 * Returns the start address of the new region, or (uint32)-1 on error.
 */
uint32 vmm_t::do_mmap(uint32 addr, uint32 len, uint32 prot, uint32 flags)
{
    console()->kprintf(YELLOW, "do_mmap: 0x%x, 0x%x, 0x%x, 0x%x\n", addr, len, prot, flags);

    /* make len align with PAGE_SIZE */
    len = PAGE_ALIGN(len);

    /* len is 0, nothing to do */
    if (len == 0) {
        return addr;
    }

    /* out of range (third test also catches addr+len overflow) */
    if (len > USER_VM_SIZE || addr > USER_VM_SIZE || addr > USER_VM_SIZE - len) {
        return -1;
    }

    /* if MAP_FIXED, the addr should align with PAGE_SIZE */
    if (flags & MAP_FIXED) {
        if (addr & ~PAGE_MASK) {
            return -1;
        }

        /* check [addr, addr+len] not in a vm_area */
        vm_area_t* p = find_vma(addr);
        if (p != NULL && addr + len > p->m_start) {
            return -1;
        }
    }
    else {
        addr = get_unmapped_area(len);
        if (addr == 0) {
            return -1;
        }
    }

    /* alloc a vma from pool */
    vm_area_t* vma = (vm_area_t*) os()->get_obj_pool(VMA_POOL)->alloc_from_pool();
    if (vma == NULL) {
        return -1;
    }

    /* setup vma: pool objects carry stale data, so set every field
     * (the original left m_vm_mm and m_next uninitialized) */
    vma->m_vm_mm = this;
    vma->m_start = addr;
    vma->m_end = addr + len;
    vma->m_flags = 0;
    vma->m_page_prot = prot;
    vma->m_next = NULL;

    /* insert vma into list, and do merge; return it to the pool on
     * failure so the object is not leaked */
    if (insert_vma(vma)) {
        os()->get_obj_pool(VMA_POOL)->free_object(vma);
        return -1;
    }

    return addr;
}
do_mmap会对len做页对齐,即长度一定是页大小的整数倍。
然后检测是否设置了MAP_FIXED标记,若设置了,表示需要按照指定的addr来分配虚拟内存区间,这时addr必须按页对齐,若未对齐,则简单返回错误;然后查找现有的vm_area,若找到一个区间addr + len > p->m_start则表示无法分配给定的地址[addr, addr+len],简单返回错误。
若未设置MAP_FIXED,则表示自动查找一块满足长度要求的空闲虚拟内存区间:
/*
 * Search for a free gap of len bytes, scanning the address-sorted vma
 * list upwards from VM_UNMAPPED_BASE. Returns 0 when nothing fits.
 */
uint32 vmm_t::get_unmapped_area(uint32 len)
{
    uint32 addr = VM_UNMAPPED_BASE;

    vm_area_t* vma = find_vma(addr);
    while (vma != NULL) {
        if (USER_VM_SIZE - len < addr) {
            return 0;
        }
        if (addr + len <= vma->m_start) {
            /* the gap before this vma is big enough */
            return addr;
        }
        /* skip past this vma and try the next gap */
        addr = vma->m_end;
        vma = vma->m_next;
    }

    /* past the last vma: the remaining space must still fit len.
     * Without this check a region running past USER_VM_SIZE could be
     * returned when the list is exhausted. */
    if (USER_VM_SIZE - len < addr) {
        return 0;
    }
    return addr;
}
该函数从VM_UNMAPPED_BASE开始一直往后找,找到一块不与现有vm_area重合的满足长度len的区间。
之后需要分配一个vm_area_t结构,因为没有实现kmalloc之类的内核态内存管理函数,要分配一个较小的结构,babyos2实现了一个简单的object pool:
/*
* 2017-11-27
* guzhoudiaoke@126.com
*/
#ifndef _POOL_H_
#define _POOL_H_

#include "types.h"

/* a free object is overlaid with this header and chained into the
 * pool's free list */
typedef struct object_pool_obj_s {
    struct object_pool_obj_s* m_next;
} object_pool_obj_t;

/*
 * Simple fixed-size object allocator: carves whole pages into
 * m_obj_size chunks and hands them out from a singly linked free list.
 */
class object_pool_t {
public:
    /* obj_size should be >= sizeof(object_pool_obj_t) so a free object
     * can hold the free-list link — TODO confirm all callers do this */
    void init(uint32 obj_size);
    void free_object(void* obj);
    void* alloc_from_pool();
    uint32 get_available();

private:
    uint32 m_obj_size;                  /* size of each object, in bytes */
    uint32 m_available;                 /* objects currently on the free list */
    object_pool_obj_t* m_free_list;     /* head of the free list */
};

#endif
/*
* 2017-11-27
* guzhoudiaoke@126.com
*/
#include "babyos.h"
#include "mm.h"
#include "pool.h"
/* Prepare an empty pool that hands out objects of obj_size bytes. */
void object_pool_t::init(uint32 obj_size)
{
    m_obj_size = obj_size;
    m_available = 0;
    m_free_list = NULL;
}
/*
 * Return an object to the pool by pushing it onto the free list.
 * The original branched on an empty list, but both branches reduced to
 * the same head insertion, so the redundant conditional is removed.
 */
void object_pool_t::free_object(void* obj)
{
    object_pool_obj_t* o = (object_pool_obj_t*) obj;
    /* head insertion works for both empty and non-empty lists */
    o->m_next = m_free_list;
    m_free_list = o;
    m_available++;
}
/*
 * Take one object from the pool, refilling from a fresh page when the
 * free list is empty. Returns NULL when no memory is available — the
 * original dereferenced a NULL free list if alloc_pages failed or if
 * m_obj_size was larger than a page.
 */
void* object_pool_t::alloc_from_pool()
{
    if (m_free_list == NULL) {
        /* refill: carve a whole page into m_obj_size chunks */
        uint8* mem = (uint8 *) os()->get_mm()->alloc_pages(0);
        if (mem == NULL) {
            return NULL;
        }
        uint8* end = mem + PAGE_SIZE;
        while (mem + m_obj_size <= end) {
            free_object(mem);
            mem += m_obj_size;
        }
        /* m_obj_size > PAGE_SIZE produces no objects at all */
        if (m_free_list == NULL) {
            return NULL;
        }
    }

    void* obj = m_free_list;
    m_free_list = m_free_list->m_next;
    m_available--;
    return obj;
}
/* Number of objects currently sitting on the free list. */
uint32 object_pool_t::get_available()
{
    return m_available;
}
当需要分配一个对象时,若已无空闲对象,则分配一页内存并拆分成对象链接到空闲链表中去。需要释放时则放回链表。
分配到vm_area_t之后,设置该结构的start, end及flags等,然后按顺序插入到链表中去:
/*
 * Insert vma into the address-sorted list, then merge it with its
 * neighbors when they are adjacent and share prot/flags.
 * Always returns 0 (callers treat non-zero as failure).
 */
uint32 vmm_t::insert_vma(vm_area_t* vma)
{
    /* find the insert position: p is the first vma entirely above the
     * new one, prev is its predecessor */
    vm_area_t* prev = NULL;
    vm_area_t* p = m_mmap;
    while (p != NULL) {
        if (p->m_start >= vma->m_end) {
            break;
        }
        prev = p;
        p = p->m_next;
    }

    /* link: prev -> vma -> p (vma becomes the head when prev is NULL) */
    vma->m_next = p;
    if (prev != NULL) {
        prev->m_next = vma;
    }
    else {
        m_mmap = vma;
    }

    /* merge prev and vma: they must touch exactly and be compatible */
    if (prev != NULL && prev->m_end == vma->m_start) {
        if (prev->m_page_prot == vma->m_page_prot && prev->m_flags == vma->m_flags) {
            prev->m_end = vma->m_end;
            prev->m_next = p;       /* unlink vma before freeing it */
            os()->get_obj_pool(VMA_POOL)->free_object(vma);
            vma = prev;             /* the surviving node may still merge with p */
        }
    }

    /* merge vma and p: same adjacency and compatibility rules */
    if (p != NULL && vma->m_end == p->m_start) {
        if (vma->m_page_prot == p->m_page_prot && vma->m_flags == p->m_flags) {
            vma->m_end = p->m_end;
            vma->m_next = p->m_next;
            os()->get_obj_pool(VMA_POOL)->free_object(p);
        }
    }

    return 0;
}
该函数首先找到插入的位置,然后将新的vm_area_t插入链表中,然后检测是否可以跟前面一个、后面一个vm_area_t合并,若可以则进行合并,并释放vm_area_t结构。
至此do_mmap就基本完成了。
do_munmap类似:
/*
 * Remove the range [addr, addr+len) from its vm_area. The range must
 * lie entirely inside one existing vma; removing its middle splits the
 * vma in two. Returns 0 on success, (uint32)-1 on error.
 */
uint32 vmm_t::do_munmap(uint32 addr, uint32 len)
{
    /* addr should align with PAGE_SIZE */
    if ((addr & ~PAGE_MASK) || addr > USER_VM_SIZE || len > USER_VM_SIZE-addr) {
        return -1;
    }

    /* len is 0, nothing to do */
    if ((len = PAGE_ALIGN(len)) == 0) {
        return 0;
    }

    /* find the vma, addr < vma->m_end */
    vm_area_t* prev = NULL;
    vm_area_t* vma = find_vma(addr, prev);
    if (vma == NULL) {
        return -1;
    }

    /* make sure m_start <= addr < addr+len <= m_end */
    if (addr < vma->m_start || addr+len > vma->m_end) {
        return -1;
    }

    /* alloc a new vma, because the vma may split to 2 vma, such as:
     * [start, addr, addr+len, end] => [start, addr], [addr+len, end] */
    vm_area_t* vma_new = (vm_area_t*) os()->get_obj_pool(VMA_POOL)->alloc_from_pool();
    if (vma_new == NULL) {
        return -1;
    }

    /* set up the new (second half) vma and link it after the first.
     * Copy owner/prot/flags too: pool objects contain stale data and the
     * original left these fields uninitialized on the split-off half. */
    vma_new->m_vm_mm = vma->m_vm_mm;
    vma_new->m_page_prot = vma->m_page_prot;
    vma_new->m_flags = vma->m_flags;
    vma_new->m_start = addr+len;
    vma_new->m_end = vma->m_end;
    vma->m_end = addr;
    vma_new->m_next = vma->m_next;
    vma->m_next = vma_new;

    /* check if first part need to remove */
    if (vma->m_start == vma->m_end) {
        remove_vma(vma, prev);
        vma = prev;
    }

    /* check if second part need to remove */
    if (vma_new->m_start == vma_new->m_end) {
        remove_vma(vma_new, vma);
    }

    /* need to unmap the physical page */
    //unmap_page_range(addr, len);
    return 0;
}
do_munmap的addr需要按页对齐,len也需要做对齐,然后查找addr所在的vm_area,因为删除一块区间可能把一个原有的区间分成两半,所以我们先分配一个新的vm_area_t结构,并设置相应的start, end等,然后链接到链表中去。
然后检查两半,若长度为0,则调用remove_vma从链表中删除:
/*
 * Unlink vma from the list (prev is its predecessor, NULL when vma is
 * the head) and return the node to the object pool.
 */
void vmm_t::remove_vma(vm_area_t* vma, vm_area_t* prev)
{
    vm_area_t** link = (prev != NULL) ? &prev->m_next : &m_mmap;
    *link = vma->m_next;
    os()->get_obj_pool(VMA_POOL)->free_object(vma);
}
当然,对于建立了物理地址映射的区间,需要释放物理页,这里暂时不管。
这样一个简单的虚拟内存管理的vm_area区间分配和释放就完成了,怎样测试一下呢,需要实现一个简单的系统调用,sys_mmap:
/*
 * System call entry for mmap. The arguments arrive in the trap frame
 * registers: ebx=addr, ecx=len, edx=prot, esi=flags.
 */
int32 sys_mmap(trap_frame_t* frame)
{
    return current->m_vmm.do_mmap(frame->ebx, frame->ecx,
                                  frame->edx, frame->esi);
}
然后在用户态进程中:
/*
 * User-mode mmap wrapper: issue int 0x80 with the syscall number in
 * eax and the arguments in ebx/ecx/edx/esi; the kernel's return value
 * comes back in eax.
 */
void *mmap(uint32 addr, uint32 len, uint32 prot, uint32 flags)
{
    uint32 ret = 0;
    __asm__ volatile("int $0x80" : "=a" (ret) : "a" (SYS_MMAP), "b" (addr), "c" (len), "d" (prot), "S" (flags));
    return (void*) ret;
}
/*
 * Exercise the page fault path twice: first through a mmap-ed region
 * (the fault should be resolved by mapping a page on demand), then
 * through an address no vma covers (expected to be a segment fault).
 */
void test_fault()
{
    unsigned char* mapped = (unsigned char*) mmap(0, 4096, 0, 0);
    mapped[0] = 0;

    unsigned char* unmapped = (unsigned char*) 0x50000000;
    unmapped[0] = 0;
}
这样当调用mmap时,内核态会分配一块虚拟内存区间,这样我们就可以在缺页中断中判断缺页的地址是否在一个已分配的虚拟地址区间内,若是,则分配物理页建立映射关系,否则报segment fault:
/*
 * Page fault handler: map a fresh physical page on demand when the
 * faulting address lies inside a known vm_area, otherwise report a
 * segment fault. Returns 0 when handled, (uint32)-1 when not.
 */
uint32 vmm_t::do_page_fault(trap_frame_t* frame)
{
    /* cr2 holds the linear address that caused the fault */
    uint32 addr = 0xffffffff;
    __asm__ volatile("movl %%cr2, %%eax" : "=a" (addr));
    console()->kprintf(RED, "do_page_fault, addr: %x\n", addr);

    addr = (addr & PAGE_MASK);

    /* the fault must fall inside an allocated region. find_vma only
     * guarantees addr < m_end, so m_start must be checked as well —
     * the original returned 0 for addresses in the gap below m_start
     * without mapping anything, re-faulting on the same address forever */
    vm_area_t* vma = find_vma(addr);
    if (vma == NULL || addr < vma->m_start) {
        /* error-code bit 2 set: the fault happened in user mode */
        if (frame->err & 0x4) {
            console()->kprintf(RED, "segment fault!\n");
        }
        return -1;
    }

    void* mem = os()->get_mm()->alloc_pages(0);
    if (mem == NULL) {
        /* out of physical memory */
        return -1;
    }
    pde_t* pg_dir = os()->get_mm()->get_pg_dir();
    os()->get_mm()->map_pages(pg_dir, (void*) addr, VA2PA(mem), PAGE_SIZE, PTE_W | PTE_U);
    console()->kprintf(GREEN, "alloc and map pages\n");

    return 0;
}
可以看到,对mmap得到的地址0x40000000,pagefault会alloc and map page,然后进程继续执行,但是之后访问随便给定的地址0x50000000,发生pagefault后,由于该地址不在一个已分配的虚拟内存区间,不会分配物理页并建立映射关系,所以该进程halt()。
这样babyos2就有了一个比较简单的缺页处理流程。后续需要实现的地方:
1)mmap需要管理内存访问权限,pagefault需要处理权限问题,比如写只读的页面;
2)fork需要拷贝页表及映射关系,目前的fork还过于简单,页表都是同一个。并且copy的时候需要将相应页设置为只读,当发生写时需要重新分配页、拷贝内容,并建立映射,即COW相关机制;
3)对于发生写保护的异常,需要检测是否是因为fork时设置成只读,COW导致的,则需要分配页并拷贝内容;
4)每个进程有自己的页表;
5)sys_exec时,读取elf文件,并加载数据段、代码段,并分别分配虚拟内存区间;
6)用户态栈需要有一个单独的vm_area, 位于用户地址空间的最高地址,并向下增长;
7)用户栈的扩展,当超过现有栈范围时发生缺页,扩展用户态栈;
8)sys_brk;
9)用户态简单的malloc函数;
看来即使做一个最最简单的虚拟内存管理系统也还任重而道远……