babyos2(13)—— process page table,fork, COW(copy on write)

尽管已经实现了分页,但目前babyos2各个进程使用的还是同一个页表,所以一旦一个进程mmap了一块虚拟地址空间,其他各个进程都是可以使用的,这是不合理的。我们先验证这个现象:

/* Base of the region returned by mmap() in test_fault(); visit_address() writes through it. */
unsigned *address;
/*
 * Write the values 0..9 into the first ten unsigned slots starting at
 * the global `address`. No check is made that `address` is mapped; the
 * test relies on the resulting page fault (or lack of one).
 */
inline void visit_address()
{
    unsigned *slot = address;
    unsigned value = 0;

    while (value < 10) {
        *slot++ = value;
        value++;
    }
}

/*
 * Request a 4096-byte mapping via mmap(), remember it in the global
 * `address`, then immediately touch it through visit_address().
 */
inline void test_fault()
{
    void *region = (void *) mmap(0, 4096, 0, 0);

    address = (unsigned *) region;
    visit_address();
}

/* Number of messages the child has printed so far; the child touches `address` on the 5th. */
unsigned time1 = 0;
/* Text the child prints right before it touches `address`. */
static char str_child[16] = "\nchild\n";

/*
 * Test program: shows that, while all processes share one page table, a
 * region mmap'ed by the parent after fork() is also reachable from the child.
 *
 * Relies on `buffer`, `buffer2`, `digits`, `print`, `fork` and `mmap`,
 * which are defined outside this snippet.
 */
int main()
{
    /* read the CS segment register */
    uint32 cs = 0xffffffff;
    __asm__ volatile("movl %%cs, %%eax" : "=a" (cs));

    /* find the end of the text already stored in buffer */
    int i = 0;
    while (buffer[i] != '\0') {
        i++;
    }

    /* convert cs to base 16, least significant digit first
     * (presumably `digits` is a hex digit table -- defined elsewhere) */
    char num[9] = {0};
    int j = 0;
    while (cs != 0) {
        num[j++] = digits[cs % 16];
        cs /= 16;
    }

    /* append the digits in reverse, i.e. most significant first, then print */
    while (j) {
        buffer[i++] = num[--j];
    }
    buffer[i++] = '\n';
    print(buffer);

    // fork
    int32 ret = fork();
    if (ret == 0) {
        /* child: periodically prints "IC," */
        buffer2[0] = 'I';
        buffer2[1] = 'C';
        buffer2[2] = ',';
        buffer2[3] = '\0';
        while (1) {
            /* busy-wait delay between prints */
            for (int i = 0; i < 100000000; i++) ;
            print(buffer2);

            /* on the 5th print, write through `address`; the child itself
             * never calls mmap, only the parent branch does (test_fault) */
            if (++time1 == 5) {
                print(str_child);
                visit_address();
            }
        }
    }
    else {
        /* parent: mmap a region and touch it right away */
        test_fault();

        /* then periodically print "I," */
        buffer[0] = 'I';
        buffer[1] = ',';
        buffer[2] = '\0';
        while (1) {
            /* busy-wait delay between prints */
            for (int i = 0; i < 100000000; i++) ;
            print(buffer);
        }
    }

    /* not reached: both branches loop forever */
    return 0;
}

目前babyos2还没有任何的库,甚至fork后栈都是无法正常使用的,所以写的代码比较low,目前只为测试,后面逐渐完善。
父进程开始执行后会mmap一块地址空间,然后访问,子进程会延迟几秒之后,也去访问这块地址,这在正常系统中,对非共享的内存肯定是不行的,但babyos2目前应该是可以的:
这里写图片描述

我们可以看见,子进程在打印“child”之后依然可以执行下去,即没有发生缺页而导致halt。

下面我们就解决这个问题,让每个进程拥有独立的页表:
1) fork copy 页表及vm_area

/*
 * Create a new process as a copy of `current`.
 *
 * frame: trap frame of the caller; it is copied to the top of the new
 *        process's kernel stack with eax forced to 0.
 *
 * Returns the new process (already linked into the process list just
 * before m_idle_process and marked runnable), or NULL when the process
 * block cannot be allocated or the address space cannot be copied.
 */
process_t* cpu_t::fork(trap_frame_t* frame)
{
    // alloc a process_t: an order-1 block, i.e. 2 pages, which also holds the kernel stack
    process_t* p = (process_t *)os()->get_mm()->alloc_pages(1);
    if (p == NULL) {
        return NULL;
    }
    *p = *current;

    // kstack: the child's trap frame sits at the very top of the 2-page block
    trap_frame_t* child_frame = ((trap_frame_t *) ((uint32(p) + PAGE_SIZE*2))) - 1;
    memcpy(child_frame, frame, sizeof(trap_frame_t));
    // the child sees 0 here (assumes eax carries the fork() return value -- confirm)
    child_frame->eax = 0;

    // vmm: give the child its own page directory/tables and vma list
    if (p->m_vmm.copy(current->m_vmm)) {
        // NOTE(review): whatever copy() allocated before failing is not reclaimed here
        os()->get_mm()->free_pages(p, 1);
        return NULL;
    }

    // context: the child resumes at ret_from_isr with its stack pointing at the copied frame
    p->m_context.esp = (uint32) child_frame;
    p->m_context.esp0 = (uint32) (child_frame + 1);
    p->m_context.eip = (uint32) ret_from_isr;

    // pid, need check if same with other process
    p->m_pid = m_next_pid++;

    // change state
    p->m_state = PROCESS_ST_RUNABLE;
    p->m_need_resched = 1;

    // link to list: insert right before the idle process in the circular list
    p->m_next = m_idle_process;
    p->m_prev = m_idle_process->m_prev;
    m_idle_process->m_prev->m_next = p;
    m_idle_process->m_prev = p;

    return p;
}

/*
 * Duplicate another address-space description into this one: first the
 * page directory/tables, then the vma list. The vma list is only copied
 * when the page-table copy succeeded.
 *
 * Returns 0 on success, (uint32) -1 if either step reports failure.
 */
uint32 vmm_t::copy(const vmm_t& vmm)
{
    const bool failed = copy_page_table(vmm.m_pg_dir) || copy_vma(vmm.m_mmap);

    return failed ? (uint32) -1 : 0;
}

/*
 * Build a private page directory for this vmm from `pg_dir`.
 *
 * The whole directory is copied first, so entries at or above
 * KERNEL_BASE/(4*MB) keep pointing at the same page tables as the source.
 * Every present entry below that index gets a freshly allocated page
 * table whose contents are copied from the source table; the pages
 * those tables map are not duplicated.
 *
 * Returns 0 on success, (uint32) -1 when a page cannot be allocated.
 */
uint32 vmm_t::copy_page_table(pde_t* pg_dir)
{
    pde_t* dir = (pde_t *) os()->get_mm()->alloc_pages(0);
    m_pg_dir = dir;
    if (dir == NULL) {
        return -1;
    }

    memcpy(m_pg_dir, pg_dir, PAGE_SIZE);

    for (uint32 slot = 0; slot < KERNEL_BASE/(4*MB); ++slot) {
        const pde_t src = pg_dir[slot];
        if (src & PTE_P) {
            pte_t* old_table = (pte_t *) PA2VA(src & PAGE_MASK);
            pte_t* fresh = (pte_t *) os()->get_mm()->alloc_pages(0);
            if (fresh == NULL) {
                return -1;
            }

            /* duplicate the table; keep the source entry's flag bits */
            memcpy(fresh, old_table, PAGE_SIZE);
            m_pg_dir[slot] = (VA2PA(fresh) | (src & (~PAGE_MASK)));
        }
    }

    return 0;
}

/*
 * Clone the singly linked vma list starting at `mmap` into this vmm,
 * preserving order. Each node comes from the VMA_POOL object pool.
 * m_mmap is only written once the first node has been cloned, so an
 * empty source list leaves m_mmap untouched.
 *
 * Returns 0 on success, (uint32) -1 when the pool cannot supply a node.
 */
uint32 vmm_t::copy_vma(vm_area_t* mmap)
{
    /* `link` always points at the pointer that must receive the next clone */
    vm_area_t** link = &m_mmap;

    for (vm_area_t* src = mmap; src != NULL; src = src->m_next) {
        vm_area_t* clone = (vm_area_t *) os()->get_obj_pool(VMA_POOL)->alloc_from_pool();
        if (clone == NULL) {
            return -1;
        }

        *clone = *src;
        clone->m_next = NULL;

        *link = clone;
        link = &clone->m_next;
    }

    return 0;
}

2) 调度时加载当前进程的页表

/*
 * Switch from the current process to the one that follows it in the
 * process list. Does nothing when the current process is its own
 * successor. Before the context switch, CR3 is loaded with the next
 * process's page directory.
 */
void cpu_t::schedule()
{
    /* sanity check: `current` is expected to be a multiple of 8192 */
    const bool misaligned = ((uint32)current % 8192) != 0;
    if (misaligned) {
        console()->kprintf(RED, "ERROR current: %p\n", current);
    }

    process_t* prev = current;
    process_t* next = prev->m_next;

    if (next != prev) {
        /* activate the next process's address space */
        set_cr3(VA2PA(next->m_vmm.get_pg_dir()));

        prev->m_need_resched = 0;
        switch_to(prev, next, prev);
    }
}

这里写图片描述

可以发现,在使用了不同的页表后,子进程打印child后,尝试访问address,发生缺页异常,并halt。

Linux fork有一个很著名的概念叫COW(copy on write),即写时复制。上面babyos2拷贝页表时,只是让子进程的页表项指向与父进程相同的物理页,并没有申请新的物理页做映射。这样当父进程或子进程一方要写这个页面时就会出问题:一个进程改变了这个页的内容,另一个进程看到的内容也会随之改变,这是不合理的。babyos2采用了跟linux类似的处理方法:
1) copy页表时将页面设置为只读
2) 为物理页增加一个引用计数,copy时计数加一
3) 当写时,因为页面是只读的,所以会发生缺页,此时检测,若是因为COW导致的权限问题,分配新页,拷贝内容,并做映射。

/*
 * 2017-11-28
 * guzhoudiaoke@126.com
 */

#ifndef _ATOMIC_H_
#define _ATOMIC_H_

/*
 * Minimal x86 atomic integer.
 *
 * Every read-modify-write helper below uses a LOCK-prefixed instruction
 * and declares the counter as a single read-write ("+m") operand, so the
 * compiler knows the old value is consumed as well as overwritten.
 */
typedef struct atomic_s {
    volatile int counter;
} atomic_t;


/* Static initializer: atomic_t v = ATOMIC_INIT(0); */
#define ATOMIC_INIT(i)  { (i) }
/* Plain (volatile) load of the counter. */
#define atomic_read(v)      ((v)->counter)
/* Plain (volatile) store to the counter. */
#define atomic_set(v,i)     (((v)->counter) = (i))

/* Atomically add i to *v. */
static __inline__ void atomic_add(int i, atomic_t *v)
{
    __asm__ __volatile__(
            "lock; addl %1,%0"
            :"+m" (v->counter)
            :"ir" (i));
}

/* Atomically subtract i from *v. */
static __inline__ void atomic_sub(int i, atomic_t *v)
{
    __asm__ __volatile__(
            "lock; subl %1,%0"
            :"+m" (v->counter)
            :"ir" (i));
}

/* Atomically subtract i from *v; returns 1 if the result is zero, else 0. */
static __inline__ int atomic_sub_and_test(int i, atomic_t *v)
{
    unsigned char c;

    /* sete captures ZF produced by the locked subl */
    __asm__ __volatile__(
            "lock; subl %2,%0; sete %1"
            :"+m" (v->counter), "=qm" (c)
            :"ir" (i) : "memory");
    return c != 0;
}

/* Atomically increment *v by one. */
static __inline__ void atomic_inc(atomic_t *v)
{
    __asm__ __volatile__(
            "lock; incl %0"
            :"+m" (v->counter));
}

/* Atomically decrement *v by one. */
static __inline__ void atomic_dec(atomic_t *v)
{
    __asm__ __volatile__(
            "lock; decl %0"
            :"+m" (v->counter));
}

/* Atomically decrement *v; returns 1 if the result is zero, else 0. */
static __inline__ int atomic_dec_and_test(atomic_t *v)
{
    unsigned char c;

    __asm__ __volatile__(
            "lock; decl %0; sete %1"
            :"+m" (v->counter), "=qm" (c)
            : : "memory");
    return c != 0;
}

/* Atomically increment *v; returns 1 if the result is zero, else 0. */
static __inline__ int atomic_inc_and_test(atomic_t *v)
{
    unsigned char c;

    __asm__ __volatile__(
            "lock; incl %0; sete %1"
            :"+m" (v->counter), "=qm" (c)
            : : "memory");
    return c != 0;
}

#endif

/* Per-page bookkeeping record; mm_t keeps an array of these in m_pages. */
typedef struct page_s {
    atomic_t    ref;    /* reference count of the page */
} page_t;

/*
 * Memory manager: owns the kernel page directory, the free lists of the
 * page allocator (m_free_area) and the per-page reference counts (m_pages).
 */
class mm_t {
public:
    mm_t();
    ~mm_t();

    void init();
    /* early allocator; init_pages() uses it to obtain the m_pages array */
    void* boot_mem_alloc(uint32 size, uint32 page_align);
    pde_t* get_kernel_pg_dir() { return m_kernel_pg_dir; } 
    /* map `size` bytes at virtual address va to physical address pa in pg_dir with permissions perm */
    void map_pages(pde_t *pg_dir, void *va, uint32 pa, uint32 size, uint32 perm);

    /* allocate a block from free list `order` (or split a larger one); NULL when none is available */
    void* alloc_pages(uint32 order);
    /* drop one reference; the block goes back to the free lists only when dec_page_ref() says so */
    void free_pages(void* addr, uint32 order);

    /* reference count helpers for the page containing addr */
    void inc_page_ref(void* addr);
    uint32 dec_page_ref(void* addr);
    uint32 get_page_ref(void* addr);

private:
    void test_page_mapping();
    void init_paging();
    void init_mem_range();
    /* sets up free lists and bitmaps, then calls init_pages() and free_boot_mem() */
    void init_free_area();
    void boot_map_pages(pde_t *pg_dir, void *va, uint32 pa, uint32 size, uint32 perm);

    /* allocator helpers */
    uint32 get_buddy(uint32 addr, uint32 mask);
    int mark_used(uint32 addr, uint32 order);
    void* expand(free_list_t* addr, uint32 low, uint32 high);
    void free_boot_mem();

    /* allocates and zeroes m_pages, then takes one reference on every page in [KERNEL_BASE, m_mem_end) */
    void init_pages();

private:
    pde_t *m_kernel_pg_dir;
    page_t *m_pages;                /* array of per-page reference counts */

    uint8 *m_mem_start;             /* start of the range handed to the page allocator */
    uint8 *m_mem_end;               /* end of that range */

    uint32 m_usable_phy_mem_start;
    uint32 m_usable_phy_mem_end;

    free_area_t m_free_area;        /* per-order free lists, bitmaps and base address */
};

增加atomic处理函数,及一个page_t结构,目前这个结构只有一个引用计数ref, 然后mm类中增加 page_t* m_pages表示所有物理页。

void mm_t::init_pages()
{
    uint32 size = ((uint32)m_mem_end + PAGE_SIZE - 1) / PAGE_SIZE * sizeof(page_t);
    m_pages = (page_t *) boot_mem_alloc(size, 0);
    memset(m_pages, 0, size);

    for (uint32 addr = KERNEL_BASE; addr < (uint32)m_mem_end; addr += PAGE_SIZE) {
        inc_page_ref((void *) addr);
    }
}

/*
 * Set up the page allocator: empty free lists and zeroed bitmaps for
 * orders 0..MAX_ORDER, the aligned base address, the per-page reference
 * counts, and finally release the boot memory into the allocator.
 *
 * The bitmaps are carved out of the front of [m_mem_start, m_mem_end),
 * so m_mem_start moves forward as they are placed.
 */
void mm_t::init_free_area()
{
    uint32 mask = PAGE_MASK;
    uint32 bitmap_size;
    for (int i = 0; i <= MAX_ORDER; i++) {
        /* empty circular list: head points at itself */
        m_free_area.free_list[i].prev = m_free_area.free_list[i].next = &m_free_area.free_list[i];
        /* doubling the mask shifts it left by one bit, so the alignment grows with each order */
        mask += mask;
        /* round the end of the managed range down to that alignment */
        m_mem_end = (uint8 *)(((uint32)m_mem_end) & mask);
        /* number of 2^i-page blocks in the range ... */
        bitmap_size = (uint32 (m_mem_end - m_mem_start)) >> (PAGE_SHIFT + i);
        /* ... one bit each, converted to bytes (rounded up) ... */
        bitmap_size = (bitmap_size + 7) >> 3;
        /* ... and rounded up to a multiple of sizeof(uint32) */
        bitmap_size = (bitmap_size + sizeof(uint32) - 1) & ~(sizeof(uint32)-1);
        /* place the zeroed bitmap at the current start and step past it */
        m_free_area.free_list[i].map = (uint32 *) m_mem_start;
        memset((void *) m_mem_start, 0, bitmap_size);
        m_mem_start += bitmap_size;
    }
    /* first allocatable address: m_mem_start rounded up to the final mask's alignment */
    m_free_area.base = (uint8*)(((uint32)m_mem_start + ~mask) & mask);

    /* reference counts must exist before free_boot_mem() starts releasing pages */
    init_pages();
    free_boot_mem();
}

初始化free area的时候,先给m_pages分配内存,然后将所有物理页的引用计数设为1,后面free_boot_mem的时候,会释放当前未使用的页,导致这些页的引用计数减1.

/*
 * Drop one reference on the block at addr and, when dec_page_ref()
 * reports that it may be released, return the block of the given order
 * to the free lists, merging it with its buddy for as long as possible.
 *
 * addr:  address of the block
 * order: order the block was allocated with
 */
void mm_t::free_pages(void* addr, uint32 order)
{
    // dec the ref count, if it's not 0, don't free the pages
    // (presumably dec_page_ref() returns nonzero only once the count hits 0 -- confirm)
    if (!dec_page_ref(addr)) {
        return;
    }

    uint32 address = (uint32) addr;
    /* bitmap index at this order: page number relative to base, shifted by (1 + order) */
    uint32 index = MAP_NR(address - (uint32)m_free_area.base) >> (1 + order);
    uint32 mask = PAGE_MASK << order;

    address &= mask;
    while (order < MAX_ORDER) {
        /* flip this block's bit; a zero result ends the merging at this order */
        if (!change_bit(index, m_free_area.free_list[order].map)) {
            break;
        }

        /* take the buddy off its free list and continue one order higher */
        uint32 buddy = get_buddy(address, mask);
        remove_head(m_free_area.free_list+order, (free_list_t *)buddy);
        order++;
        index >>= 1;
        mask <<= 1;
        address &= mask;
    }
    /* put the (possibly merged) block on the free list of its final order */
    add_to_head(m_free_area.free_list+order, (free_list_t *) address);
}

/*
 * Cut a block of order `high` down to order `low`.
 *
 * On each step the block is halved: its lower half is put on the free
 * list one order down and marked via mark_used(), and the upper half is
 * kept for further splitting. One reference is taken on the remaining
 * block, which is then returned.
 */
void* mm_t::expand(free_list_t* addr, uint32 low, uint32 high)
{
    uint32 size = PAGE_SIZE << high;

    for (uint32 order = high; order > low; ) {
        --order;
        size >>= 1;

        /* hand the lower half back, continue with the upper half */
        add_to_head(m_free_area.free_list+order, addr);
        mark_used((uint32) addr, order);
        addr = (free_list_t *) ((uint32) addr + size);
    }

    inc_page_ref(addr);
    return addr;
}

/*
 * Allocate a block of the requested order.
 *
 * Free lists are searched from `order` upwards. The first non-empty
 * list supplies a block, which is unlinked, marked used, and trimmed
 * back to `order` by expand().
 *
 * Returns the block, or NULL when every list up to MAX_ORDER is empty.
 */
void* mm_t::alloc_pages(uint32 order)
{
    uint32 cur = order;
    free_list_t* head = m_free_area.free_list + order;

    for (;;) {
        free_list_t* block = head->next;
        if (block != head) {
            /* unlink the first block of this list */
            head->next = block->next;
            block->next->prev = head;

            mark_used((uint32) block, cur);
            return expand(block, order, cur);
        }

        /* this order is empty: try the next larger one */
        ++cur;
        ++head;
        if (cur > MAX_ORDER) {
            break;
        }
    }

    return NULL;
}

这样引用计数就做好了,fork的时候,拷贝页表时,将引用计数加1,并将页面设为只读:

/*
 * Build a private page directory for this vmm from `pg_dir`, sharing the
 * mapped pages with the source.
 *
 * Entries at or above KERNEL_BASE/(4*MB) are copied verbatim. For each
 * present entry below that index a new page table is allocated. Every
 * present PTE of the SOURCE table gets its page reference count
 * incremented and its PTE_W bit cleared, and the modified table is then
 * copied, so both address spaces map the same pages without PTE_W.
 *
 * Returns 0 on success, (uint32) -1 when a page cannot be allocated.
 *
 * NOTE(review): on failure the directory and the tables allocated so far
 * are not released, and reference counts already taken are not dropped.
 * NOTE(review): the source PTEs lose PTE_W here but no TLB flush is
 * visible in this function; if pg_dir is the active directory, stale
 * writable TLB entries could let writes bypass the protection -- confirm
 * that a CR3 reload/invlpg happens before the caller returns to user mode.
 */
uint32 vmm_t::copy_page_table(pde_t* pg_dir)
{
    m_pg_dir = (pde_t *) os()->get_mm()->alloc_pages(0);
    if (m_pg_dir == NULL) {
        return -1;
    }

    memcpy(m_pg_dir, pg_dir, PAGE_SIZE);
    for (uint32 i = 0; i < KERNEL_BASE/(4*MB); i++) {
        pde_t *pde = &pg_dir[i];
        if (!(*pde & PTE_P)) {
            continue;
        }

        pte_t* table = (pte_t *) PA2VA((*pde) & PAGE_MASK);
        pte_t* new_table = (pte_t *) os()->get_mm()->alloc_pages(0);
        if (new_table == NULL) {
            return -1;
        }

        /* inc page ref count and set write protected */
        for (uint32 j = 0; j < NR_PTE_PER_PAGE; j++) {
            if (table[j] & PTE_P) {
                /*
                 * inc_page_ref() is declared as taking void*, so the frame
                 * address is cast explicitly.
                 * NOTE(review): this passes the address stored in the PTE,
                 * while init_pages()/expand() pass kernel virtual addresses --
                 * confirm which one inc_page_ref() expects.
                 */
                os()->get_mm()->inc_page_ref((void *) (table[j] & PAGE_MASK));
                table[j] &= (~PTE_W);
            }
        }

        /* copy table and set to pg_dir */
        memcpy(new_table, table, PAGE_SIZE);
        m_pg_dir[i] = (VA2PA(new_table) | (*pde & (~PAGE_MASK)));
    }

    return 0;
}

init中做如下访问:

/*
 * Test program: mmap and fill a region BEFORE fork(), then have the
 * parent write to it again later, so that the write hits a page that
 * fork() left write-protected.
 *
 * Relies on `buffer`, `buffer2`, `digits`, `before_fork`, `times`,
 * `times2`, `test`, `test2`, `print`, `fork` and `mmap`, which are
 * defined outside this snippet.
 */
int main()
{
    /* read the CS segment register */
    uint32 cs = 0xffffffff;
    __asm__ volatile("movl %%cs, %%eax" : "=a" (cs));

    /* find the end of the text already stored in buffer */
    int i = 0;
    while (buffer[i] != '\0') {
        i++;
    }

    /* convert cs to base 16, least significant digit first */
    char num[9] = {0};
    int j = 0;
    while (cs != 0) {
        num[j++] = digits[cs % 16];
        cs /= 16;
    }

    /* append the digits most significant first, then print */
    while (j) {
        buffer[i++] = num[--j];
    }
    buffer[i++] = '\n';
    print(buffer);

    /* map a region and touch it before forking, so the mapping exists at fork time */
    before_fork = (unsigned *) mmap(0, 1024, 0, 0);
    for (unsigned int i = 0; i < 16; i++) {
        before_fork[i] = i;
    }

    // fork
    int32 ret = fork();
    if (ret == 0) {
        // child
        buffer2[0] = 'I';
        buffer2[1] = 'C';
        buffer2[2] = ',';
        buffer2[3] = '\0';
        while (1) {
            /* busy-wait delay between prints */
            for (int i = 0; i < 100000000; i++) ;
            print(buffer2);

            /* on the 10th print, emit test2 once */
            times2++;
            if (times2 == 10) {
                print(test2);
            }
        }
    }
    else {
        // parent
        buffer[0] = 'I';
        buffer[1] = ',';
        buffer[2] = '\0';

        while (1) {
            /* busy-wait delay between prints */
            for (int i = 0; i < 100000000; i++) ;
            print(buffer);

            /* on the 15th print, write to the pre-fork region again */
            times++;
            if (times == 15) {
                print(test);

                for (unsigned int i = 0; i < 16; i++) {
                    before_fork[i] = i; 
                } 
            }
        }
    }

    /* not reached: both branches loop forever */
    return 0;
}

在fork之前mmap一块内存before_fork,这样fork的时候子进程会拷贝相应的vm_area及页表,但是同时会把这些页设成只读的,父进程再访问这些页时,会发生保护错误

这里写图片描述

处理这种错误,发生权限问题时,尝试分配新页面并建立映射:

/*
 * Handle a protection fault on `addr` (first, test-only version).
 *
 * If the page behind addr has a reference count of at most 1, the fault
 * is reported as an error. Otherwise a fresh page is allocated and
 * mapped at addr with PTE_W | PTE_U. The old page's contents are NOT
 * copied in this version.
 *
 * vma, write: accepted for interface compatibility; unused here.
 *
 * Returns 0 when a new page was mapped, (uint32) -1 otherwise.
 */
uint32 vmm_t::do_protection_fault(vm_area_t* vma, uint32 addr, uint32 write)
{
    console()->kprintf(YELLOW, "handle protection fault, addr: %x\n", addr);

    const uint32 ref = os()->get_mm()->get_page_ref(os()->get_mm()->va_2_pa((void *) addr));

    /* not shared */
    if (ref <= 1) {
        console()->kprintf(RED, "protection fault, ref count: %u!\n", ref);
        return -1;
    }

    /* this page is shared, now only COW can share page */
    void* mem = os()->get_mm()->alloc_pages(0);
    if (mem == NULL) {
        console()->kprintf(RED, "protection fault: out of memory, addr: %x\n", addr);
        return -1;
    }

    os()->get_mm()->map_pages(m_pg_dir, (void*) addr, VA2PA(mem), PAGE_SIZE, PTE_W | PTE_U);
    console()->kprintf(GREEN, "alloc and map pages\n");

    /* the original fell off the end of this non-void function here */
    return 0;
}

/*
 * Page fault handler.
 *
 * Error code bits in frame->err:
 *   bit 0: 0 no page found, 1 protection fault
 *   bit 1: 0 read, 1 write
 *   bit 2: 0 kernel, 1 user
 *
 * The faulting address is read from CR2 and rounded down to its page.
 * If it lies inside a vma, protection faults are passed on to
 * do_protection_fault(), and missing pages are satisfied by allocating
 * a page and mapping it with PTE_W | PTE_U.
 *
 * Returns 0 when the fault was resolved, (uint32) -1 otherwise (address
 * outside every vma, or no memory left).
 */
uint32 vmm_t::do_page_fault(trap_frame_t* frame)
{
    uint32 addr = 0xffffffff;
    __asm__ volatile("movl %%cr2, %%eax" : "=a" (addr));

    addr = (addr & PAGE_MASK);
    vm_area_t* vma = find_vma(addr);

    /* no vma found, or the one found starts above addr */
    if (vma == NULL || vma->m_start > addr) {
        if (frame->err & 0x4) {
            console()->kprintf(RED, "segment fault!\n");
        }
        return -1;
    }

    if (frame->err & 0x1) {
        return do_protection_fault(vma, addr, (uint32) (frame->err & 2));
    }

    console()->kprintf(YELLOW, "handle no page, addr: %x\n", addr);

    /* no page found */
    void* mem = os()->get_mm()->alloc_pages(0);
    if (mem == NULL) {
        /* never map VA2PA(NULL) into the address space */
        console()->kprintf(RED, "handle no page: out of memory, addr: %x\n", addr);
        return -1;
    }

    console()->kprintf(YELLOW, "addr: %x, map page: %x\n", addr, os()->get_mm()->va_2_pa(mem));

    os()->get_mm()->map_pages(m_pg_dir, (void*) addr, VA2PA(mem), PAGE_SIZE, PTE_W | PTE_U);
    console()->kprintf(GREEN, "alloc and map pages\n");

    return 0;
}

这里写图片描述

当然,上面的代码只是一种测试,还需要做的:
1)发生写保护错误时,只有当前vma的flags具有写标记,才做这种映射
2)对于引用计数大于1的页面,要重新分配页面,等于1的只需要修改页表属性
3)复制页内容,真正的copy on write

/*
 * Handle a protection fault on `addr` inside `vma`, implementing the
 * write side of copy-on-write.
 *
 * - A write to a vma without VM_WRITE is an error.
 * - If the page's reference count is exactly 1, only the PTE is made
 *   writable again.
 * - Otherwise a new page is allocated, the old contents are copied into
 *   it, one reference on the old page is dropped, and the new page is
 *   mapped at addr with PTE_W | PTE_U.
 *
 * write: nonzero when the faulting access was a write.
 *
 * Returns 0 when the fault was resolved, (uint32) -1 otherwise.
 */
uint32 vmm_t::do_protection_fault(vm_area_t* vma, uint32 addr, uint32 write)
{
    /* look the reference count up once; nothing below changes it before it is used */
    const uint32 ref = os()->get_mm()->get_page_ref(os()->get_mm()->va_2_pa((void *) addr));

    console()->kprintf(YELLOW, "handle protection fault, addr: %x, ref count: %u\n", 
        addr, ref);

    if (write && !(vma->m_flags & VM_WRITE)) {
        console()->kprintf(RED, "protection fault, ref count: %u!\n", ref);
        return -1;
    }

    /* not shared */
    if (ref == 1) {
        make_pte_write((void *) addr);
        return 0;
    }

    /* this page is shared, now only COW can share page */
    void* mem = os()->get_mm()->alloc_pages(0);
    if (mem == NULL) {
        /* bail out before copy_page()/map_pages() touch a NULL page */
        console()->kprintf(RED, "protection fault: out of memory, addr: %x\n", addr);
        return -1;
    }

    /* copy while addr still maps the old page, then drop our reference on the old page */
    os()->get_mm()->copy_page(mem, (void *) addr);
    os()->get_mm()->free_pages((void *) (PA2VA(os()->get_mm()->va_2_pa((void *) addr))), 0);

    os()->get_mm()->map_pages(m_pg_dir, (void*) addr, VA2PA(mem), PAGE_SIZE, PTE_W | PTE_U);
    console()->kprintf(GREEN, "alloc, copy and map page\n");

    return 0;
}

/*
 * Page fault handler.
 *
 * Error code bits in frame->err:
 *   bit 0: 0 no page found, 1 protection fault
 *   bit 1: 0 read, 1 write
 *   bit 2: 0 kernel, 1 user
 *
 * The faulting address is read from CR2 and rounded down to its page.
 * If it lies inside a vma, protection faults are passed on to
 * do_protection_fault(), and missing pages are satisfied by allocating
 * a page and mapping it with PTE_W | PTE_U.
 *
 * Returns 0 when the fault was resolved, (uint32) -1 otherwise (address
 * outside every vma, or no memory left).
 */
uint32 vmm_t::do_page_fault(trap_frame_t* frame)
{
    uint32 addr = 0xffffffff;
    __asm__ volatile("movl %%cr2, %%eax" : "=a" (addr));

    addr = (addr & PAGE_MASK);
    vm_area_t* vma = find_vma(addr);

    /* not find the vma or out of range */
    if (vma == NULL || vma->m_start > addr) {
        if (frame->err & 0x4) {
            console()->kprintf(RED, "segment fault, addr: %x!\n", addr);
        }
        return -1;
    }

    if (frame->err & 0x1) {
        return do_protection_fault(vma, addr, (uint32) (frame->err & 2));
    }

    console()->kprintf(YELLOW, "handle no page, addr: %x\n", addr);

    /* no page found */
    void* mem = os()->get_mm()->alloc_pages(0);
    if (mem == NULL) {
        /* never map VA2PA(NULL) into the address space */
        console()->kprintf(RED, "handle no page: out of memory, addr: %x\n", addr);
        return -1;
    }

    console()->kprintf(YELLOW, "addr: %x, map page: %x\n", addr, os()->get_mm()->va_2_pa(mem));

    os()->get_mm()->map_pages(m_pg_dir, (void*) addr, VA2PA(mem), PAGE_SIZE, PTE_W | PTE_U);
    console()->kprintf(GREEN, "alloc and map pages\n");

    return 0;
}

/*
 * Set PTE_W on the page table entry that maps `va` in this vmm's page
 * directory.
 *
 * Does nothing when va is at or above KERNEL_BASE, when the page
 * directory entry is not present, or when the page table entry itself
 * is not present.
 */
void vmm_t::make_pte_write(void* va)
{
    if ((uint32) va >= KERNEL_BASE) {
        return;
    }

    pde_t *pde = &m_pg_dir[PD_INDEX(va)];
    if (!(*pde & PTE_P)) {
        return;
    }

    pte_t* table = (pte_t *) PA2VA((*pde) & PAGE_MASK);
    /*
     * Parenthesized on purpose: `!table[i] & PTE_P` parses as
     * `(!table[i]) & PTE_P`, which only detects an all-zero entry and
     * lets a non-present entry with other bits set get PTE_W.
     */
    if (!(table[PT_INDEX(va)] & PTE_P)) {
        return;
    }

    table[PT_INDEX(va)] |= PTE_W;
}

当完成拷贝之后,父子进程同一个地址对应的物理页已完全分离,可以设置不同的值,简单的测试如下:

/*
 * Test program: after fork(), parent and child each write to a region
 * that was mapped and filled before the fork, then print its first 16
 * words so the two views can be compared.
 *
 * Relies on `buffer`, `buffer2`, `digits`, `before_fork`, `times`,
 * `times2`, `test`, `test2`, `print`, `print_int`, `fork` and `mmap`,
 * which are defined outside this snippet.
 */
int main()
{
    /* read the CS segment register */
    uint32 cs = 0xffffffff;
    __asm__ volatile("movl %%cs, %%eax" : "=a" (cs));

    /* find the end of the text already stored in buffer */
    int i = 0;
    while (buffer[i] != '\0') {
        i++;
    }

    /* convert cs to base 16, least significant digit first */
    char num[9] = {0};
    int j = 0;
    while (cs != 0) {
        num[j++] = digits[cs % 16];
        cs /= 16;
    }

    /* append the digits most significant first, then print */
    while (j) {
        buffer[i++] = num[--j];
    }
    buffer[i++] = '\n';
    print(buffer);

    /* map 4096 bytes read/write and fill all 1024 words with their own index */
    before_fork = (unsigned *) mmap(0, 4096, PROT_READ | PROT_WRITE, 0);
    for (unsigned int i = 0; i < 1024; i++) {
        before_fork[i] = i;
    }

    // fork
    int32 ret = fork();
    if (ret == 0) {
        // child
        buffer2[0] = 'I';
        buffer2[1] = 'C';
        buffer2[2] = ',';
        buffer2[3] = '\0';
        while (1) {
            /* busy-wait delay between prints */
            for (int i = 0; i < 100000000; i++) ;
            print(buffer2);

            /* on the 10th print: modify the region and show the child's view */
            times2++;
            if (times2 == 10) {
                print(test2);
                for (unsigned int i = 100; i < 110; i++) {
                    before_fork[i] = i+1;
                }
                /* overwrite the first 16 words with 100*i and print them */
                for (unsigned int i = 0; i < 16; i++) {
                    before_fork[i] = 100*i;
                    print_int(before_fork[i], 16, 0);
                }
            }
        }
    }
    else {
        // parent
        buffer[0] = 'I';
        buffer[1] = ',';
        buffer[2] = '\0';

        while (1) {
            /* busy-wait delay between prints */
            for (int i = 0; i < 100000000; i++) ;
            print(buffer);

            /* on the 15th print (later than the child's 10th): write elsewhere
             * in the region, then print the first 16 words as the parent sees them */
            times++;
            if (times == 15) {
                print(test);
                for (unsigned int i = 200; i < 210; i++) {
                    before_fork[i] = i;
                }
                for (unsigned int i = 0; i < 16; i++) {
                    print_int(before_fork[i], 16, 0);
                }
            }
        }
    }

    /* not reached: both branches loop forever */
    return 0;
}

这里写图片描述

开始时map一块地址并赋值,子进程会修改这些值并打印证明修改成功,一会之后父进程会打印这些值,得到子进程修改前的内容,证明父子进程对应的物理页已完全分开。

至此一个简单的COW就完成了。下面计划做的是:
1)用户栈vma
2)用户栈的扩展
3)exec系统调用加载elf文件
4)exec时释放旧的vma及页面映射
5)brk

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值