Part A: Multiprocessor Support and Cooperative Multitasking
Multiprocessor Support
Each CPU has a local APIC (LAPIC) unit.
The LAPIC units are responsible for delivering interrupts throughout the system. The LAPIC also provides its connected CPU with a unique identifier.
memory-mapped I/O: In MMIO, a portion of physical memory is hardwired to the registers of some I/O devices, so the same load/store instructions typically used to access memory can be used to access device registers.
Exercise1
// Reserve `size` bytes of the MMIO window starting at `base` and map them to
// physical address `pa`. The request is rounded up to whole pages.
size_t rounded_size = ROUNDUP(size, PGSIZE);
// Reject a request whose rounding wrapped around, or that would run past
// MMIOLIM. A reservation that ends exactly at MMIOLIM still fits, and the
// subtraction form cannot overflow the way `base + rounded_size` can.
if (rounded_size < size || rounded_size > MMIOLIM - base) {
	panic("mmio_map_region: requested size overflow MMIOLIM");
}
// PTE_PCD | PTE_PWT: device registers are mapped cache-disabled and
// write-through rather than as ordinary cached memory.
boot_map_region(kern_pgdir, base, rounded_size, pa, PTE_PCD | PTE_PWT | PTE_W);
// Hand back the start of this reservation and advance `base` past it so the
// next call gets the following region (assumes `base` persists across calls
// -- TODO confirm it is declared static in the enclosing function).
uintptr_t curr_base = base;
base += rounded_size;
return (void *)curr_base;
Application Processor Bootstrap
Exercise2
MPENTRY_PADDR是0x7000,高于PGSIZE、小于IOPHYSMEM,正好落在base memory范围内,所以应该在此处增加判断,把这一页标记为已使用,避免它被放进空闲链表:
// Walk the base-memory pages (page 0 is skipped by starting at 1). The page
// that contains MPENTRY_PADDR is handed to MARK_USE; every other page goes to
// MARK_FREE. (MARK_USE / MARK_FREE are project macros -- presumably they keep
// the page off / put the page on the free list; confirm against their
// definitions.)
for (i = 1; i < npages_basemem; i++) {
	if (i != MPENTRY_PADDR / PGSIZE) {
		MARK_FREE(i);
	} else {
		MARK_USE(i);
	}
}
Question1
Per-CPU State and Initialization
Per-CPU kernel stack
Per-CPU TSS and TSS descriptor
Per-CPU current environment pointer
Per-CPU system registers
Exercise3
// Map one kernel stack per CPU. CPU n's stack top sits n strides of
// (KSTKSIZE + KSTKGAP) below KSTACKTOP; only the upper KSTKSIZE bytes of each
// stride are mapped here, so the KSTKGAP bytes beneath every stack are left
// unmapped by this loop.
for (int cpu = 0; cpu < NCPU; cpu++) {
	uint32_t stack_top = KSTACKTOP - cpu * (KSTKSIZE + KSTKGAP);
	uint32_t stack_bottom = stack_top - KSTKSIZE;

	boot_map_region(kern_pgdir, stack_bottom, KSTKSIZE,
			PADDR(percpu_kstacks[cpu]), PTE_W);
}
Exercise4
for (int i = 0; i < NCPU; i++) {
int id = thiscpu->cpu_id;
// Setup a TSS so that we get the right stack
// when we trap to the kernel.
thiscpu->cpu_ts.ts_esp0 = (uint32_t)percpu_kstacks[id] + KSTKSIZE;
thiscpu->cpu_ts.ts_ss0 = GD_KD;
thiscpu->cpu_ts.ts_iomb = sizeof(struct Taskstate);
// Initialize the TSS slot of the gdt.
gdt[(GD_TSS0 >> 3) + id] = SEG16(STS_T32A, (uint32_t) (&thiscpu->cpu_ts),
sizeof(struct Taskstate) - 1, 0);
gdt[(GD_TSS0 >> 3) + id].sd_s = 0;
// Load the TSS selector (like other segment selectors, the
// bottom three bits are special; we leave them 0)
ltr(GD_TSS0);
// Load the IDT
lidt(&idt_pd);
}
Locking
Exercise5
@@ -43,6 +43,7 @@ i386_init(void)
// Acquire the big kernel lock before waking up APs
// Your code here:
+ lock_kernel();
@@ -109,6 +110,8 @@ mp_main(void)
// only one CPU can enter the scheduler at a time!
//
// Your code here:
+ lock_kernel();
+ sched_yield();
@@ -293,6 +298,7 @@ trap(struct Trapframe *tf)
// Acquire the big kernel lock before doing any
// serious kernel work.
// LAB 4: Your code here.
+ lock_kernel();
assert(curenv);
@@ -563,8 +563,10 @@ env_run(struct Env *e)
curenv = e;
e->env_status = ENV_RUNNING;
e->env_runs++;
+ unlock_kernel();
lcr3(PADDR(e->env_pgdir));
env_pop_tf(&e->env_tf);
Question2
举个例子,CPU0在处理异常或中断时,将TrapFrame压入了内核栈,而另外一个CPU1也将TrapFrame压入了同一个内核栈,这种情况下,CPU0退出异常时,弹出的TrapFrame并不是CPU0的,而是CPU1的。大内核锁无法避免这个问题,因为TrapFrame是在进入trap()并调用lock_kernel()之前,就由硬件和trapentry.S压栈的,此时还没有持有锁。
Round-Robin Scheduling
Exercise6
int idle_envid = (idle == NULL) ? -1 : ENVX(idle->env_id);
int i;
for (i = idle_envid + 1; i < NENV; i++) {
if (envs[i].env_status == ENV_RUNNABLE) {
env_run(&envs[i]);
}
}
for (i = 0; i < idle_envid; i++) {;
if (envs[i].env_status == ENV_RUNNABLE) {
env_run(&envs[i]);
}
}
// if still not found, try idle
if(idle != NULL && idle->env_status == ENV_RUNNING) {
env_run(idle);
}
// Dispatch SYS_yield to sys_yield(). The call takes no arguments and the
// system call reports 0 to the caller if control comes back here.
case SYS_yield:
sys_yield();
return 0;
#if defined(TEST)
	// Don't touch -- used by grading script!
	ENV_CREATE(TEST, ENV_TYPE_USER);
#else
	// Touch all you want.
	// ENV_CREATE(user_primes, ENV_TYPE_USER);

	// Three instances of user_yield to exercise the scheduler. These live
	// inside the #else branch on purpose: when TEST is defined, only the
	// environment under test may be created, otherwise the extra
	// environments would run alongside it.
	ENV_CREATE(user_yield, ENV_TYPE_USER);
	ENV_CREATE(user_yield, ENV_TYPE_USER);
	ENV_CREATE(user_yield, ENV_TYPE_USER);
#endif // TEST
Question3
每个env的页目录都是以kern_pgdir为模板建立的【详见env_setup_vm()】,UTOP以上的内核映射在所有env的页目录中完全相同。e作为指针参数,指向的是内核中的envs数组(内核虚拟地址;envs另外以只读方式映射在[UENVS, UENVS+PTSIZE]地址空间供用户态访问),因此在lcr3()切换页目录前后,e在不同的env中都指向同一个地址,可以正常解引用。
Question4
触发sys_yield系统调用时,会将寄存器内容压入内核栈,并且在kern/trap.c:trap()中保存到curenv->env_tf;之后该环境再次被调度时,通过env_pop_tf()从env_tf中恢复寄存器状态。
System Calls for Environment Creation
Exercise7
fork()
的实现在于创建一个环境并且进行环境复制(tf),以至于子进程也像调用了sys_exofork【复制了栈帧位置和程序的eip指令指针等(注意eip指向用户程序中sys_exofork系统调用的下一条指令,相当于子进程被调度后,eax直接读取的是0,不是通过sys_exofork->syscall->trap_dispatch的return返回而赋值给regs->reg_eax的,所以子进程是进行了fake的sys_exofork,只需要简单保存父进程的env_tf即可)】,并且因为设置了eax,其返回0(从而可以区分父子进程)。
注意tf中保存的是用户态栈帧的状态,而不是实时跟随父进程。也就是tf保存的父进程进入kernel之前的regs情况,所以tf->eip不等于现在的eip,tf->eip指向的是用户态调用fork之后的那个指令,而eip随时跟踪程序状态,指向了kernel中的fork函数内的指令。
因此子进程在被调度的时候,直接从用户态继续执行,这时候通过tf恢复用户态,会直接读取tf->eax作为返回值。而父进程的tf->eax在kernel的syscall()返回时被设置为子进程pid,之后被调度则读取tf->eax作为返回值,为child_pid。【因此fork函数就是通过这种方式来实现父子进程的返回值不同的】
// Allocate a new environment whose parent is the current environment.
//
// The child starts out ENV_NOT_RUNNABLE with a copy of the caller's saved
// trap frame, except that its saved eax is forced to 0 so that the system
// call appears to return 0 in the child.
//
// Returns the id of the new environment, or the (non-zero) error code
// produced by env_alloc().
static envid_t
sys_exofork(void)
{
	struct Env *child;
	int err = env_alloc(&child, curenv->env_id);

	if (err != 0)
		return err;

	// Duplicate the parent's register state, then patch the return value.
	child->env_tf = curenv->env_tf;
	child->env_tf.tf_regs.reg_eax = 0;
	child->env_status = ENV_NOT_RUNNABLE;

	return child->env_id;
}
// Set the status of environment `envid` to `status`.
//
// Only ENV_RUNNABLE and ENV_NOT_RUNNABLE are accepted; anything else yields
// -E_INVAL. A failing envid2env() lookup (called with its last argument set
// to 1 -- presumably "check permissions", confirm against envid2env) has its
// error code returned unchanged. Returns 0 on success.
static int
sys_env_set_status(envid_t envid, int status)
{
	struct Env *target;
	int err;

	switch (status) {
	case ENV_RUNNABLE:
	case ENV_NOT_RUNNABLE:
		break;
	default:
		return -E_INVAL;
	}

	err = envid2env(envid, &target, 1);
	if (err != 0)
		return err;

	target->env_status = status;
	return 0;
}
// Allocate a zeroed page of memory and map it at `va` with permission `perm`
// in the address space of environment `envid`.
//
// Returns 0 on success, or:
//   -E_INVAL   if va >= UTOP, va is not page-aligned, perm lacks PTE_U or
//              PTE_P, or perm has bits outside PTE_SYSCALL;
//   the error from envid2env() if the environment lookup fails;
//   -E_NO_MEM  if no page could be allocated or page_insert() failed.
static int
sys_page_alloc(envid_t envid, void *va, int perm)
{
	struct Env *e;
	struct PageInfo *pp;
	int r;

	if ((uint32_t)va >= UTOP || PGOFF(va) != 0) {
		return -E_INVAL;
	}
	if ((perm & (PTE_U | PTE_P)) != (PTE_U | PTE_P)) {
		return -E_INVAL;
	}
	if ((perm & ~(PTE_SYSCALL)) != 0) {
		return -E_INVAL;
	}
	if ((r = envid2env(envid, &e, 1)) != 0) {
		return r;
	}
	// page_alloc() takes allocator flags, not page-table permission bits.
	// Passing `perm` only appeared to work because its low bit happens to
	// be set; request a zero-filled page explicitly so user code never sees
	// stale memory contents.
	if ((pp = page_alloc(ALLOC_ZERO)) == NULL) {
		return -E_NO_MEM;
	}
	if ((r = page_insert(e->env_pgdir, pp, va, perm)) != 0) {
		// The page was never mapped, so give it back before failing.
		page_free(pp);
		return -E_NO_MEM;
	}
	return 0;
}
// Map the page found at `srcva` in srcenvid's address space at `dstva` in
// dstenvid's address space with permission `perm`.
//
// Returns 0 on success, or:
//   -E_INVAL  if either address is >= UTOP or not page-aligned, perm lacks
//             PTE_U or PTE_P, perm has bits outside PTE_SYSCALL, nothing is
//             mapped at srcva, or perm asks for PTE_W while the source
//             mapping is not writable;
//   the error from envid2env() or page_insert() otherwise.
static int
sys_page_map(envid_t srcenvid, void *srcva,
	     envid_t dstenvid, void *dstva, int perm)
{
	const int required = PTE_U | PTE_P;
	struct Env *from, *to;
	struct PageInfo *page;
	pte_t *src_pte;
	int err;

	// Argument validation: both addresses first, then the permission bits.
	if ((uint32_t)srcva >= UTOP || PGOFF(srcva) != 0 ||
	    (uint32_t)dstva >= UTOP || PGOFF(dstva) != 0)
		return -E_INVAL;
	if ((perm & required) != required || (perm & ~(PTE_SYSCALL)) != 0)
		return -E_INVAL;

	// Resolve both environments, source before destination.
	err = envid2env(srcenvid, &from, 1);
	if (err != 0)
		return err;
	err = envid2env(dstenvid, &to, 1);
	if (err != 0)
		return err;

	page = page_lookup(from->env_pgdir, srcva, &src_pte);
	if (page == NULL)
		return -E_INVAL;

	// Never hand out write access to a page the source maps read-only.
	if ((perm & PTE_W) == PTE_W && (*src_pte & PTE_W) == 0)
		return -E_INVAL;

	err = page_insert(to->env_pgdir, page, dstva, perm);
	if (err != 0)
		return err;
	return 0;
}
Part B: Copy-on-Write Fork
User-level page fault handling
Exercise8
// Record `func` as the page fault upcall of environment `envid`.
//
// Returns 0 on success, or the error code from envid2env() if the
// environment lookup fails.
static int
sys_env_set_pgfault_upcall(envid_t envid, void *func)
{
	struct Env *target;
	int err = envid2env(envid, &target, 1);

	if (err != 0)
		return err;

	target->env_pgfault_upcall = func;
	return 0;
}
// Dispatch SYS_env_set_pgfault_upcall: a1 is the environment id, a2 is the
// upcall entry point (passed as a raw pointer value).
case SYS_env_set_pgfault_upcall:
return sys_env_set_pgfault_upcall(a1, (void *)a2);
Exercise9
void
page_fault_handler(struct Trapframe *tf)
{
uint32_t fault_va;
// Read processor's CR2 register to find the faulting address
fault_va = rcr2();
// Handle kernel-mode page faults.
if ((tf->tf_cs & 0x3) == 0) {
panic("page_fault_handler: page fault in kernel mode");
}
// We've already handled kernel-mode exceptions, so if we get here,
// the page fault happened in user mode.
if (curenv->env_pgfault_upcall) {
struct UTrapframe *utf;
// Determine the location
if (tf->tf_esp >= UXSTACKTOP - PGSIZE && tf->tf_esp < UXSTACKTOP) {
*(uint32_t *)(tf->tf_esp - 4) = 0; // push an empty 32-bit word
utf = (struct UTrapframe *)(tf->tf_esp - 4 - sizeof(struct UTrapframe));
} else {
utf = (struct UTrapframe *)(UXSTACKTOP - sizeof(struct UTrapframe));
}
// Check permission
user_mem_assert(curenv, (void *)utf, sizeof(struct UTrapframe), PTE_W | PTE_U);
// Set up the user trap frame
utf->utf_esp = tf->tf_esp;
utf->utf_eflags = tf->tf_eflags;
utf->utf_eip = tf->tf_eip;
utf->utf_regs = tf->tf_regs;
utf->utf_err = tf->tf_err;
utf->utf_fault_va = fault_va;
// Switch the environment
tf->tf_esp = (uint32_t)utf;
tf->tf_eip = (uint32_t)curenv->env_pgfault_upcall;
env_run(curenv);
}
// Destroy the environment that caused the fault.
cprintf("[%08x] user fault va %08x ip %08x\n",
curenv->env_id, fault_va, tf->tf_eip);
print_trapframe(tf);
env_destroy(curenv);
}
Exercise10
// Return path of the user page fault upcall. On entry %esp points at the
// saved trap-time state: fault_va and err (8 bytes), the pushed general
// registers, then eip at 0x28, eflags, and esp at 0x30.
//
// Step 1: push the trap-time eip onto the trap-time stack, so that a final
// `ret` executed on that stack resumes the faulting code. (For a recursive
// fault this is the empty word reserved below the previous frame.)
movl 0x30(%esp), %ecx // ecx = trap-time esp
subl $4, %ecx // make room for one word on the trap-time stack
movl %ecx, 0x30(%esp) // store the adjusted esp back into the saved state
movl 0x28(%esp), %edx // edx = trap-time eip
movl %edx, (%ecx) // place eip at the new top of the trap-time stack
// Step 2: restore the trap-time registers. After you do this, you
// can no longer modify any general-purpose registers.
addl $8, %esp // skip fault_va and tf_err
popal // restore the general-purpose registers
// Step 3: restore eflags from the stack. After you do this, you can
// no longer use arithmetic operations or anything else that
// modifies eflags.
addl $4, %esp // skip eip (already copied to the trap-time stack)
popfl // restore eflags
// Step 4: switch back to the adjusted trap-time stack.
pop %esp
// Step 5: `ret` pops the eip stored in step 1 and re-executes the
// instruction that faulted.
ret
Exercise11
// Install `handler` as the C-level page fault handler.
//
// On the first call (no handler installed yet) this also allocates one
// writable page just below UXSTACKTOP and registers _pgfault_upcall as the
// environment's page fault upcall; either step failing is fatal (panic).
void
set_pgfault_handler(void (*handler)(struct UTrapframe *utf))
{
	if (_pgfault_handler == 0) {
		// First time through!
		void *xstack_page = (void *)(UXSTACKTOP - PGSIZE);
		int err;

		err = sys_page_alloc(thisenv->env_id, xstack_page, PTE_W | PTE_U | PTE_P);
		if (err != 0)
			panic("set_pgfault_handler: %e", err);

		err = sys_env_set_pgfault_upcall(thisenv->env_id, _pgfault_upcall);
		if (err != 0)
			panic("set_pgfault_handler: %e", err);
	}

	// Save handler pointer for assembly to call.
	_pgfault_handler = handler;
}
Implementing Copy-on-Write Fork
Exercise12
// User-level fork.
//
// Installs pgfault() as the page fault handler, creates a child with
// sys_exofork(), passes every present page below USTACKTOP to duppage(),
// gives the child its own page below UXSTACKTOP plus the page fault upcall,
// and finally marks the child runnable.
//
// Returns the child's envid to the parent and 0 to the child. Any failing
// system call is fatal (panic).
envid_t
fork(void)
{
	void _pgfault_upcall();
	envid_t child;
	uint32_t va;
	int err;

	set_pgfault_handler(pgfault);

	child = sys_exofork();
	if (child < 0)
		panic("sys_exofork: %e", child);

	if (child == 0) {
		// We are the child: thisenv still refers to the parent's slot,
		// so point it at our own.
		thisenv = &envs[ENVX(sys_getenvid())];
		return 0;
	}

	// Parent: walk the address space below USTACKTOP one page at a time
	// and duplicate every page whose page-directory entry and page-table
	// entry are both present.
	for (va = 0; va < USTACKTOP; va += PGSIZE) {
		if ((uvpd[PDX(va)] & PTE_P) != PTE_P)
			continue;
		if ((uvpt[PGNUM(va)] & PTE_P) != PTE_P)
			continue;
		duppage(child, PGNUM(va));
	}

	// The page below UXSTACKTOP is not shared: the child gets a fresh one.
	err = sys_page_alloc(child, (void *)(UXSTACKTOP - PGSIZE), PTE_W | PTE_U | PTE_P);
	if (err != 0)
		panic("fork: %e", err);

	err = sys_env_set_pgfault_upcall(child, _pgfault_upcall);
	if (err != 0)
		panic("fork: %e", err);

	// Everything is in place; let the child run.
	err = sys_env_set_status(child, ENV_RUNNABLE);
	if (err != 0)
		panic("fork: %e", err);

	return child;
}
// Map virtual page `pn` (address pn * PGSIZE) of the calling environment into
// environment `envid` at the same address.
//
// A page that is currently writable or already marked PTE_COW is mapped into
// the target with PTE_COW | PTE_U | PTE_P and then remapped the same way in
// the caller (target first, caller second). Any other page is mapped into the
// target with PTE_U | PTE_P only.
//
// Returns 0; a failing sys_page_map() is fatal (panic).
static int
duppage(envid_t envid, unsigned pn)
{
	envid_t self = sys_getenvid();
	void *page_va = (void *)(pn * PGSIZE);
	int needs_cow = (uvpt[pn] & PTE_W) == PTE_W || (uvpt[pn] & PTE_COW) == PTE_COW;
	int perm = PTE_U | PTE_P;
	int err;

	if (needs_cow)
		perm |= PTE_COW;

	err = sys_page_map(self, page_va, envid, page_va, perm);
	if (err != 0)
		panic("duppage: %e", err);

	if (needs_cow) {
		err = sys_page_map(self, page_va, self, page_va, perm);
		if (err != 0)
			panic("duppage: %e", err);
	}

	return 0;
}
static void
pgfault(struct UTrapframe *utf)
{
void *addr = (void *) utf->utf_fault_va;
uint32_t err = utf->utf_err;
pte_t pte = uvpt[PGNUM(addr)];
envid_t envid = sys_getenvid();
int r;
if ((err & FEC_WR) == 0 || (pte & PTE_COW) == 0) {
panic("pgfault: bad faulting access\n");
}
if ((r = sys_page_alloc(envid, PFTEMP, PTE_W | PTE_U | PTE_P)) != 0) {
panic("pgfault: %e", r);
}
memcpy(PFTEMP, ROUNDDOWN(addr, PGSIZE), PGSIZE);
if ((r = sys_page_map(envid, PFTEMP, envid, ROUNDDOWN(addr, PGSIZE), PTE_W | PTE_U | PTE_P)) != 0) {
panic("pgfault: %e", r);
}
if ((r = sys_page_unmap(envid, PFTEMP)) != 0) {
panic("pgfault: %e", r);
}
}
Part C: Preemptive Multitasking and Inter-Process communication (IPC)
Clock Interrupts and Preemption
Exercise13
/*
 * Entry points for the external interrupts (IRQs). Each line declares a
 * handler symbol th_irq_* for vector IRQ_OFFSET + IRQ_*. The NOEC variant of
 * the macro is used for all of them (presumably because the CPU pushes no
 * error code for these vectors -- confirm against the TRAPHANDLER_NOEC
 * definition).
 */
TRAPHANDLER_NOEC(th_irq_timer, IRQ_OFFSET + IRQ_TIMER)
TRAPHANDLER_NOEC(th_irq_kbd, IRQ_OFFSET + IRQ_KBD)
TRAPHANDLER_NOEC(th_irq_serial, IRQ_OFFSET + IRQ_SERIAL)
TRAPHANDLER_NOEC(th_irq_spurious, IRQ_OFFSET + IRQ_SPURIOUS)
TRAPHANDLER_NOEC(th_irq_ide, IRQ_OFFSET + IRQ_IDE)
TRAPHANDLER_NOEC(th_irq_error, IRQ_OFFSET + IRQ_ERROR)
void th_irq_timer();
void th_irq_kbd();
void th_irq_serial();
void th_irq_spurious();
void th_irq_ide();
void th_irq_error();
SETGATE(idt[IRQ_OFFSET + IRQ_TIMER], 0, GD_KT, &th_irq_timer, 0);
SETGATE(idt[IRQ_OFFSET + IRQ_KBD], 0, GD_KT, &th_irq_kbd, 0);
SETGATE(idt[IRQ_OFFSET + IRQ_SERIAL], 0, GD_KT, &th_irq_serial, 0);
SETGATE(idt[IRQ_OFFSET + IRQ_SPURIOUS], 0, GD_KT, &th_irq_spurious, 0);
SETGATE(idt[IRQ_OFFSET + IRQ_IDE], 0, GD_KT, &th_irq_ide, 0);
SETGATE(idt[IRQ_OFFSET + IRQ_ERROR], 0, GD_KT, &th_irq_error, 0);
kern/env.c:env_alloc()
// Set the FL_IF bit in the new environment's saved EFLAGS (presumably so that
// interrupts are enabled whenever this environment runs in user mode).
e->env_tf.tf_eflags |= FL_IF;
取消kern/sched.c:sched_halt()中sti指令的注释(即让sti指令生效)
Exercise14
// Timer interrupt: acknowledge it with lapic_eoi() first, then hand the CPU
// to the scheduler. The trailing return only matters if sched_yield() ever
// comes back.
if (tf->tf_trapno == IRQ_OFFSET + IRQ_TIMER) {
lapic_eoi();
sched_yield();
return;
}
Inter-Process communication (IPC)
Exercise15
static int
sys_ipc_recv(void *dstva)
{
if ((uint32_t)dstva < UTOP && PGOFF(dstva) != 0) {
return -E_INVAL;
}
curenv->env_ipc_recving = 1;
curenv->env_ipc_dstva = dstva;
curenv->env_status = ENV_NOT_RUNNABLE;
return 0;
}
// Try to send `value` (and optionally the page mapped at `srcva`) to the
// environment `envid`.
//
// A page is transferred only when BOTH sides ask for it: the sender passes
// srcva < UTOP and the receiver registered a destination address < UTOP.
// If the sender offers a page but the receiver did not ask for one, the
// value is still delivered, no mapping is created, and env_ipc_perm is 0.
//
// Returns 0 on success, or:
//   the error from envid2env() if the target lookup fails;
//   -E_IPC_NOT_RECV if the target is not currently receiving;
//   -E_INVAL if srcva < UTOP but is not page-aligned, perm is malformed,
//            nothing is mapped at srcva, or perm asks for PTE_W on a
//            read-only source page;
//   the error from page_insert() if the mapping cannot be installed.
static int
sys_ipc_try_send(envid_t envid, uint32_t value, void *srcva, unsigned perm)
{
	struct Env *e;
	struct PageInfo *pp;
	pte_t *pte;
	unsigned delivered_perm = 0;
	int r;

	if ((r = envid2env(envid, &e, 0)) != 0) {
		return r;
	}
	if (e->env_ipc_recving == 0) {
		return -E_IPC_NOT_RECV;
	}
	if ((uint32_t)srcva < UTOP) {
		// The sender's arguments are validated even if the receiver
		// ends up not wanting the page.
		if (PGOFF(srcva) != 0) {
			return -E_INVAL;
		}
		if ((perm & (PTE_U | PTE_P)) != (PTE_U | PTE_P)) {
			return -E_INVAL;
		}
		if ((perm & ~(PTE_SYSCALL)) != 0) {
			return -E_INVAL;
		}
		if ((pp = page_lookup(curenv->env_pgdir, srcva, &pte)) == NULL) {
			return -E_INVAL;
		}
		if ((*pte & PTE_W) == 0 && (perm & PTE_W) == PTE_W) {
			return -E_INVAL;
		}
		// Only map when the receiver supplied a user address. Without
		// this guard a receiver that passed dstva >= UTOP ("no page
		// wanted") would get the sender's page installed above UTOP
		// in its page directory.
		if ((uint32_t)e->env_ipc_dstva < UTOP) {
			if ((r = page_insert(e->env_pgdir, pp, e->env_ipc_dstva, perm)) != 0) {
				return r;
			}
			delivered_perm = perm;
		}
	}

	// Publish the message and wake the receiver.
	e->env_ipc_perm = delivered_perm;
	e->env_ipc_recving = 0;
	e->env_ipc_from = curenv->env_id;
	e->env_ipc_value = value;
	e->env_status = ENV_RUNNABLE;
	return 0;
}
// IPC system calls. SYS_ipc_recv takes the destination address in a1;
// SYS_ipc_try_send takes target envid, value, source address and permission
// bits in a1..a4.
case SYS_ipc_recv:
return sys_ipc_recv((void *)a1);
case SYS_ipc_try_send:
return sys_ipc_try_send(a1, a2, (void *)a3, a4);
// Receive a value via IPC and return it.
//
// If `pg` is NULL, UTOP is passed to sys_ipc_recv() in its place. When
// `from_env_store` / `perm_store` are non-NULL they receive the sender's
// envid and the recorded permission on success, or 0 if the system call
// fails. On failure the (negative) error code is returned instead of a value.
int32_t
ipc_recv(envid_t *from_env_store, void *pg, int *perm_store)
{
	void *dst = (pg != NULL) ? pg : (void *)UTOP;
	int err = sys_ipc_recv(dst);

	if (err < 0) {
		// Failed: report "nobody" and "no permission" to the caller.
		if (from_env_store != NULL)
			*from_env_store = 0;
		if (perm_store != NULL)
			*perm_store = 0;
		return err;
	}

	if (from_env_store != NULL)
		*from_env_store = thisenv->env_ipc_from;
	if (perm_store != NULL)
		*perm_store = thisenv->env_ipc_perm;

	return thisenv->env_ipc_value;
}
// Send `val` (and the page at `pg` with permission `perm`, if pg is non-NULL)
// to environment `to_env`, retrying until the send succeeds.
//
// If `pg` is NULL, UTOP is passed to sys_ipc_try_send() in its place.
// Any negative error other than -E_IPC_NOT_RECV is fatal (panic).
void
ipc_send(envid_t to_env, uint32_t val, void *pg, int perm)
{
	int r;

	if (pg == NULL) {
		pg = (void *)UTOP;
	}

	// Yield only between attempts: once the send has succeeded there is
	// nothing left to wait for, so return immediately instead of giving up
	// the CPU one more time.
	while ((r = sys_ipc_try_send(to_env, val, pg, perm)) != 0) {
		if (r < 0 && r != -E_IPC_NOT_RECV) {
			panic("ipc_send: %e", r);
		}
		sys_yield();
	}
}