前言
本文对进程创建涉及的内核动作加以分析,希望对这方面感兴趣的各位有帮助
正传
在FreeBSD系统里面关于创建进程的系统调用有: (kern_fork.c)
fork, rfork, vfork 对应的内核实现分别是: sys_fork, sys_rfork, sys_vfork, 其实这三个内核实现都是以不同的flag来调用 fork1, 其中与内存有关的较重要的函数之一是 vmspace_fork
vfork 是很特殊的fork: 子进程完全和父进程共享地址空间, 并且父进程要挂起等待子进程完成工作(exec 或退出); 其结构性缺陷是子进程可以随意改动父进程的地址空间, 如果出错可能会搞崩父进程;
sys_fork, sys_vfork, sys_rfork ----------> fork1------>vmspace_fork; do_fork ------>vm_forkproc;
fork1 里面, 如果是vfork调进来的则 RFMEM (share address space)是置上的,所以在 vm_forkproc 里面才会有 如下语句:
if (flags & RFMEM) {
p2->p_vmspace = p1->p_vmspace; -------------------> p2 子进程与p1父进程完全共用1个地址空间(vmspace);
atomic_add_int(&p1->p_vmspace->vm_refcnt, 1); ---------> vmspace 在申请出来时计数是1, 如果有被share的时候 就 +1
}
而常规的fork (sys_fork (也是在kern_fork.c里面)) 就会在fork1 里面有如下语句来创建vmspace:
if ((flags & RFMEM) == 0) {
vm2 = vmspace_fork(p1->p_vmspace, &mem_charged); ---------->以父进程p1的vmspace为蓝本 创建子进程的vmspace; vm_forkproc 主要做两件事1, 把vmspace 挂接到进程上。2,调用cpu_fork;
if (vm2 == NULL) {
error = ENOMEM;
goto fail1;
}
} else { --------------->vfork 和父进程共用地址空间;
vm2 = NULL;
}
所以在创建进程时除了vfork直接沿用父进程外, 大多数流程在创建地址空间时 vmspace_fork 函数是绝对的核心函数:
/*
 * vmspace_fork:
 * Create a new process vmspace structure and vm_map
 * based on those of an existing process.  The new map
 * is based on the old map, according to the inheritance
 * values on the regions in that map.
 *
 * XXX It might be worth coalescing the entries added to the new vmspace.
 *
 * The source map must not be locked.
 */
struct vmspace *
vmspace_fork(struct vmspace *vm1, vm_ooffset_t *fork_charge)
{
struct vmspace *vm2;
vm_map_t new_map, old_map;
vm_map_entry_t new_entry, old_entry;
vm_object_t object;
int locked;

old_map = &vm1->vm_map;
/* Copy immutable fields of vm1 to vm2. */
vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset, NULL);
if (vm2 == NULL)
return (NULL);
vm2->vm_taddr = vm1->vm_taddr;
vm2->vm_daddr = vm1->vm_daddr;
vm2->vm_maxsaddr = vm1->vm_maxsaddr;
vm_map_lock(old_map);
if (old_map->busy)
vm_map_wait_busy(old_map);
new_map = &vm2->vm_map;
locked = vm_map_trylock(new_map); /* trylock to silence WITNESS */
KASSERT(locked, ("vmspace_fork: lock failed"));

old_entry = old_map->header.next;
while (old_entry != &old_map->header) { --------------------> old_xxx 代表父进程, 此循环代表遍历父进程的所有地址空间vm_map {entry1, entry2, entry3, entry4}
if (old_entry->eflags & MAP_ENTRY_IS_SUB_MAP)
panic("vm_map_fork: encountered a submap");
/*
遍历父进程遵循的原则可以从FreeBSD 操作系统设计与实现 第二版的6.6(对应第一版的5.6.2)中找到, 摘抄如下:
Using copy-on-write for fork is done by traversing the list of vm_map_entry structures in the
parent and creating a corresponding entry in the child. Each entry must be analyzed and the
appropriate action taken:
• If the entry maps a shared region, the child can take a reference to it.
• If the entry maps a privately mapped region (such as the data area or stack), the child must
create a copy-on-write mapping of the region. The parent must be converted to a copy-on-write
mapping of the region. If either process later tries to write the region, it will create a shadow
object to hold the modified pages.
With the virtual-memory resources allocated, the system sets up the kernel-and user-mode state
of the new process. It then clears the NEW flag and places the process’s thread on the run queue;
the new process can then begin execution.
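中文翻译如下:
利用写时复制实现 fork 的做法是: 遍历父进程的 vm_map_entry 链表, 并在子进程中创建对应的 entry。每个 entry 都必须加以分析并采取相应的动作:
• 如果该 entry 映射的是共享区域, 子进程只需对它增加一个引用。
• 如果该 entry 映射的是私有区域(例如数据区或栈), 子进程必须为该区域创建写时复制映射, 父进程的映射也必须转换为写时复制映射。之后任一进程试图写该区域时, 都会创建一个 shadow object 来保存被修改的页。
虚拟内存资源分配完毕后, 系统建立新进程的内核态和用户态状态, 然后清除 NEW 标志, 并把该进程的线程放入运行队列, 新进程即可开始执行。
*/
/* vmspace_fork 函数继续 -----------------------------------*/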
switch (old_entry->inheritance) {
case VM_INHERIT_NONE:
break;
case VM_INHERIT_SHARE: ----------------->对应上面文字说明的共享区域(子进程直接引用同一个object);
/*
* Clone the entry, creating the shared object if necessary.
*/
object = old_entry->object.vm_object;
if (object == NULL) {
object = vm_object_allocate(OBJT_DEFAULT,
atop(old_entry->end - old_entry->start));
old_entry->object.vm_object = object;
old_entry->offset = 0;
if (old_entry->cred != NULL) {
object->cred = old_entry->cred;
object->charge = old_entry->end -
old_entry->start;
old_entry->cred = NULL;
}
}
/*
* Add the reference before calling vm_object_shadow
* to insure that a shadow object is created.
*/
vm_object_reference(object);
if (old_entry->eflags & MAP_ENTRY_NEEDS_COPY) {
vm_object_shadow(&old_entry->object.vm_object,
&old_entry->offset,
old_entry->end - old_entry->start);
old_entry->eflags &= ~MAP_ENTRY_NEEDS_COPY;
/* Transfer the second reference too. */
vm_object_reference(
old_entry->object.vm_object);
/*
* As in vm_map_simplify_entry(), the
* vnode lock will not be acquired in
* this call to vm_object_deallocate().
*/
vm_object_deallocate(object);
object = old_entry->object.vm_object;
}
VM_OBJECT_WLOCK(object);
vm_object_clear_flag(object, OBJ_ONEMAPPING);
if (old_entry->cred != NULL) {
KASSERT(object->cred == NULL, ("vmspace_fork both cred"));
object->cred = old_entry->cred;
object->charge = old_entry->end - old_entry->start;
old_entry->cred = NULL;
}
/*
* Assert the correct state of the vnode
* v_writecount while the object is locked, to
* not relock it later for the assertion
* correctness.
*/
if (old_entry->eflags & MAP_ENTRY_VN_WRITECNT &&
object->type == OBJT_VNODE) {
KASSERT(((struct vnode *)object->handle)->
v_writecount > 0,
("vmspace_fork: v_writecount %p", object));
KASSERT(object->un_pager.vnp.writemappings > 0,
("vmspace_fork: vnp.writecount %p",
object));
}
VM_OBJECT_WUNLOCK(object);
/*
* Clone the entry, referencing the shared object.
*/
new_entry = vm_map_entry_create(new_map);
*new_entry = *old_entry; ------------------>虚拟空间复制
new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
MAP_ENTRY_IN_TRANSITION);
new_entry->wiring_thread = NULL;
new_entry->wired_count = 0;
if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
vnode_pager_update_writecount(object,
new_entry->start, new_entry->end);
}
/*
* Insert the entry into the new map -- we know we're
* inserting at the end of the new map.
*/
vm_map_entry_link(new_map, new_map->header.prev,
new_entry);
vmspace_map_entry_forked(vm1, vm2, new_entry);
/*
* Update the physical map
*/
pmap_copy(new_map->pmap, old_map->pmap, ------------------>页表复制
new_entry->start,
(old_entry->end - old_entry->start),
old_entry->start);
break;
case VM_INHERIT_COPY: ----------------->对应上面文字说明的是私有映射区域(如数据段和堆栈);
/*
* Clone the entry and link into the map.
*/
new_entry = vm_map_entry_create(new_map);
*new_entry = *old_entry; ------------------>虚拟空间复制
/*
* Copied entry is COW over the old object.
*/
new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
new_entry->wiring_thread = NULL;
new_entry->wired_count = 0;
new_entry->object.vm_object = NULL;
new_entry->cred = NULL;
vm_map_entry_link(new_map, new_map->header.prev,
new_entry);
vmspace_map_entry_forked(vm1, vm2, new_entry);
vm_map_copy_entry(old_map, new_map, old_entry, ----------------------------->主要做三件事情: 1 把相应的物理页置成不可以写(即写保护);2 把父子进程相应的vm_map_entry 设置标记(MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY) 3 复制页表;
new_entry, fork_charge);
break;
}
old_entry = old_entry->next;
}
/*
* Use inlined vm_map_unlock() to postpone handling the deferred
* map entries, which cannot be done until both old_map and
* new_map locks are released.
*/
sx_xunlock(&old_map->lock);
sx_xunlock(&new_map->lock);
vm_map_process_deferred();
return (vm2);
}
上图中的程序头结构描述的是段(segment): 每个段由一组保护属性相同的section组成。如文件头里面有11个section, 但是链接之后把所有保护属性相同的section合并到一个段里面, 所以 __elfN(imgact) 代码中是以程序头为单位(共 e_phnum 个)逐个装载各个段的;
上图中红圈对应的各个列名就是上上个图的程序头表, 而且从图中可以看到虚拟地址都是已经确定好的了, 所以在ELF可执行文件被exec 装载时其各个段的虚拟基地址都是根据程序头表里面的值固定的;
exec的实现:(kern_exec.c):
int
sys_execve(struct thread *td, struct execve_args *uap)
{
struct image_args args;
struct vmspace *oldvmspace;
int error;
error = pre_execve(td, &oldvmspace); ----------------------------> 拿到原来的vmspace
if (error != 0)
return (error);
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, NULL);
post_execve(td, error, oldvmspace); -----该函数内部调用的和内存相关的是----->vmspace_free(oldvmspace); ------------->从这里可以看出pre_execve 先取出老的vmspace, 然后在kern_execve里面应该是重新申请了新的vmspace, 所以才在post_execve里面释放老的vmspace;
return (error);
}
static int
do_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
。。。
error = exec_map_first_page(imgp); ------>申请一个物理页, 然后调用 pager 拿到文件第一页的内容, 主要目的是拿到ELF header(header 里面记录了程序头表、节头表的位置和个数); imgp->image_header = (char *)sf_buf_kva(imgp->firstpage); 即 image_header 是该页直接映射到的内核虚拟地址
。。。
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (execsw[i]->ex_imgact)(imgp); -------------------->调用imgact_elf.c 中定义的—>_CONCAT(exec, __elfN(imgact))(struct image_params *imgp) <========> F1:
}
。。。
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
。。。
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If custom stack fixup routine present for this process
* let it do the stack setup.
* Else stuff argument count as first item on stack
*/
if (p->p_sysent->sv_fixup != NULL)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->args->argc);
。。。
后面还有很多步骤,但是跟内存相关的在后面貌似没了
。。。
。。。
。。。
/* Set values passed into the program in registers. */ ----------->把参数传给新程序
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp,
(u_long)(uintptr_t)stack_base);
else
exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
。。。
。。。
}
从上面可以看到执行过kern_execve后会释放原来的vmspace, 原因在下面这个函数里:
int
exec_new_vmspace(imgp, sv)
struct image_params *imgp;
struct sysentvec *sv;
{
int error;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
vm_object_t obj;
struct rlimit rlim_stack;
vm_offset_t sv_minuser, stack_addr;
vm_map_t map;
u_long ssiz;
imgp->vmspace_destroyed = 1;
imgp->sysent = sv;
/* May be called with Giant held */
EVENTHANDLER_INVOKE(process_exec, p, imgp);
/*
* Blow away entire process VM, if address space not shared,
* otherwise, create a new VM space so that other threads are
* not disrupted
*/
map = &vmspace->vm_map;
if (map_at_zero)
sv_minuser = sv->sv_minuser;
else
sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
vm_map_max(map) == sv->sv_maxuser) { ----------------------> address space not shared --------------->这里因为不是shared,所以这个空间是在fork1里面专门调用了vmspace_fork 来复制父进程的地址空间(或者是共享的,或者是写时复制的);
shmexit(vmspace);
pmap_remove_pages(vmspace_pmap(vmspace)); ------------------------>这里清页表同时释放物理页, 然后pmap->pm_stats.resident_count = 0
vm_map_remove(map, vm_map_min(map), vm_map_max(map)); ---------> 这里把所有的虚拟地址和物理地址(resident_count == 0 所以提前退出清物理页的流程)都干掉了, why ? 因为马上要执行新的程序了, 从父进程复制来的蓝本空间没有用了, 所以全部释放掉
} else { =====================>共享进程空间的case
error = vmspace_exec(p, sv_minuser, sv->sv_maxuser); ---------------> 这里面会调用vmspace_alloc申请新的vmspace, 并且在curthread->td_pflags |= TDP_EXECVMSPC,提示post_execve 干掉原来老的vmspace;
if (error)
return (error);
vmspace = p->p_vmspace;
map = &vmspace->vm_map;
}
/* Map a shared page */ ------------------------->从这里以后使用的map/pmap, 或是新申请的(上面的else分支), 或是已经把页表清除了的(上面的if分支)
obj = sv->sv_shared_page_obj;
if (obj != NULL) {
vm_object_reference(obj);
error = vm_map_fixed(map, obj, 0,
sv->sv_shared_page_base, sv->sv_shared_page_len,
VM_PROT_READ | VM_PROT_EXECUTE,
VM_PROT_READ | VM_PROT_EXECUTE,
MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
if (error) {
vm_object_deallocate(obj);
return (error);
}
}
/* Allocate a new stack */
if (imgp->stack_sz != 0) {
ssiz = trunc_page(imgp->stack_sz);
PROC_LOCK(p);
lim_rlimit(p, RLIMIT_STACK, &rlim_stack);
PROC_UNLOCK(p);
if (ssiz > rlim_stack.rlim_max)
ssiz = rlim_stack.rlim_max;
if (ssiz > rlim_stack.rlim_cur) {
rlim_stack.rlim_cur = ssiz;
kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
}
} else if (sv->sv_maxssiz != NULL) {
ssiz = *sv->sv_maxssiz;
} else {
ssiz = maxssiz; ----------------->栈最大空间512 * 1024 * 1024;
}
stack_addr = sv->sv_usrstack - ssiz;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
sv->sv_stackprot,
VM_PROT_ALL, MAP_STACK_GROWS_DOWN); ===============> MAP_STACK_GROWS_DOWN: 栈从高地址向低地址 grow;
if (error)
return (error);
#ifdef ia64
/* Allocate a new register stack */
error = vm_map_stack(map, IA64_BACKINGSTORE, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
#endif
/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT; /*sgrowsiz 初始化为128 * 1024 , 所以vm_ssize 就是 32个pages*/
vmspace->vm_maxsaddr = (char *)stack_addr;
return (0);
}
imgact_elf.c :
F1:
static int
_CONCAT(exec, __elfN(imgact))(struct image_params *imgp)
{
const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
error = exec_new_vmspace(imgp, sv);
for (i = 0; i < hdr->e_phnum; i++) { --------------->
switch (phdr[i].p_type) {
case PT_LOAD: /* Loadable segment */ ---------------> 可执行程序的 需要加载的部分即 数据段 和 代码段;
if (phdr[i].p_memsz == 0)
break;
prot = __elfN(trans_prot)(phdr[i].p_flags);
error = __elfN(load_section)(imgp, phdr[i].p_offset, ------------------>重点
(caddr_t)(uintptr_t)phdr[i].p_vaddr + et_dyn_addr,
phdr[i].p_memsz, phdr[i].p_filesz, prot,
sv->sv_pagesize);
if (error != 0)
return (error);
/*
* If this segment contains the program headers,
* remember their virtual address for the AT_PHDR
* aux entry. Static binaries don't usually include
* a PT_PHDR entry.
*/
if (phdr[i].p_offset == 0 &&
hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
<= phdr[i].p_filesz)
proghdr = phdr[i].p_vaddr + hdr->e_phoff +
et_dyn_addr;
seg_addr = trunc_page(phdr[i].p_vaddr + et_dyn_addr);
seg_size = round_page(phdr[i].p_memsz +
phdr[i].p_vaddr + et_dyn_addr - seg_addr);
/*
* Make the largest executable segment the official
* text segment and all others data.
*
* Note that obreak() assumes that data_addr +
* data_size == end of data load area, and the ELF
* file format expects segments to be sorted by
* address. If multiple data segments exist, the
* last one will be used.
*/
if (phdr[i].p_flags & PF_X && text_size < seg_size) { ----------》代码段(带可执行属性 PF_X, 且是目前最大的可执行段)
text_size = seg_size;
text_addr = seg_addr;
} else { ----------》数据段
data_size = seg_size;
data_addr = seg_addr;
}
total_size += seg_size;
break;
case PT_PHDR: /* Program header table info */
proghdr = phdr[i].p_vaddr + et_dyn_addr;
break;
default:
break;
}
}
static int
__elfN(load_section)(struct image_params *imgp, vm_offset_t offset,
caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
size_t pagesize)
{
。。。
object = imgp->object;
map = &imgp->proc->p_vmspace->vm_map;
map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
file_addr = trunc_page_ps(offset, pagesize);
/*
* We have two choices. We can either clear the data in the last page
* of an oversized mapping, or we can start the anon mapping a page
* early and copy the initialized data into that first page. We
* choose the second..
*/
if (memsz > filsz)
map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
else
map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
if (map_len != 0) {
/* cow flags: don't dump readonly sections in core */
cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
(prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
rv = __elfN(map_insert)(map, -------------------> 应该是清空了的map,且此时没有任何的物理页关联的vm_map
object, --------------------> object
file_addr, /* file offset */ -------------------> offset to file object
map_addr, /* virtual start */ -----------------> 可执行程序在编译过后由编译器指定 数据段,代码段的 虚拟地址
map_addr + map_len,/* virtual end */ -----------------> 映射的虚拟结束地址, 即起始地址 + 本次映射的长度 map_len;
prot, =====================>段的保护属性(读/写/执行), 由程序头里的 p_flags 经 __elfN(trans_prot) 转换而来
cow); =========================> COW 属性
if (rv != KERN_SUCCESS)
return (EINVAL);
/* we can stop now if we've covered it all */
if (memsz == filsz) {
return (0);
}
}
。。。
/*
* set it to the specified protection.
* XXX had better undo the damage from pasting over the cracks here!
*/
vm_map_protect(map, trunc_page(map_addr), round_page(map_addr + ---------------------> 把这段虚拟地址范围的保护属性设置成 prot (即该段的读/写/执行权限);
map_len), prot, FALSE);
}
static int
__elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
{
struct sf_buf *sf;
vm_offset_t off;
vm_size_t sz;
int error, rv;
if (start != trunc_page(start)) {
rv = __elfN(map_partial)(map, object, offset, start,
round_page(start), prot);
if (rv)
return (rv);
offset += round_page(start) - start;
start = round_page(start);
}
if (end != round_page(end)) {
rv = __elfN(map_partial)(map, object, offset +
trunc_page(end) - start, trunc_page(end), end, prot);
if (rv)
return (rv);
end = trunc_page(end);
}
if (end > start) {
if (offset & PAGE_MASK) {
/*
* The mapping is not page aligned. This means we have
* to copy the data. Sigh.
*/
rv = vm_map_find(map, NULL, 0, &start, end - start, 0, ------------------------->先找空间
VMFS_NO_SPACE, prot | VM_PROT_WRITE, VM_PROT_ALL,
0);
if (rv)
return (rv);
if (object == NULL)
return (KERN_SUCCESS);
for (; start < end; start += sz) {
sf = vm_imgact_map_page(object, offset);
if (sf == NULL)
return (KERN_FAILURE);
off = offset - trunc_page(offset);
sz = end - start;
if (sz > PAGE_SIZE - off)
sz = PAGE_SIZE - off;
error = copyout((caddr_t)sf_buf_kva(sf) + off,
(caddr_t)start, sz);
vm_imgact_unmap_page(sf);
if (error) {
return (KERN_FAILURE);
}
offset += sz;
}
rv = KERN_SUCCESS;
} else {
vm_object_reference(object);
vm_map_lock(map);
rv = vm_map_insert(map, object, offset, start, end,
prot, VM_PROT_ALL, cow);
vm_map_unlock(map);
if (rv != KERN_SUCCESS)
vm_object_deallocate(object);
}
return (rv);
} else {
return (KERN_SUCCESS);
}
}
static int
__elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
vm_offset_t start, vm_offset_t end, vm_prot_t prot)
{
struct sf_buf *sf;
int error;
vm_offset_t off;
/*
* Create the page if it doesn't exist yet. Ignore errors.
*/
vm_map_lock(map);
vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end), ------------------>虚拟地址插入到进程的虚拟地址空间;
VM_PROT_ALL, VM_PROT_ALL, 0);
vm_map_unlock(map);
/*
* Find the page from the underlying object.
*/
if (object) {
sf = vm_imgact_map_page(object, offset); ---------------->调用 vm_imgact_hold_page 见下面, 主要是申请物理页,然后调用调页器读取内容到该物理页;这时候虚拟地址空间和物理地址没有映射;
if (sf == NULL)
return (KERN_FAILURE);
off = offset - trunc_page(offset);
error = copyout((caddr_t)sf_buf_kva(sf) + off, (caddr_t)start, ----------->这里写用户虚拟地址会触发page fault; 由于上面 vm_map_insert 插入的是匿名映射(object 为 NULL), 缺页处理应该是分配一个新的页并建立虚-物地址映射, 然后 copyout 把 vm_imgact_map_page 读进来的文件内容拷贝到这个页里; 段首尾不按页对齐的那部分内容就是通过这种方式读入到内存里面的;
end - start);
vm_imgact_unmap_page(sf);
if (error) {
return (KERN_FAILURE);
}
}
return (KERN_SUCCESS);
}
/*
 * Return the page of `object` that backs byte `offset`, with a hold taken
 * on it, or NULL on failure.
 *
 * object: VM object to fetch the page from; it is write-locked for the
 *         duration of the call and unlocked again before returning.
 * offset: byte offset into the object; converted to a page index with
 *         OFF_TO_IDX().
 *
 * Returns NULL when the page can no longer be found after the pager call,
 * or when vm_pager_get_pages() does not report VM_PAGER_OK (in which case
 * the page is freed here).
 *
 * NOTE(review): the hold is presumably dropped later by the caller's
 * unmap path (vm_imgact_unmap_page()) -- confirm.
 */
static vm_page_t
vm_imgact_hold_page(vm_object_t object, vm_ooffset_t offset)
{
vm_page_t m, ma[1];
vm_pindex_t pindex;
int rv;
VM_OBJECT_WLOCK(object);
pindex = OFF_TO_IDX(offset);
/* Look up (or allocate) the page at this index in the object. */
m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL);
/* Only go to the pager when the page is not already fully valid. */
if (m->valid != VM_PAGE_BITS_ALL) {
ma[0] = m;
rv = vm_pager_get_pages(object, ma, 1, 0);
/*
 * Look the page up again rather than trusting `m`.
 * NOTE(review): presumably the pager may have freed or replaced the
 * page -- confirm.
 */
m = vm_page_lookup(object, pindex);
if (m == NULL)
goto out;
if (rv != VM_PAGER_OK) {
/* Pager failed: free the page and report failure. */
vm_page_lock(m);
vm_page_free(m);
vm_page_unlock(m);
m = NULL;
goto out;
}
}
/* Success path: drop the busy state, then hold and activate the page. */
vm_page_xunbusy(m);
vm_page_lock(m);
vm_page_hold(m);
vm_page_activate(m);
vm_page_unlock(m);
out:
/* Common exit: the object lock is released on every path. */
VM_OBJECT_WUNLOCK(object);
return (m);
}
ELF 文件的数据结构和介绍:
http://blog.csdn.net/b02042236/article/details/6064106
Linux 下加载ELF的流程:
http://blog.csdn.net/fivedoumi/article/details/53262160
static int load_elf_binary(struct linux_binprm *bprm){
// …
struct pt_regs *regs = current_pt_regs(); // 获取当前进程的寄存器存储位置
// 取出已读入 bprm->buf 的文件开头部分(最多128字节, 即 BINPRM_BUF_SIZE), 按ELF头解析, 下面再检查其中的魔数
loc->elf_ex = *((struct elfhdr *)bprm->buf);
// 检查魔数是否匹配
if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
goto out;
// 如果既不是可执行文件也不是动态链接程序,就错误退出
if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
//
// 读取所有的头部信息
// 读入程序的头部分
retval = kernel_read(bprm->file, loc->elf_ex.e_phoff, --------------------> 读取头
(char *)elf_phdata, size);
// 遍历elf的程序头
for (i = 0; i < loc->elf_ex.e_phnum; i++) {
// 如果存在解释器头部
if (elf_ppnt->p_type == PT_INTERP) {
//
// 读入解释器名
retval = kernel_read(bprm->file, elf_ppnt->p_offset,
elf_interpreter,
elf_ppnt->p_filesz);
// 打开解释器文件
interpreter = open_exec(elf_interpreter);
// 读入解释器文件的头部
retval = kernel_read(interpreter, 0, bprm->buf,
BINPRM_BUF_SIZE);
// 获取解释器的头部
loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
elf_ppnt++;
}
// 释放空间、删除信号、关闭带有CLOSE_ON_EXEC标志的文件
retval = flush_old_exec(bprm);
setup_new_exec(bprm); --------------->新的地址空间(mm)应该是在上面的 flush_old_exec 里(通过 exec_mmap)切换过去的; setup_new_exec 主要负责设置新地址空间的布局、进程名等
// 为进程分配用户态堆栈,并塞入参数和环境变量
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
current->mm->start_stack = bprm->p;
// 将elf文件映射进内存
for(i = 0, elf_ppnt = elf_phdata;
i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
if (unlikely (elf_brk > elf_bss)) {
unsigned long nbyte;
// 生成BSS
retval = set_brk(elf_bss + load_bias,
elf_brk + load_bias);
// ...
}
// 可执行程序
if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) {
elf_flags |= MAP_FIXED;
} else if (loc->elf_ex.e_type == ET_DYN) { // 动态链接库
// ...
}
// 创建一个新线性区对可执行文件的数据段进行映射
error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, 0);
}
}
// 加上偏移量
loc->elf_ex.e_entry += load_bias;
// ....
// 创建一个新的匿名线性区,来映射程序的bss段
retval = set_brk(elf_bss, elf_brk);
// 如果是动态链接
if (elf_interpreter) {
unsigned long interp_map_addr = 0;
// 调用一个装入动态链接程序的函数 此时elf_entry指向一个动态链接程序的入口
elf_entry = load_elf_interp(&loc->interp_elf_ex,
interpreter,
&interp_map_addr,
load_bias);
// ...
} else {
// elf_entry是可执行程序的入口
elf_entry = loc->elf_ex.e_entry;
// ....
}
// 修改保存在内核堆栈,但属于用户态的eip和esp
start_thread(regs, elf_entry, bprm->p);
retval = 0;
// }
汇编语言代码完成它的工作之后, 调用的第一个用C语言写的内核流程就是:
/*
 * System startup; initialize the world, create process 0, mount root
 * filesystem, and fork to create init and pagedaemon.  Most of the
 * hard work is done in the lower-level initialization routines including
 * startup(), which does memory initialization and autoconfiguration.
 * This allows simple addition of new kernel subsystems that require
 * boot time initialization.  It also allows substitution of subsystem
 * (for instance, a scheduler, kernel profiler, or VM system) by object
 * module.  Finally, it allows for optional "kernel threads".
 */
void
mi_startup(void)
{
register struct sysinit **sipp; /* system initialization*/
register struct sysinit **xipp; /* interior loop of sort*/
register struct sysinit *save; /* bubble*/
#if defined(VERBOSE_SYSINIT)
int last;
int verbose;
#endif
if (boothowto & RB_VERBOSE)
bootverbose++;
if (sysinit == NULL) {
sysinit = SET_BEGIN(sysinit_set);
sysinit_end = SET_LIMIT(sysinit_set);
}
restart:
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) { ------------------>按 subsystem 和 order 排好所有初始化项的调用顺序
for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip*/
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
#if defined(VERBOSE_SYSINIT)
last = SI_SUB_COPYRIGHT;
verbose = 0;
#if !defined(DDB)
printf(“VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n”);
#endif
#endif
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) { ------>调用所有子系统
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s)*/
if ((*sipp)->subsystem == SI_SUB_DONE)
continue;
#if defined(VERBOSE_SYSINIT)
if ((*sipp)->subsystem > last) {
verbose = 1;
last = (*sipp)->subsystem;
printf(“subsystem %x\n”, last);
}
if (verbose) {
#if defined(DDB)
const char *func, *data;
func = symbol_name((vm_offset_t)(*sipp)->func,
DB_STGY_PROC);
data = symbol_name((vm_offset_t)(*sipp)->udata,
DB_STGY_ANY);
if (func != NULL && data != NULL)
printf(" %s(&%s)... ", func, data);
else if (func != NULL)
printf(" %s(%p)... ", func, (*sipp)->udata);
else
#endif
printf(" %p(%p)… ", (*sipp)->func,
(*sipp)->udata);
}
#endif
/* Call function */
(*((*sipp)->func))((*sipp)->udata); ----------------->执行具体的函数;
#if defined(VERBOSE_SYSINIT)
if (verbose)
printf(“done.\n”);
#endif
/* Check off the one we're just done */
(*sipp)->subsystem = SI_SUB_DONE;
/* Check if we've installed more sysinit items via KLD */
if (newsysinit != NULL) {
if (sysinit != SET_BEGIN(sysinit_set))
free(sysinit, M_TEMP);
sysinit = newsysinit;
sysinit_end = newsysinit_end;
newsysinit = NULL;
newsysinit_end = NULL;
goto restart;
}
}
mtx_assert(&Giant, MA_OWNED | MA_NOTRECURSED);
mtx_unlock(&Giant);
/*
* Now hand over this thread to swapper. ----------------> 把这个线程交给swapper
*/
swapper(); ----------------------------------->见 下面截图的14.4.2 文字描述, 在汇编代码里面已经开始创建了0号进程了,而mi_startup是从汇编代码直接调用的,所以在执行mi_startup的进程上下文其实已经是0号进程的上下文了,所以这里直接转调swapper()就意味着0号进程开始执行了
当在swapper里面没有任务的时候tsleep(&proc0, PVM, "swapin", MAXSLP * hz / 2); 睡在proc0上,当内核流程里面需要换入进程的时候就会调用kick_proc0 其实就是调用wakeup(&proc0); 而swapper()这个函数是永远不会退出的, 也就是说swapper()这个函数就是跑在了0号进程内;
/* NOTREACHED*/
}
mi_startup 启动的内核服务按先后启动的类型如下:
插播一个FreeBSD 设计与实现第二版关于内核线程的初始化:
进程0:
static void
proc0_init(void *dummy __unused)
{
struct proc *p;
struct thread *td;
vm_paddr_t pageablemem;
int i;
GIANT_REQUIRED;
p = &proc0; 《---------
td = &thread0; 《----------
/*
* Initialize thread and process structures.
*/
procinit(); /* set up proc zone */
threadinit(); /* set up UMA zones */
/*
* Initialise scheduler resources.
* Add scheduler specific parts to proc, thread as needed.
*/
schedinit(); /* scheduler gets its house in order */
。。
p->p_sysent = &null_sysvec;
p->p_flag = P_SYSTEM | P_INMEM;
。。
p->p_nice = NZERO;
/* pid_max cannot be greater than PID_MAX */
td->td_tid = PID_MAX + 1;
LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
td->td_state = TDS_RUNNING;
td->td_pri_class = PRI_TIMESHARE; -------------> why 是timeshare的 ?
td->td_user_pri = PUSER;
td->td_base_user_pri = PUSER;
td->td_lend_user_pri = PRI_MAX;
td->td_priority = PVM;
td->td_base_pri = PVM;
td->td_oncpu = 0;
td->td_flags = TDF_INMEM;
td->td_pflags = TDP_KTHREAD;
td->td_cpuset = cpuset_thread0();
。。
/* Allocate a prototype map so we have something to fork. */
pmap_pinit0(vmspace_pmap(&vmspace0)); ----------->全局的变量vmspace0;
p->p_vmspace = &vmspace0;
vmspace0.vm_refcnt = 1;
/*
* proc0 is not expected to enter usermode, so there is no special
* handling for sv_minuser here, like is done for exec_new_vmspace().
*/
vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0), ------------->初始化虚拟空间的大小;
p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
。。
}
进程1和进程0的关系体现:
static void
create_init(const void *udata __unused)
{
struct ucred *newcred, *oldcred;
int error;
error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc, ------------------>创建一个进程1,并有自己的内存空间; 根据 进程0的内存空间 和 thread0的调度参数来创建进程1;
NULL, 0);
if (error)
panic("cannot fork init: %d\n", error);
KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
/* divorce init's credentials from the kernel's */
newcred = crget();
sx_xlock(&proctree_lock);
PROC_LOCK(initproc);
initproc->p_flag |= P_SYSTEM | P_INMEM;
initproc->p_treeflag |= P_TREE_REAPER;
LIST_INSERT_HEAD(&initproc->p_reaplist, &proc0, p_reapsibling); ---------->把proc0挂到initproc的p_reaplist链表上(上一行已把initproc标记为P_TREE_REAPER, 即reaper), 这里体现了进程0和进程1之间的关联;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
audit_cred_proc1(newcred);
#endif
initproc->p_ucred = newcred;
PROC_UNLOCK(initproc);
sx_xunlock(&proctree_lock);
crfree(oldcred);
cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL); ------------->此时线程没有被调度执行, 稍后kick_init 会把线程放到可执行队列,当执行的时候就执行start_init;
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
/*
 * Make the init process runnable now.
 *
 * Takes the first thread of initproc, marks it as able to run and hands it
 * to the scheduler via sched_add(); both steps happen with the thread lock
 * held.  The udata argument is unused.
 */
static void
kick_init(const void *udata __unused)
{
struct thread *td;td = FIRST_THREAD_IN_PROC(initproc);
/* The thread lock covers both the state change and the sched_add() call. */
thread_lock(td);
TD_SET_CAN_RUN(td);
/* NOTE(review): SRQ_BORING presumably means "no special queueing hints" -- confirm. */
sched_add(td, SRQ_BORING);
thread_unlock(td);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL); ----------->通过这种方式让start_init开始执行;
/*
 * Start the initial user process; try exec'ing each pathname in init_path.
 * The program is invoked with one argument containing the boot flags.
 */
static void
start_init(void *dummy) ----------------> 执行 /sbin/init
{
vm_offset_t addr;
struct execve_args args;
int options, error;
char *var, *path, *next, *s;
char *ucp, **uap, *arg0, *arg1;
struct thread *td;
struct proc *p;

mtx_lock(&Giant);
GIANT_REQUIRED;
td = curthread;
p = td->td_proc;

vfs_mountroot();
/* Wipe GELI passphrase from the environment. */
unsetenv("kern.geom.eli.passphrase");

/*
 * Need just enough stack to hold the faked-up "execve()" arguments.
 */
addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, 0,
VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
panic("init: couldn't allocate argument space");
p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
p->p_vmspace->vm_ssize = 1;
if ((var = getenv("init_path")) != NULL) {
strlcpy(init_path, var, sizeof(init_path));
freeenv(var);
}

for (path = init_path; *path != '\0'; path = next) {
while (*path == ':')
path++;
if (*path == '\0')
break;
for (next = path; *next != '\0' && *next != ':'; next++)
/* nothing */ ;
if (bootverbose)
printf("start_init: trying %.*s\n", (int)(next - path),
path);

/*
 * Move out the boot flag argument.
 */
options = 0;
ucp = (char *)p->p_sysent->sv_usrstack;
(void)subyte(--ucp, 0); /* trailing zero */
if (boothowto & RB_SINGLE) {
(void)subyte(--ucp, 's');
options = 1;
}
#ifdef notyet
if (boothowto & RB_FASTBOOT) {
(void)subyte(--ucp, 'f');
options = 1;
}
#endif
#ifdef BOOTCDROM
(void)subyte(--ucp, 'C');
options = 1;
#endif
if (options == 0)
(void)subyte(--ucp, '-');
(void)subyte(--ucp, '-'); /* leading hyphen */
arg1 = ucp;
/*
* Move out the file name (also arg 0).
*/
(void)subyte(--ucp, 0);
for (s = next - 1; s >= path; s--)
(void)subyte(--ucp, *s);
arg0 = ucp;
/*
* Move out the arg pointers.
*/
uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
(void)suword((caddr_t)--uap, (long)0); /* terminator */
(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
/*
* Point at the arguments.
*/
args.fname = arg0;
args.argv = uap;
args.envv = NULL;
/*
* Now try to exec the program. If can't for any reason
* other than it doesn't exist, complain.
*
* Otherwise, return via fork_trampoline() all the way
* to user mode as init!
*/
if ((error = sys_execve(td, &args)) == 0) { --------------> 执行exec 启动用户态的程序,也就是说所有的用户态进程都是由init 1号进程来创建的;
mtx_unlock(&Giant);
return;
}
if (error != ENOENT)
printf("exec %.*s: error %d\n", (int)(next - path),
path, error);
}
printf("init: not found in path %s\n", init_path);
panic("no init");
}