复制寄存器
上篇说到copy_process创建进程的时候,会调用copy_thread来复制一些寄存器:对于内核线程,X19存放线程函数的地址,X20存放线程函数的参数;对于用户线程,主要是保存用户模式的各种寄存器状态,便于从内核切回用户模式时恢复正常状态
/*
 * copy_thread - initialise the saved register state of the new task p.
 *
 * User task (PF_KTHREAD clear): the child's pt_regs is a copy of the
 * current task's pt_regs with x0 forced to 0; a non-zero stack_start
 * replaces the stack pointer, and CLONE_SETTLS takes the TLS value from
 * the child's regs[3].
 * Kernel thread: pt_regs is zeroed, pstate is PSR_MODE_EL1h, and
 * stack_start / stk_sz are stored in cpu_context.x19 / x20.
 *
 * In both cases cpu_context.pc is set to ret_from_fork and cpu_context.sp
 * to the child's pt_regs.
 *
 * Returns 0 on success, or -EINVAL when a non-compat task passes a
 * non-zero stack_start that is not 16-byte aligned.
 */
int copy_thread(unsigned long clone_flags, unsigned long stack_start,
unsigned long stk_sz, struct task_struct *p)
{
struct pt_regs *childregs = task_pt_regs(p);
/* Start from an all-zero saved CPU context. */
memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context));
/*
 * In case p was allocated the same task_struct pointer as some
 * other recently-exited task, make sure p is disassociated from
 * any cpu that may have run that now-exited task recently.
 * Otherwise we could erroneously skip reloading the FPSIMD
 * registers for p.
 */
fpsimd_flush_task_state(p);
if (likely(!(p->flags & PF_KTHREAD))) { // user task
*childregs = *current_pt_regs(); // save the user-mode registers into the pt_regs frame at the base of the child's kernel stack
childregs->regs[0] = 0; // x0 carries the syscall return value, so the child sees 0
/*
 * Read the current TLS pointer from tpidr_el0 as it may be
 * out-of-sync with the saved value.
 */
asm("mrs %0, tpidr_el0" : "=r" (*task_user_tls(p)));
if (stack_start) {
if (is_compat_thread(task_thread_info(p)))
childregs->compat_sp = stack_start;
/* 16-byte aligned stack mandatory on AArch64 */
else if (stack_start & 15)
return -EINVAL;
else
childregs->sp = stack_start;
}
/*
 * If a TLS pointer was passed to clone (4th argument), use it
 * for the new thread.
 */
if (clone_flags & CLONE_SETTLS)
p->thread.tp_value = childregs->regs[3];
} else {// kernel thread
memset(childregs, 0, sizeof(struct pt_regs));
childregs->pstate = PSR_MODE_EL1h;
/*
 * ret_from_fork executes "mov x0, x20; blr x19" when x19 is non-zero,
 * so x19 carries the thread function and x20 its argument.
 */
p->thread.cpu_context.x19 = stack_start;
p->thread.cpu_context.x20 = stk_sz;
}
/*
 * cpu_switch_to loads the saved pc into lr and the saved sp into the
 * stack pointer, so the first switch to p resumes at ret_from_fork with
 * sp pointing at its pt_regs.
 */
p->thread.cpu_context.pc = (unsigned long)ret_from_fork;
p->thread.cpu_context.sp = (unsigned long)childregs;
ptrace_hw_copy_thread(p);
return 0;
}
thread_struct
struct thread_struct保存的是与CPU相关的任务状态(CPU-specific state of this task);其中cpu_context里的x19-x28是被调用者保存(callee-saved)的寄存器,上下文切换时需要保存和恢复
/*
 * CPU-specific state of a task. cpu_switch_to reaches cpu_context by
 * adding THREAD_CPU_CONTEXT to a task_struct pointer, so the position of
 * that member matters to the assembly code.
 */
struct thread_struct {
struct cpu_context cpu_context; /* cpu context */
unsigned long tp_value; /* TLS register */
#ifdef CONFIG_COMPAT
/* Second TLS-related value, present only with CONFIG_COMPAT (its exact use is not visible here). */
unsigned long tp2_value;
#endif
/* Presumably the saved FP/SIMD register state -- confirm against the fpsimd code. */
struct fpsimd_state fpsimd_state;
unsigned long fault_address; /* fault info */
unsigned long fault_code; /* ESR_EL1 value */
struct debug_info debug; /* debugging */
};
/*
 * Registers saved and restored by cpu_switch_to on a context switch.
 *
 * cpu_switch_to stores and loads these with consecutive stp/ldp pairs
 * (x19/x20 ... x27/x28, then x29/sp, then lr), so the field order below
 * must match that sequence exactly.
 */
struct cpu_context {
unsigned long x19;
unsigned long x20;
unsigned long x21;
unsigned long x22;
unsigned long x23;
unsigned long x24;
unsigned long x25;
unsigned long x26;
unsigned long x27;
unsigned long x28;
unsigned long fp; /* x29 is stored here by cpu_switch_to */
unsigned long sp;
unsigned long pc; /* cpu_switch_to stores lr here and reloads it into lr */
};
唤醒进程
然后唤醒新进程,主要是设置进程状态,将进程插入运行队列,再判断是否可以抢占当前进程,不行的就只能等调度了
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 *
 * p->pi_lock is taken first (saving the interrupt flags in rf), then the
 * runqueue lock; both the enqueue and the preemption check run under the
 * runqueue lock.
 */
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING; // set the task state to TASK_RUNNING
#ifdef CONFIG_SMP
/*
 * Fork balancing, do it here and not earlier because:
 * - cpus_allowed can change in the fork path
 * - any previously selected CPU might disappear through hotplug
 *
 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
 * as we're not fully set-up yet.
 */
p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); // pick a CPU via select_task_rq() with SD_BALANCE_FORK; __set_task_cpu() records it WITHOUT calling migrate_task_rq
#endif
rq = __task_rq_lock(p, &rf); // lock the task's runqueue
update_rq_clock(rq);
post_init_entity_util_avg(&p->se);
activate_task(rq, p, ENQUEUE_NOCLOCK); // enqueue the new task on the runqueue
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK); // check whether the new task should preempt the currently running one
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
 * Nothing relies on rq->lock after this, so its fine to
 * drop it.
 */
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p); // call the scheduling class's task_woken hook
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf); // release the runqueue lock (presumably also p->pi_lock taken above -- confirm)
}
ret_from_fork
由copy_thread中的p->thread.cpu_context.pc = (unsigned long)ret_from_fork;可知,新进程第一次执行时的pc就是ret_from_fork。也就是说,不管新进程是立即抢占执行,还是稍后才被调度,它首先执行的都是ret_from_fork;当然,如果进程已经执行过,cpu_context中的pc就是它上次被切换出去时保存的返回地址了
/*
 * This is how we return from a fork.
 *
 * x19/x20 are restored from the new task's cpu_context by cpu_switch_to.
 * copy_thread leaves x19 at 0 for user tasks and stores the thread
 * function (x19) and its argument (x20) for kernel threads.
 */
ENTRY(ret_from_fork)
bl schedule_tail // post-switch cleanup work
cbz x19, 1f // x19 == 0: not a kernel thread, skip the call below
mov x0, x20 // pass x20 as the first argument
blr x19 // kernel thread: call the thread function stored in x19
1: get_thread_info tsk // load sp_el0 into tsk (x28)
b ret_to_user // return to user mode
ENDPROC(ret_from_fork)
获取thread_info
get_thread_info是一个汇编宏,参数rd是目标寄存器:它用mrs指令把SP_EL0(异常级别EL0的栈指针寄存器)的值读到目标寄存器\rd中
/*
 * Return the current thread_info.
 *
 * \rd: destination register; it receives the value read from sp_el0.
 * cpu_switch_to is where sp_el0 gets written on every context switch.
 */
.macro get_thread_info, rd
mrs \rd, sp_el0
.endm
// Register aliases; for example, tsk is an alias for register x28.
sc_nr .req x25 // number of system calls
scno .req x26 // syscall number
stbl .req x27 // syscall table pointer
tsk .req x28 // current thread_info
寄存器保存
那么sp_el0的值又是在哪里赋值的?上下文切换时会调用cpu_switch_to(prev, next),参数next就是下一个进程的进程描述符,也就是第二个参数x1,cpu_switch_to会把它写入sp_el0
处理器状态切换时,先将前一个进程的x19-x28、fp、sp、pc保存到其进程描述符的cpu_context中,然后将即将执行的进程的cpu_context中的x19-x28、fp、sp、pc恢复到相应寄存器中,并且(在配置了CONFIG_THREAD_INFO_IN_TASK时)将即将执行的进程的进程描述符task_struct的地址存放在sp_el0中,用于通过current找到当前进程,这样就完成了处理器的状态切换
/*
 * Register switch for AArch64. The callee-saved registers need to be saved
 * and restored. On entry:
 * x0 = previous task_struct (must be preserved across the switch)
 * x1 = next task_struct
 * Previous and next are guaranteed not to be the same.
 *
 * The store/load sequences below walk struct cpu_context in field order:
 * x19..x28, fp (x29), sp, pc (lr).
 */
ENTRY(cpu_switch_to)
mov x10, #THREAD_CPU_CONTEXT // offset added to a task_struct pointer to reach its saved context (presumably thread.cpu_context -- confirm)
add x8, x0, x10 // x8 = save area of the previous task
mov x9, sp // copy sp into x9 so it can be stored as a pair with x29
stp x19, x20, [x8], #16 // store callee-saved registers
stp x21, x22, [x8], #16
stp x23, x24, [x8], #16
stp x25, x26, [x8], #16
stp x27, x28, [x8], #16
stp x29, x9, [x8], #16 // fp and sp
str lr, [x8] // the return address goes into the pc slot
add x8, x1, x10 // x8 = save area of the next task
ldp x19, x20, [x8], #16 // restore callee-saved registers
ldp x21, x22, [x8], #16
ldp x23, x24, [x8], #16
ldp x25, x26, [x8], #16
ldp x27, x28, [x8], #16
ldp x29, x9, [x8], #16 // fp and (via x9) sp
ldr lr, [x8] // saved pc becomes the return address used by ret below
mov sp, x9 // switch to the next task's kernel stack
#ifdef CONFIG_THREAD_INFO_IN_TASK
// sp_el0 = next task_struct
msr sp_el0, x1
#else
// sp_el0 = next task's sp rounded down to a THREAD_SIZE boundary
and x9, x9, #~(THREAD_SIZE - 1)
msr sp_el0, x9
#endif
ret // continue at the next task's saved pc (ret_from_fork for a task that has never run)
ENDPROC(cpu_switch_to)
装载程序
在用户空间调用fork后会陷入内核;子进程从ret_from_fork返回用户模式后,通常会调用execve来装载要运行的程序
execve/execveat系统调用
do_execveat_common
do_open_execat //打开可执行文件
sched_exec //执行一次负载均衡
bprm_mm_init //创建新的内存描述符(mm_struct)
mm_alloc
__bprm_mm_init
prepare_binprm //设置进程凭证(credentials),读文件的前128bytes到缓冲区
copy_strings //把文件名称,环境字符串,参数字符串压到用户栈
exec_binprm //寻找注册过的二进制格式的处理程序,识别到即调用struct linux_binfmt->load_binary来加载程序
search_binary_handler
装载elf程序
elf是一种二进制格式,它对应的处理程序elf_format也会注册到内核中;装载的如果是elf程序,就会匹配到这个linux_binfmt,并调用其中的成员load_binary
/*
 * Binary-format descriptor for ELF. When an executable matches this
 * format, the exec path calls ->load_binary, i.e. load_elf_binary.
 */
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary, /* loads an ELF executable */
.load_shlib = load_elf_library, /* presumably the shared-library loader hook -- confirm */
.core_dump = elf_core_dump, /* presumably writes an ELF core dump -- confirm */
.min_coredump = ELF_EXEC_PAGESIZE,
};
对应的就是load_elf_binary,其框架如下
最后就会调用start_thread,来设置结构体pt_regs中的pc为程序入口,sp为用户栈指针,返回到用户模式,就会按照pc,sp来正常执行程序了
/*
 * Reset the register frame for a freshly loaded program: clear every
 * field of *regs, then set the program counter to the entry point pc.
 */
static inline void start_thread_common(struct pt_regs *regs, unsigned long pc)
{
memset(regs, 0, sizeof(*regs));
/* NOTE(review): presumably marks regs as not being inside a syscall -- confirm. */
forget_syscall(regs);
regs->pc = pc;
}
/*
 * Prepare *regs so that the return to user mode starts the new program:
 * pc is the entry point and sp the user stack pointer.
 *
 * start_thread_common() zeroes the frame first, so pstate and sp must be
 * assigned after that call.
 */
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
unsigned long sp)
{
start_thread_common(regs, pc);
regs->pstate = PSR_MODE_EL0t;
/* Set PSR_SSBS_BIT unless the SSBD state is ARM64_SSBD_FORCE_ENABLE. */
if (arm64_get_ssbd_state() != ARM64_SSBD_FORCE_ENABLE)
regs->pstate |= PSR_SSBS_BIT;
regs->sp = sp;
}
装载总结
1.创建一个独立的虚拟地址空间:此处的创建虚拟地址空间并不是真正的空间,而是创建映射函数所需要的相应的数据结构,比如页表,它存着虚拟空间和物理空间的联系。
2.读取可执行文件头并且建立虚拟空间与可执行文件的映射关系:当程序发生页错误时,操作系统将从物理内存中分配一个物理页,然后将该“缺页”从磁盘中读取到内存中,再设置缺页的虚拟页和物理页的映射关系,这样程序才得以正常运行。设置缺页的虚拟页和物理页的映射关系(数据结构)是装载的核心。
3.将CPU的指令寄存器设置成可执行文件的入口地址,启动运行。
进程状态
进程从开始到结束,中间会经历多个状态;linux定义的进程状态虽然不少,但其中很多是中间状态,或者是几个状态的组合
/*
 * Task state flags. Every basic state other than TASK_RUNNING (which is 0)
 * is a distinct bit, so the convenience macros further down can OR several
 * of them together.
 */
/* Used in tsk->state: */
#define TASK_RUNNING 0x0000
#define TASK_INTERRUPTIBLE 0x0001
#define TASK_UNINTERRUPTIBLE 0x0002
#define __TASK_STOPPED 0x0004
#define __TASK_TRACED 0x0008
/* Used in tsk->exit_state: */
#define EXIT_DEAD 0x0010
#define EXIT_ZOMBIE 0x0020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
#define TASK_PARKED 0x0040
#define TASK_DEAD 0x0080
#define TASK_WAKEKILL 0x0100
#define TASK_WAKING 0x0200
#define TASK_NOLOAD 0x0400
#define TASK_NEW 0x0800
/* One bit above the highest state bit defined here (TASK_NEW). */
#define TASK_STATE_MAX 0x1000
/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
/*
 * get_task_state():
 * TASK_RUNNING is 0, so it contributes no bits to this mask.
 */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
__TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
TASK_PARKED)
我们一般就关注这几个状态就行
几个重要的状态的转换流程如下