As usual, let's start from the fork and vfork system calls.
The fork, vfork, and clone system calls, as well as the in-kernel helper kernel_thread, all funnel into the _do_fork function.
fork:
_do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
vfork: the child must run first, which CLONE_VFORK guarantees. vfork exists for exec: a vfork'd child may only exec or exit, never return, because it shares the stack with its parent.
_do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
0, NULL, NULL, 0);
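To make that usage rule concrete, here is a minimal userspace sketch of my own (not from the kernel source): the child only execs or _exits, and the parent stays blocked until it does.

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = vfork();
    if (pid == 0) {
        /* Child runs first, borrowing the parent's address space and stack. */
        execlp("echo", "echo", "hello from vfork child", (char *)NULL);
        _exit(127);   /* only reached if exec failed; never plain return! */
    }
    /* Parent was suspended by CLONE_VFORK until the exec/_exit above. */
    waitpid(pid, NULL, 0);
    return 0;
}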
clone:
_do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls)
clone: the flags are whatever the caller needs, and a stack address is passed in for the user stack; the kernel stack is fixed at 4K/8K (one or two pages).
kernel_thread:
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL, 0);
}
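kernel_thread itself is kernel-internal; module and driver code normally goes through the kthread wrapper API, which ends up in the same place. A minimal sketch, assuming a module context (my_thread_fn and the thread name are made up for illustration):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
    /* Run until someone calls kthread_stop() on this task. */
    while (!kthread_should_stop()) {
        /* ... periodic work ... */
        msleep(1000);
    }
    return 0;
}

/* In module init: create and immediately wake the kernel thread. */
struct task_struct *t = kthread_run(my_thread_fn, NULL, "my-worker");
if (IS_ERR(t))
    return PTR_ERR(t);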
Flag notes:
SIGCHLD: present in both fork and vfork when creating a child. It is a signal number; it is also what the kernel sends to the parent when the child exits.
CLONE_VM: share the virtual address space. This is the thread/process distinction: a process has its own address space, a thread does not. vfork, clone-with-CLONE_VM, and kernel_thread all skip building a fresh address space, so in that sense vfork and such clone calls create threads, while kernel_thread creates a kernel thread, which has only kernel space and no user space. Note that kernel space is shared not just among kernel threads but among all processes and threads.
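A quick userspace demonstration of what CLONE_VM buys you, using glibc's clone() wrapper (a sketch of mine; the fixed 64 KiB stack and the downward-growing-stack assumption are simplifications):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int shared = 0;

static int child_fn(void *arg)
{
    shared = 42;   /* visible to the parent: one address space */
    return 0;
}

int main(void)
{
    char *stack = malloc(64 * 1024);
    /* Stacks grow down on most architectures, so pass the top. */
    pid_t pid = clone(child_fn, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);
    waitpid(pid, NULL, 0);
    printf("shared = %d\n", shared);   /* prints 42 */
    free(stack);
    return 0;
}

Drop CLONE_VM and the parent prints 0 instead: the child then writes into its own copy of the memory.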
CLONE_VFORK:
Look at the comment on its definition:
#define CLONE_VFORK 0x00004000
/* set if the parent wants the child to wake it up on mm_release */
Translation: set if the parent wants the child to wake it up on mm_release. Er... talk is cheap, show me the "_do_fork" code.
CLONE_UNTRACED:
#define CLONE_UNTRACED 0x00800000
/* set if the tracing process can't force CLONE_PTRACE on this clone */
Equally cryptic on its own... enough talk, into _do_fork!
_do_fork
Mind map:
Reading the code
1. Interface:
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
The parameters: clone_flags; stack information; a pointer where the parent wants the new task's TID stored; a pointer for the child side. About that stack information: recall that in FreeRTOS/uC/OS you must allocate a task's stack yourself, yet calling pthread_create on Linux never requires you to supply a stack. That is because glibc allocates it for us.
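And when you do want control over the stack that pthread_create eventually hands to clone, glibc exposes it through the attr API; a minimal sketch:

#include <pthread.h>
#include <stdio.h>

static void *worker(void *arg)
{
    printf("running on a caller-sized stack\n");
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pthread_attr_t attr;

    pthread_attr_init(&attr);
    /* Ask glibc for a 1 MiB stack instead of the default (often 8 MiB). */
    pthread_attr_setstacksize(&attr, 1024 * 1024);

    pthread_create(&tid, &attr, worker, NULL);
    pthread_join(tid, NULL);
    pthread_attr_destroy(&attr);
    return 0;
}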
ptrace event registration
As seen above, kernel-thread creation skips this. It exists for debugging and tracing: the parent traces the child and gets notified of these events.
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
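You can watch these events fire from userspace. A rough sketch of a tracer (error handling omitted; the event-decoding idiom is the one documented in ptrace(2)):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t child = fork();
    if (child == 0) {
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);          /* wait for the tracer to set options */
        fork();                  /* this triggers PTRACE_EVENT_FORK */
        _exit(0);
    }
    int status;
    waitpid(child, &status, 0);  /* child is stopped by SIGSTOP */
    ptrace(PTRACE_SETOPTIONS, child, NULL,
           (void *)(PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
                    PTRACE_O_TRACECLONE));
    ptrace(PTRACE_CONT, child, NULL, NULL);

    waitpid(child, &status, 0);  /* next stop should be the fork event */
    if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8)))
        printf("got PTRACE_EVENT_FORK\n");
    ptrace(PTRACE_CONT, child, NULL, NULL);   /* let it run to exit */
    waitpid(child, NULL, 0);
    return 0;
}

(The newly forked grandchild is auto-attached and left stopped here; a real tracer would continue it as well.)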
copy_process
See the mind map;
Steps:
- check for conflicting flags
- bail out if a signal is pending
- duplicate the task_struct: dup_task_struct
- sched_fork(): hook the task into the scheduler
- copy… creds, fs, files, sighand, mm, namespaces, io, thread_tls
- init_sigpending(&p->pending): initialize pending signals
- cgroup_fork: the cgroup subsystem's fork hook
Called from _do_fork:
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
Checking for conflicting flags
/*
* Don't allow sharing the root directory with processes in a different
* namespace
*/
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
/*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group with the forking task.
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
return ERR_PTR(-EINVAL);
}
CLONE_THREAD with !CLONE_SIGHAND is rejected: a thread must share signal handlers with its parent.
CLONE_SIGHAND with !CLONE_VM is rejected: sharing signal handlers requires sharing the virtual address space.
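These checks are easy to trip from userspace. For instance, CLONE_THREAD without CLONE_SIGHAND comes straight back as EINVAL; a sketch of mine:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

static int fn(void *arg) { return 0; }

int main(void)
{
    char *stack = malloc(64 * 1024);
    /* CLONE_THREAD without CLONE_SIGHAND: copy_process rejects it. */
    int ret = clone(fn, stack + 64 * 1024, CLONE_THREAD, NULL);
    if (ret == -1 && errno == EINVAL)
        printf("clone rejected with EINVAL, as expected\n");
    free(stack);
    return 0;
}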
Bail out if a signal is pending
if (signal_pending(current))
goto fork_out;
dup_task_struct
1. alloc_task_struct_node: allocate the task_struct from its dedicated slab cache.
static inline struct task_struct *alloc_task_struct_node(int node)
{
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
- alloc_thread_stack_node: allocate the kernel stack. With CONFIG_VMAP_STACK it first looks for a free entry in the per-CPU cached_stacks[] array, falling back to __vmalloc_node_range; otherwise it takes pages (or a slab object) the conventional way.
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
void *stack;
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
struct vm_struct *s;
s = this_cpu_xchg(cached_stacks[i], NULL);
if (!s)
continue;
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
tsk->stack_vm_area = s;
tsk->stack = s->addr;
return s->addr;
}
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
if (stack) {
tsk->stack_vm_area = find_vm_area(stack);
tsk->stack = stack;
}
return stack;
#else
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = page_address(page);
return tsk->stack;
}
return NULL;
#endif
}
#else
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
tsk->stack = stack;
return stack;
}
#endif
- arch_dup_task_struct: copy one task_struct into the other, equivalent to *dst = *src;
- clear_tsk_need_resched: clear the need-resched flag
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
- set_task_stack_end_magic: write a magic number at the end of the kernel stack; every schedule checks it to detect stack overflow.
void set_task_stack_end_magic(struct task_struct *tsk)
{
unsigned long *stackend;
stackend = end_of_stack(tsk);
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
#define STACK_END_MAGIC 0x57AC6E9D
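The consumer of this magic value is the scheduler: on the way into __schedule() it verifies the stack end is intact. The check looks roughly like this (paraphrased from include/linux/sched/task_stack.h and kernel/sched/core.c; the exact location varies by kernel version):

#define task_stack_end_corrupted(task) \
        (*(end_of_stack(task)) != STACK_END_MAGIC)

/* In schedule_debug(): */
if (task_stack_end_corrupted(prev))
        panic("corrupted stack end detected inside scheduler\n");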
sched_fork
Enqueues the task into the scheduler's structures; see the scheduling-subsystem analysis.
copy…
- copy_creds
- copy_semundo
- copy_files
- copy_fs
- copy_sighand
- copy_signal
- copy_mm
- copy_namespaces
- copy_io
- copy_thread_tls
- alloc_pid /* allocate a pid */
init_sigpending(&p->pending);
cgroup_fork: the cgroup subsystem's fork hook
See my cgroup-subsystem deep dive.
wake_up_new_task
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(&p->se); /* initialize the scheduling entity's utilization */
activate_task(rq, p, ENQUEUE_NOCLOCK); /* activate: enqueue the task on the runqueue */
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}
vfork: waiting for the child to finish
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
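What wait_for_vfork_done actually waits on is a completion: the parent sleeps on it, and the child's mm_release() (run at exec or exit time) fires complete_vfork_done() to wake it. Roughly, simplified from kernel/fork.c (the real function also handles the freezer and task refcounting):

static int wait_for_vfork_done(struct task_struct *child,
                               struct completion *vfork)
{
        /* Sleep until the child execs or exits. */
        int killed = wait_for_completion_killable(vfork);

        if (killed) {   /* parent got a fatal signal while waiting */
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }
        return killed;
}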
That covers the whole flow; now let's answer a few questions.
pthread_create
pthread_create is really a wrapper around the clone system call. Let's look at the source, focusing on the clone_flags and the stack allocation.
see code:
Depending on the glibc version, pthread_create resolves to a different entry point. In pthread_create.c:
versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);
Now the __pthread_create_2_1 function:
__pthread_create_2_1 calls create_thread in linux/createthread.c.
static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
bool stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
{
/* We rely heavily on various flags the CLONE function understands:
CLONE_VM, CLONE_FS, CLONE_FILES
These flags select semantics with shared address space and
file descriptors according to what POSIX requires.
CLONE_SIGHAND, CLONE_THREAD
This flag selects the POSIX signal semantics and various
other kinds of sharing (itimers, POSIX timers, etc.).
CLONE_SETTLS
The sixth parameter to CLONE determines the TLS area for the
new thread.
CLONE_PARENT_SETTID
The kernels writes the thread ID of the newly created thread
into the location pointed to by the fifth parameters to CLONE.
Note that it would be semantically equivalent to use
CLONE_CHILD_SETTID but it is be more expensive in the kernel.
CLONE_CHILD_CLEARTID
The kernels clears the thread ID of a thread that has called
sys_exit() in the location pointed to by the seventh parameter
to CLONE.
The termination signal is chosen to be zero which means no signal
is sent. */
const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM
| CLONE_SIGHAND | CLONE_THREAD
| CLONE_SETTLS | CLONE_PARENT_SETTID
| CLONE_CHILD_CLEARTID
| 0);
TLS_DEFINE_INIT_TP (tp, pd);
if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
clone_flags, pd, &pd->tid, tp, &pd->tid)
== -1))
........
}
flags:
CLONE_VM: share the address space
CLONE_FS: share filesystem information
CLONE_FILES: share file descriptors
CLONE_SYSVSEM: share System V semaphore undo state
CLONE_SIGHAND: share signal handlers
CLONE_THREAD: place the new task in the caller's thread group
CLONE_SETTLS: set up the new thread's TLS
CLONE_PARENT_SETTID: store the new thread's TID at the location the parent passes in
CLONE_CHILD_CLEARTID: /* clear the TID in the child */ (done on exit, with a futex wake)
Stack allocation:
int err = ALLOCATE_STACK (iattr, &pd);
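The stack glibc allocated can be inspected afterwards via the GNU extension pthread_getattr_np; a small sketch:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

static void *worker(void *arg)
{
    pthread_attr_t attr;
    void *stack_addr;
    size_t stack_size;

    /* GNU extension: query the attributes of a live thread. */
    pthread_getattr_np(pthread_self(), &attr);
    pthread_attr_getstack(&attr, &stack_addr, &stack_size);
    printf("glibc gave us stack %p, size %zu\n", stack_addr, stack_size);
    pthread_attr_destroy(&attr);
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pthread_create(&tid, NULL, worker, NULL);
    pthread_join(tid, NULL);
    return 0;
}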
pthread_mutex: how the thread lock works
Q: can pthread_mutex_unlock immediately wake up a sleeping waiter?
Linux's scheduling splits into two parts: the core scheduler and the periodic (tick) scheduler. The periodic side only checks whether a reschedule is needed and sets a flag; the actual context switch is performed by the core scheduler.
A colleague had a while(1) loop that unlocked and then immediately re-locked a mutex, and the other threads blocked on that lock for a very long time. Tracing showed that the lock, once released inside the loop, was immediately re-acquired by the same thread. How can that be? Let's trace!
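Here is a minimal reproducer of that pattern (my own sketch; the imbalance is easiest to see pinned to one CPU, e.g. taskset -c 0): one thread unlocks and immediately relocks in a tight loop, and the other thread barely ever gets the mutex.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static long hog_count, other_count;
static volatile int running = 1;

static void *hog(void *arg)
{
    while (running) {
        pthread_mutex_lock(&m);   /* ...and immediately lock again */
        hog_count++;
        pthread_mutex_unlock(&m); /* unlock... */
    }
    return NULL;
}

static void *other(void *arg)
{
    while (running) {
        pthread_mutex_lock(&m);
        other_count++;
        pthread_mutex_unlock(&m);
    }
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, hog, NULL);
    pthread_create(&t2, NULL, other, NULL);
    sleep(2);
    running = 0;
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    /* hog_count typically dwarfs other_count: unlock does not hand off. */
    printf("hog=%ld other=%ld\n", hog_count, other_count);
    return 0;
}

Now let's confirm why in the source.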
pthread_mutex_unlock source analysis:
The pthread_mutex_t structure:
typedef union
{
struct __pthread_mutex_s
{
int __lock;
unsigned int __count;
int __owner;
unsigned int __nusers;
int __kind; /* mutex type */
int __spins;
__pthread_list_t __list;
#define __PTHREAD_MUTEX_HAVE_PREV 1
} __data;
char __size[__SIZEOF_PTHREAD_MUTEX_T];
long int __align;
} pthread_mutex_t;
int
internal_function attribute_hidden
__pthread_mutex_unlock_usercnt (pthread_mutex_t *mutex, int decr)
{
int type = PTHREAD_MUTEX_TYPE_ELISION (mutex);
if (__builtin_expect (type &
~(PTHREAD_MUTEX_KIND_MASK_NP|PTHREAD_MUTEX_ELISION_FLAGS_NP), 0))
return __pthread_mutex_unlock_full (mutex, decr);
if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)
== PTHREAD_MUTEX_TIMED_NP)
{
/* Always reset the owner field. */
normal:
mutex->__data.__owner = 0;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. */
lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex));
LIBC_PROBE (mutex_release, 1, mutex);
return 0;
}
else if (__glibc_likely (type == PTHREAD_MUTEX_TIMED_ELISION_NP))
{
/* Don't reset the owner/users fields for elision. */
return lll_unlock_elision (mutex->__data.__lock, mutex->__data.__elision,
PTHREAD_MUTEX_PSHARED (mutex));
}
else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
== PTHREAD_MUTEX_RECURSIVE_NP, 1))
{
/* Recursive mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto normal;
}
else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
== PTHREAD_MUTEX_ADAPTIVE_NP, 1))
goto normal;
else
{
/* Error checking mutex. */
assert (type == PTHREAD_MUTEX_ERRORCHECK_NP);
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid)
|| ! lll_islocked (mutex->__data.__lock))
return EPERM;
goto normal;
}
}
static int
internal_function
__pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
{
int newowner = 0;
switch (PTHREAD_MUTEX_TYPE (mutex))
{
case PTHREAD_MUTEX_ROBUST_RECURSIVE_NP:
/* Recursive mutex. */
if ((mutex->__data.__lock & FUTEX_TID_MASK)
== THREAD_GETMEM (THREAD_SELF, tid)
&& __builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
{
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return ENOTRECOVERABLE;
goto notrecoverable;
}
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto robust;
case PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP:
case PTHREAD_MUTEX_ROBUST_NORMAL_NP:
case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP:
if ((mutex->__data.__lock & FUTEX_TID_MASK)
!= THREAD_GETMEM (THREAD_SELF, tid)
|| ! lll_islocked (mutex->__data.__lock))
return EPERM;
/* If the previous owner died and the caller did not succeed in
making the state consistent, mark the mutex as unrecoverable
and make all waiters. */
if (__builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
notrecoverable:
newowner = PTHREAD_MUTEX_NOTRECOVERABLE;
robust:
/* Remove mutex from the list. */
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
&mutex->__data.__list.__next);
DEQUEUE_MUTEX (mutex);
mutex->__data.__owner = newowner;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. */
lll_robust_unlock (mutex->__data.__lock,
PTHREAD_ROBUST_MUTEX_PSHARED (mutex));
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
break;
/* The PI support requires the Linux futex system call. If that's not
available, pthread_mutex_init should never have allowed the type to
be set. So it will get the default case for an invalid type. */
#ifdef __NR_futex
case PTHREAD_MUTEX_PI_RECURSIVE_NP:
/* Recursive mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto continue_pi_non_robust;
case PTHREAD_MUTEX_PI_ROBUST_RECURSIVE_NP:
/* Recursive mutex. */
if ((mutex->__data.__lock & FUTEX_TID_MASK)
== THREAD_GETMEM (THREAD_SELF, tid)
&& __builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
{
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return ENOTRECOVERABLE;
goto pi_notrecoverable;
}
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto continue_pi_robust;
case PTHREAD_MUTEX_PI_ERRORCHECK_NP:
case PTHREAD_MUTEX_PI_NORMAL_NP:
case PTHREAD_MUTEX_PI_ADAPTIVE_NP:
case PTHREAD_MUTEX_PI_ROBUST_ERRORCHECK_NP:
case PTHREAD_MUTEX_PI_ROBUST_NORMAL_NP:
case PTHREAD_MUTEX_PI_ROBUST_ADAPTIVE_NP:
if ((mutex->__data.__lock & FUTEX_TID_MASK)
!= THREAD_GETMEM (THREAD_SELF, tid)
|| ! lll_islocked (mutex->__data.__lock))
return EPERM;
/* If the previous owner died and the caller did not succeed in
making the state consistent, mark the mutex as unrecoverable
and make all waiters. */
if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0
&& __builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
pi_notrecoverable:
newowner = PTHREAD_MUTEX_NOTRECOVERABLE;
if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0)
{
continue_pi_robust:
/* Remove mutex from the list.
Note: robust PI futexes are signaled by setting bit 0. */
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
(void *) (((uintptr_t) &mutex->__data.__list.__next)
| 1));
DEQUEUE_MUTEX (mutex);
}
continue_pi_non_robust:
mutex->__data.__owner = newowner;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. Load all necessary mutex data before releasing the mutex
to not violate the mutex destruction requirements (see
lll_unlock). */
int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP;
int private = (robust
? PTHREAD_ROBUST_MUTEX_PSHARED (mutex)
: PTHREAD_MUTEX_PSHARED (mutex));
/* Unlock the mutex using a CAS unless there are futex waiters or our
TID is not the value of __lock anymore, in which case we let the
kernel take care of the situation. Use release MO in the CAS to
synchronize with acquire MO in lock acquisitions. */
int l = atomic_load_relaxed (&mutex->__data.__lock);
do
{
if (((l & FUTEX_WAITERS) != 0)
|| (l != THREAD_GETMEM (THREAD_SELF, tid)))
{
INTERNAL_SYSCALL_DECL (__err);
INTERNAL_SYSCALL (futex, __err, 2, &mutex->__data.__lock,
__lll_private_flag (FUTEX_UNLOCK_PI, private));
break;
}
}
while (!atomic_compare_exchange_weak_release (&mutex->__data.__lock,
&l, 0));
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
break;
#endif /* __NR_futex. */
case PTHREAD_MUTEX_PP_RECURSIVE_NP:
/* Recursive mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto pp;
case PTHREAD_MUTEX_PP_ERRORCHECK_NP:
/* Error checking mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid)
|| (mutex->__data.__lock & ~ PTHREAD_MUTEX_PRIO_CEILING_MASK) == 0)
return EPERM;
/* FALLTHROUGH */
case PTHREAD_MUTEX_PP_NORMAL_NP:
case PTHREAD_MUTEX_PP_ADAPTIVE_NP:
/* Always reset the owner field. */
pp:
mutex->__data.__owner = 0;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. Use release MO in the CAS to synchronize with acquire MO in
lock acquisitions. */
int newval;
int oldval = atomic_load_relaxed (&mutex->__data.__lock);
do
{
newval = oldval & PTHREAD_MUTEX_PRIO_CEILING_MASK;
}
while (!atomic_compare_exchange_weak_release (&mutex->__data.__lock,
&oldval, newval));
if ((oldval & ~PTHREAD_MUTEX_PRIO_CEILING_MASK) > 1)
lll_futex_wake (&mutex->__data.__lock, 1,
PTHREAD_MUTEX_PSHARED (mutex));
int oldprio = newval >> PTHREAD_MUTEX_PRIO_CEILING_SHIFT;
LIBC_PROBE (mutex_release, 1, mutex);
return __pthread_tpp_change_priority (oldprio, -1);
default:
/* Correct code cannot set any other type. */
return EINVAL;
}
LIBC_PROBE (mutex_release, 1, mutex);
return 0;
}
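The answer to the puzzle is sitting in the fast path above: unlocking is an atomic store of 0 plus, only when the futex word says there are waiters, a single FUTEX_WAKE. Waking makes one sleeper runnable; nothing hands it the lock. In the classic three-state futex design the shape is roughly this (my simplification, not the literal lll_unlock macro):

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

/* futex word states: 0 = free, 1 = locked, 2 = locked with waiters */
static void unlock_sketch(atomic_int *futex_word)
{
    int old = atomic_exchange(futex_word, 0);   /* release the lock */
    if (old > 1) {
        /* Sleepers exist: ask the kernel to make ONE of them runnable.
         * This is a wakeup, not a hand-off; the woken thread still has
         * to win the lock race once it actually gets CPU time. */
        syscall(SYS_futex, futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
    }
}

So pthread_mutex_unlock does promptly wake a sleeper when one exists, but the unlocking thread keeps its timeslice; if it relocks right away, it usually wins the race, which is exactly the behavior traced above.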
So what happens inside the kernel when you call pthread_mutex_lock again right after pthread_mutex_unlock?
That turns out to be a large topic involving the futex mechanism; to be continued in the next installment. Thanks.