As usual, let's start from the fork and vfork system calls.
The fork, vfork, and clone system calls, as well as the in-kernel helper kernel_thread, all funnel into the _do_fork function.
fork:
_do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
vfork: the child must run first, which CLONE_VFORK guarantees. vfork exists for exec: a vfork'd child may only exec or exit, never return, because it shares the stack with its parent.
_do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
0, NULL, NULL, 0);
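To make that usage rule concrete, here is a minimal userspace sketch of my own (not from the kernel source): the child only execs or _exits, and the parent stays blocked until it does.

#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

int main(void)
{
    pid_t pid = vfork();
    if (pid == 0) {
        /* Child runs first, borrowing the parent's address space and stack. */
        execlp("echo", "echo", "hello from vfork child", (char *)NULL);
        _exit(127);   /* only reached if exec failed; never plain return! */
    }
    /* Parent was suspended by CLONE_VFORK until the exec/_exit above. */
    waitpid(pid, NULL, 0);
    return 0;
}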
clone:
_do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls)
clone: the flags are whatever the caller needs, and a stack address is passed in for the user stack; the kernel stack is fixed at 4K/8K (one or two pages).
kernel_thread:
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL, 0);
}
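kernel_thread itself is kernel-internal; module and driver code normally goes through the kthread wrapper API, which ends up in the same place. A minimal sketch, assuming a module context (my_thread_fn and the thread name are made up for illustration):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static int my_thread_fn(void *data)
{
    /* Run until someone calls kthread_stop() on this task. */
    while (!kthread_should_stop()) {
        /* ... periodic work ... */
        msleep(1000);
    }
    return 0;
}

/* In module init: create and immediately wake the kernel thread. */
struct task_struct *t = kthread_run(my_thread_fn, NULL, "my-worker");
if (IS_ERR(t))
    return PTR_ERR(t);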
Flag notes:
SIGCHLD: present in both fork and vfork when creating a child. It is a signal number; it is also what the kernel sends to the parent when the child exits.
CLONE_VM: share the virtual address space. This is the thread/process distinction: a process has its own address space, a thread does not. vfork, clone-with-CLONE_VM, and kernel_thread all skip building a fresh address space, so in that sense vfork and such clone calls create threads, while kernel_thread creates a kernel thread, which has only kernel space and no user space. Note that kernel space is shared not just among kernel threads but among all processes and threads.
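A quick userspace demonstration of what CLONE_VM buys you, using glibc's clone() wrapper (a sketch of mine; the fixed 64 KiB stack and the downward-growing-stack assumption are simplifications):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int shared = 0;

static int child_fn(void *arg)
{
    shared = 42;   /* visible to the parent: one address space */
    return 0;
}

int main(void)
{
    char *stack = malloc(64 * 1024);
    /* Stacks grow down on most architectures, so pass the top. */
    pid_t pid = clone(child_fn, stack + 64 * 1024, CLONE_VM | SIGCHLD, NULL);
    waitpid(pid, NULL, 0);
    printf("shared = %d\n", shared);   /* prints 42 */
    free(stack);
    return 0;
}

Drop CLONE_VM and the parent prints 0 instead: the child then writes into its own copy of the memory.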
CLONE_VFORK:
Look at the comment on its definition:
#define CLONE_VFORK 0x00004000
/* set if the parent wants the child to wake it up on mm_release */
Translation: set if the parent wants the child to wake it up on mm_release. Er... talk is cheap, show me the "_do_fork" code.
CLONE_UNTRACED:
#define CLONE_UNTRACED 0x00800000
/* set if the tracing process can't force CLONE_PTRACE on this clone */
Equally cryptic on its own... enough talk, into _do_fork!
_do_fork
Mind map:
Reading the code
1. Interface:
long _do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr,
unsigned long tls)
The parameters: clone_flags; stack information; a pointer where the parent wants the new task's TID stored; a pointer for the child side. About that stack information: recall that in FreeRTOS/uC/OS you must allocate a task's stack yourself, yet calling pthread_create on Linux never requires you to supply a stack. That is because glibc allocates it for us.
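And when you do want control over the stack that pthread_create eventually hands to clone, glibc exposes it through the attr API; a minimal sketch:

#include <pthread.h>
#include <stdio.h>

static void *worker(void *arg)
{
    printf("running on a caller-sized stack\n");
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pthread_attr_t attr;

    pthread_attr_init(&attr);
    /* Ask glibc for a 1 MiB stack instead of the default (often 8 MiB). */
    pthread_attr_setstacksize(&attr, 1024 * 1024);

    pthread_create(&tid, &attr, worker, NULL);
    pthread_join(tid, NULL);
    pthread_attr_destroy(&attr);
    return 0;
}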
ptrace event registration
As seen above, kernel-thread creation skips this. It exists for debugging and tracing: the parent traces the child and gets notified of these events.
if (!(clone_flags & CLONE_UNTRACED)) {
if (clone_flags & CLONE_VFORK)
trace = PTRACE_EVENT_VFORK;
else if ((clone_flags & CSIGNAL) != SIGCHLD)
trace = PTRACE_EVENT_CLONE;
else
trace = PTRACE_EVENT_FORK;
if (likely(!ptrace_event_enabled(current, trace)))
trace = 0;
}
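You can watch these events fire from userspace. A rough sketch of a tracer (error handling omitted; the event-decoding idiom is the one documented in ptrace(2)):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    pid_t child = fork();
    if (child == 0) {
        ptrace(PTRACE_TRACEME, 0, NULL, NULL);
        raise(SIGSTOP);          /* wait for the tracer to set options */
        fork();                  /* this triggers PTRACE_EVENT_FORK */
        _exit(0);
    }
    int status;
    waitpid(child, &status, 0);  /* child is stopped by SIGSTOP */
    ptrace(PTRACE_SETOPTIONS, child, NULL,
           (void *)(PTRACE_O_TRACEFORK | PTRACE_O_TRACEVFORK |
                    PTRACE_O_TRACECLONE));
    ptrace(PTRACE_CONT, child, NULL, NULL);

    waitpid(child, &status, 0);  /* next stop should be the fork event */
    if (status >> 8 == (SIGTRAP | (PTRACE_EVENT_FORK << 8)))
        printf("got PTRACE_EVENT_FORK\n");
    ptrace(PTRACE_CONT, child, NULL, NULL);   /* let it run to exit */
    waitpid(child, NULL, 0);
    return 0;
}

(The newly forked grandchild is auto-attached and left stopped here; a real tracer would continue it as well.)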
copy_process
See the mind map;
Steps:
- check for conflicting flags
- bail out if a signal is pending
- duplicate the task_struct: dup_task_struct
- sched_fork(): hook the task into the scheduler
- copy… creds, fs, files, sighand, mm, namespaces, io, thread_tls
- init_sigpending(&p->pending): initialize pending signals
- cgroup_fork: the cgroup subsystem's fork hook
Called from _do_fork:
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
Checking for conflicting flags
/*
* Don't allow sharing the root directory with processes in a different
* namespace
*/
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
*/
if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
return ERR_PTR(-EINVAL);
/*
* Shared signal handlers imply shared VM. By way of the above,
* thread groups also imply shared VM. Blocking this case allows
* for various simplifications in other code.
*/
if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
return ERR_PTR(-EINVAL);
/*
* Siblings of global init remain as zombies on exit since they are
* not reaped by their parent (swapper). To solve this and to avoid
* multi-rooted process trees, prevent global and container-inits
* from creating siblings.
*/
if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
/*
* If the new process will be in a different pid or user namespace
* do not allow it to share a thread group with the forking task.
*/
if (clone_flags & CLONE_THREAD) {
if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) !=
current->nsproxy->pid_ns_for_children))
return ERR_PTR(-EINVAL);
}
CLONE_THREAD with !CLONE_SIGHAND is rejected: a thread must share signal handlers with its parent.
CLONE_SIGHAND with !CLONE_VM is rejected: sharing signal handlers requires sharing the virtual address space.
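These checks are easy to trip from userspace. For instance, CLONE_THREAD without CLONE_SIGHAND comes straight back as EINVAL; a sketch of mine:

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

static int fn(void *arg) { return 0; }

int main(void)
{
    char *stack = malloc(64 * 1024);
    /* CLONE_THREAD without CLONE_SIGHAND: copy_process rejects it. */
    int ret = clone(fn, stack + 64 * 1024, CLONE_THREAD, NULL);
    if (ret == -1 && errno == EINVAL)
        printf("clone rejected with EINVAL, as expected\n");
    free(stack);
    return 0;
}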
Bail out if a signal is pending
if (signal_pending(current))
goto fork_out;
dup_task_struct
1. alloc_task_struct_node: allocate the task_struct from its dedicated slab cache.
static inline struct task_struct *alloc_task_struct_node(int node)
{
return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}
- alloc_thread_stack_node: allocate the kernel stack. With CONFIG_VMAP_STACK it first looks for a free entry in the per-CPU cached_stacks[] array, falling back to __vmalloc_node_range; otherwise it takes pages (or a slab object) the conventional way.
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
{
#ifdef CONFIG_VMAP_STACK
void *stack;
int i;
for (i = 0; i < NR_CACHED_STACKS; i++) {
struct vm_struct *s;
s = this_cpu_xchg(cached_stacks[i], NULL);
if (!s)
continue;
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
tsk->stack_vm_area = s;
tsk->stack = s->addr;
return s->addr;
}
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_ALIGN,
VMALLOC_START, VMALLOC_END,
THREADINFO_GFP,
PAGE_KERNEL,
0, node, __builtin_return_address(0));
/*
* We can't call find_vm_area() in interrupt context, and
* free_thread_stack() can be called in interrupt context,
* so cache the vm_struct.
*/
if (stack) {
tsk->stack_vm_area = find_vm_area(stack);
tsk->stack = stack;
}
return stack;
#else
struct page *page = alloc_pages_node(node, THREADINFO_GFP,
THREAD_SIZE_ORDER);
if (likely(page)) {
tsk->stack = page_address(page);
return tsk->stack;
}
return NULL;
#endif
}
#else
static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
unsigned long *stack;
stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
tsk->stack = stack;
return stack;
}
#endif
- arch_dup_task_struct: copy one task_struct into the other, equivalent to *dst = *src;
- clear_tsk_need_resched: clear the need-resched flag
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
- set_task_stack_end_magic: write a magic number at the end of the kernel stack; every schedule checks it to detect stack overflow.
void set_task_stack_end_magic(struct task_struct *tsk)
{
unsigned long *stackend;
stackend = end_of_stack(tsk);
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
#define STACK_END_MAGIC 0x57AC6E9D
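The consumer of this magic value is the scheduler: on the way into __schedule() it verifies the stack end is intact. The check looks roughly like this (paraphrased from include/linux/sched/task_stack.h and kernel/sched/core.c; the exact location varies by kernel version):

#define task_stack_end_corrupted(task) \
        (*(end_of_stack(task)) != STACK_END_MAGIC)

/* In schedule_debug(): */
if (task_stack_end_corrupted(prev))
        panic("corrupted stack end detected inside scheduler\n");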
sched_fork
Enqueues the task into the scheduler's structures; see the scheduling-subsystem analysis.
copy…
- copy_creds
- copy_semundo
- copy_files
- copy_fs
- copy_sighand
- copy_signal
- copy_mm
- copy_namespaces
- copy_io
- copy_thread_tls
- alloc_pid /* allocate a pid */
init_sigpending(&p->pending);
cgroup_fork: the cgroup subsystem's fork hook
See my cgroup-subsystem deep dive.
wake_up_new_task
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(&p->se); /* initialize the scheduling entity's utilization */
activate_task(rq, p, ENQUEUE_NOCLOCK); /* activate: enqueue the task on the runqueue */
p->on_rq = TASK_ON_RQ_QUEUED;
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}
vfork: waiting for the child to finish
if (clone_flags & CLONE_VFORK) {
if (!wait_for_vfork_done(p, &vfork))
ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
}
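What wait_for_vfork_done actually waits on is a completion: the parent sleeps on it, and the child's mm_release() (run at exec or exit time) fires complete_vfork_done() to wake it. Roughly, simplified from kernel/fork.c (the real function also handles the freezer and task refcounting):

static int wait_for_vfork_done(struct task_struct *child,
                               struct completion *vfork)
{
        /* Sleep until the child execs or exits. */
        int killed = wait_for_completion_killable(vfork);

        if (killed) {   /* parent got a fatal signal while waiting */
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }
        return killed;
}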
That covers the whole flow; now let's answer a few questions.
pthread_create
pthread_create is really a wrapper around the clone system call. Let's look at the source, focusing on the clone_flags and the stack allocation.
see code:
Depending on the glibc version, pthread_create resolves to a different entry point. In pthread_create.c:
versioned_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);
Now the __pthread_create_2_1 function:
__pthread_create_2_1 calls create_thread in linux/createthread.c.
static int
create_thread (struct pthread *pd, const struct pthread_attr *attr,
bool stopped_start, STACK_VARIABLES_PARMS, bool *thread_ran)
{
/* We rely heavily on various flags the CLONE function understands:
CLONE_VM, CLONE_FS, CLONE_FILES
These flags select semantics with shared address space and
file descriptors according to what POSIX requires.
CLONE_SIGHAND, CLONE_THREAD
This flag selects the POSIX signal semantics and various
other kinds of sharing (itimers, POSIX timers, etc.).
CLONE_SETTLS
The sixth parameter to CLONE determines the TLS area for the
new thread.
CLONE_PARENT_SETTID
The kernels writes the thread ID of the newly created thread
into the location pointed to by the fifth parameters to CLONE.
Note that it would be semantically equivalent to use
CLONE_CHILD_SETTID but it is be more expensive in the kernel.
CLONE_CHILD_CLEARTID
The kernels clears the thread ID of a thread that has called
sys_exit() in the location pointed to by the seventh parameter
to CLONE.
The termination signal is chosen to be zero which means no signal
is sent. */
const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM
| CLONE_SIGHAND | CLONE_THREAD
| CLONE_SETTLS | CLONE_PARENT_SETTID
| CLONE_CHILD_CLEARTID
| 0);
TLS_DEFINE_INIT_TP (tp, pd);
if (__glibc_unlikely (ARCH_CLONE (&start_thread, STACK_VARIABLES_ARGS,
clone_flags, pd, &pd->tid, tp, &pd->tid)
== -1))
........
}
flags:
CLONE_VM: share the address space
CLONE_FS: share filesystem information
CLONE_FILES: share file descriptors
CLONE_SYSVSEM: share System V semaphore undo state
CLONE_SIGHAND: share signal handlers
CLONE_THREAD: place the new task in the caller's thread group
CLONE_SETTLS: set up the new thread's TLS
CLONE_PARENT_SETTID: store the new thread's TID at the location the parent passes in
CLONE_CHILD_CLEARTID: /* clear the TID in the child */ (done on exit, with a futex wake)
Stack allocation:
int err = ALLOCATE_STACK (iattr, &pd);
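The stack glibc allocated can be inspected afterwards via the GNU extension pthread_getattr_np; a small sketch:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

static void *worker(void *arg)
{
    pthread_attr_t attr;
    void *stack_addr;
    size_t stack_size;

    /* GNU extension: query the attributes of a live thread. */
    pthread_getattr_np(pthread_self(), &attr);
    pthread_attr_getstack(&attr, &stack_addr, &stack_size);
    printf("glibc gave us stack %p, size %zu\n", stack_addr, stack_size);
    pthread_attr_destroy(&attr);
    return NULL;
}

int main(void)
{
    pthread_t tid;
    pthread_create(&tid, NULL, worker, NULL);
    pthread_join(tid, NULL);
    return 0;
}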
pthread_mutex: how the thread lock works
Q: can pthread_mutex_unlock immediately wake up a sleeping waiter?
Linux's scheduling splits into two parts: the core scheduler and the periodic (tick) scheduler. The periodic side only checks whether a reschedule is needed and sets a flag; the actual context switch is performed by the core scheduler.
A colleague had a while(1) loop that unlocked and then immediately re-locked a mutex, and the other threads blocked on that lock for a very long time. Tracing showed that the lock, once released inside the loop, was immediately re-acquired by the same thread. How can that be? Let's trace!
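Here is a minimal reproducer of that pattern (my own sketch; the imbalance is easiest to see pinned to one CPU, e.g. taskset -c 0): one thread unlocks and immediately relocks in a tight loop, and the other thread barely ever gets the mutex.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
static long hog_count, other_count;
static volatile int running = 1;

static void *hog(void *arg)
{
    while (running) {
        pthread_mutex_lock(&m);   /* ...and immediately lock again */
        hog_count++;
        pthread_mutex_unlock(&m); /* unlock... */
    }
    return NULL;
}

static void *other(void *arg)
{
    while (running) {
        pthread_mutex_lock(&m);
        other_count++;
        pthread_mutex_unlock(&m);
    }
    return NULL;
}

int main(void)
{
    pthread_t t1, t2;
    pthread_create(&t1, NULL, hog, NULL);
    pthread_create(&t2, NULL, other, NULL);
    sleep(2);
    running = 0;
    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    /* hog_count typically dwarfs other_count: unlock does not hand off. */
    printf("hog=%ld other=%ld\n", hog_count, other_count);
    return 0;
}

Now let's confirm why in the source.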
pthread_mutex_unlock source analysis:
The pthread_mutex_t structure:
typedef union
{
struct __pthread_mutex_s
{
int __lock;
unsigned int __count;
int __owner;
unsigned int __nusers;
int __kind; /* mutex type */
int __spins;
__pthread_list_t __list;
#define __PTHREAD_MUTEX_HAVE_PREV 1
} __data;
char __size[__SIZEOF_PTHREAD_MUTEX_T];
long int __align;
} pthread_mutex_t;
int
internal_function attribute_hidden
__pthread_mutex_unlock_usercnt (pthread_mutex_t *mutex, int decr)
{
int type = PTHREAD_MUTEX_TYPE_ELISION (mutex);
if (__builtin_expect (type &
~(PTHREAD_MUTEX_KIND_MASK_NP|PTHREAD_MUTEX_ELISION_FLAGS_NP), 0))
return __pthread_mutex_unlock_full (mutex, decr);
if (__builtin_expect (type, PTHREAD_MUTEX_TIMED_NP)
== PTHREAD_MUTEX_TIMED_NP)
{
/* Always reset the owner field. */
normal:
mutex->__data.__owner = 0;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. */
lll_unlock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex));
LIBC_PROBE (mutex_release, 1, mutex);
return 0;
}
else if (__glibc_likely (type == PTHREAD_MUTEX_TIMED_ELISION_NP))
{
/* Don't reset the owner/users fields for elision. */
return lll_unlock_elision (mutex->__data.__lock, mutex->__data.__elision,
PTHREAD_MUTEX_PSHARED (mutex));
}
else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
== PTHREAD_MUTEX_RECURSIVE_NP, 1))
{
/* Recursive mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto normal;
}
else if (__builtin_expect (PTHREAD_MUTEX_TYPE (mutex)
== PTHREAD_MUTEX_ADAPTIVE_NP, 1))
goto normal;
else
{
/* Error checking mutex. */
assert (type == PTHREAD_MUTEX_ERRORCHECK_NP);
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid)
|| ! lll_islocked (mutex->__data.__lock))
return EPERM;
goto normal;
}
}
static int
internal_function
__pthread_mutex_unlock_full (pthread_mutex_t *mutex, int decr)
{
int newowner = 0;
switch (PTHREAD_MUTEX_TYPE (mutex))
{
case PTHREAD_MUTEX_ROBUST_RECURSIVE_NP:
/* Recursive mutex. */
if ((mutex->__data.__lock & FUTEX_TID_MASK)
== THREAD_GETMEM (THREAD_SELF, tid)
&& __builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
{
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return ENOTRECOVERABLE;
goto notrecoverable;
}
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto robust;
case PTHREAD_MUTEX_ROBUST_ERRORCHECK_NP:
case PTHREAD_MUTEX_ROBUST_NORMAL_NP:
case PTHREAD_MUTEX_ROBUST_ADAPTIVE_NP:
if ((mutex->__data.__lock & FUTEX_TID_MASK)
!= THREAD_GETMEM (THREAD_SELF, tid)
|| ! lll_islocked (mutex->__data.__lock))
return EPERM;
/* If the previous owner died and the caller did not succeed in
making the state consistent, mark the mutex as unrecoverable
and make all waiters. */
if (__builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
notrecoverable:
newowner = PTHREAD_MUTEX_NOTRECOVERABLE;
robust:
/* Remove mutex from the list. */
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
&mutex->__data.__list.__next);
DEQUEUE_MUTEX (mutex);
mutex->__data.__owner = newowner;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. */
lll_robust_unlock (mutex->__data.__lock,
PTHREAD_ROBUST_MUTEX_PSHARED (mutex));
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
break;
/* The PI support requires the Linux futex system call. If that's not
available, pthread_mutex_init should never have allowed the type to
be set. So it will get the default case for an invalid type. */
#ifdef __NR_futex
case PTHREAD_MUTEX_PI_RECURSIVE_NP:
/* Recursive mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto continue_pi_non_robust;
case PTHREAD_MUTEX_PI_ROBUST_RECURSIVE_NP:
/* Recursive mutex. */
if ((mutex->__data.__lock & FUTEX_TID_MASK)
== THREAD_GETMEM (THREAD_SELF, tid)
&& __builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
{
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return ENOTRECOVERABLE;
goto pi_notrecoverable;
}
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto continue_pi_robust;
case PTHREAD_MUTEX_PI_ERRORCHECK_NP:
case PTHREAD_MUTEX_PI_NORMAL_NP:
case PTHREAD_MUTEX_PI_ADAPTIVE_NP:
case PTHREAD_MUTEX_PI_ROBUST_ERRORCHECK_NP:
case PTHREAD_MUTEX_PI_ROBUST_NORMAL_NP:
case PTHREAD_MUTEX_PI_ROBUST_ADAPTIVE_NP:
if ((mutex->__data.__lock & FUTEX_TID_MASK)
!= THREAD_GETMEM (THREAD_SELF, tid)
|| ! lll_islocked (mutex->__data.__lock))
return EPERM;
/* If the previous owner died and the caller did not succeed in
making the state consistent, mark the mutex as unrecoverable
and make all waiters. */
if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0
&& __builtin_expect (mutex->__data.__owner
== PTHREAD_MUTEX_INCONSISTENT, 0))
pi_notrecoverable:
newowner = PTHREAD_MUTEX_NOTRECOVERABLE;
if ((mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP) != 0)
{
continue_pi_robust:
/* Remove mutex from the list.
Note: robust PI futexes are signaled by setting bit 0. */
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending,
(void *) (((uintptr_t) &mutex->__data.__list.__next)
| 1));
DEQUEUE_MUTEX (mutex);
}
continue_pi_non_robust:
mutex->__data.__owner = newowner;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. Load all necessary mutex data before releasing the mutex
to not violate the mutex destruction requirements (see
lll_unlock). */
int robust = mutex->__data.__kind & PTHREAD_MUTEX_ROBUST_NORMAL_NP;
int private = (robust
? PTHREAD_ROBUST_MUTEX_PSHARED (mutex)
: PTHREAD_MUTEX_PSHARED (mutex));
/* Unlock the mutex using a CAS unless there are futex waiters or our
TID is not the value of __lock anymore, in which case we let the
kernel take care of the situation. Use release MO in the CAS to
synchronize with acquire MO in lock acquisitions. */
int l = atomic_load_relaxed (&mutex->__data.__lock);
do
{
if (((l & FUTEX_WAITERS) != 0)
|| (l != THREAD_GETMEM (THREAD_SELF, tid)))
{
INTERNAL_SYSCALL_DECL (__err);
INTERNAL_SYSCALL (futex, __err, 2, &mutex->__data.__lock,
__lll_private_flag (FUTEX_UNLOCK_PI, private));
break;
}
}
while (!atomic_compare_exchange_weak_release (&mutex->__data.__lock,
&l, 0));
THREAD_SETMEM (THREAD_SELF, robust_head.list_op_pending, NULL);
break;
#endif /* __NR_futex. */
case PTHREAD_MUTEX_PP_RECURSIVE_NP:
/* Recursive mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid))
return EPERM;
if (--mutex->__data.__count != 0)
/* We still hold the mutex. */
return 0;
goto pp;
case PTHREAD_MUTEX_PP_ERRORCHECK_NP:
/* Error checking mutex. */
if (mutex->__data.__owner != THREAD_GETMEM (THREAD_SELF, tid)
|| (mutex->__data.__lock & ~ PTHREAD_MUTEX_PRIO_CEILING_MASK) == 0)
return EPERM;
/* FALLTHROUGH */
case PTHREAD_MUTEX_PP_NORMAL_NP:
case PTHREAD_MUTEX_PP_ADAPTIVE_NP:
/* Always reset the owner field. */
pp:
mutex->__data.__owner = 0;
if (decr)
/* One less user. */
--mutex->__data.__nusers;
/* Unlock. Use release MO in the CAS to synchronize with acquire MO in
lock acquisitions. */
int newval;
int oldval = atomic_load_relaxed (&mutex->__data.__lock);
do
{
newval = oldval & PTHREAD_MUTEX_PRIO_CEILING_MASK;
}
while (!atomic_compare_exchange_weak_release (&mutex->__data.__lock,
&oldval, newval));
if ((oldval & ~PTHREAD_MUTEX_PRIO_CEILING_MASK) > 1)
lll_futex_wake (&mutex->__data.__lock, 1,
PTHREAD_MUTEX_PSHARED (mutex));
int oldprio = newval >> PTHREAD_MUTEX_PRIO_CEILING_SHIFT;
LIBC_PROBE (mutex_release, 1, mutex);
return __pthread_tpp_change_priority (oldprio, -1);
default:
/* Correct code cannot set any other type. */
return EINVAL;
}
LIBC_PROBE (mutex_release, 1, mutex);
return 0;
}
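The answer to the puzzle is sitting in the fast path above: unlocking is an atomic store of 0 plus, only when the futex word says there are waiters, a single FUTEX_WAKE. Waking makes one sleeper runnable; nothing hands it the lock. In the classic three-state futex design the shape is roughly this (my simplification, not the literal lll_unlock macro):

#include <linux/futex.h>
#include <stdatomic.h>
#include <sys/syscall.h>
#include <unistd.h>

/* futex word states: 0 = free, 1 = locked, 2 = locked with waiters */
static void unlock_sketch(atomic_int *futex_word)
{
    int old = atomic_exchange(futex_word, 0);   /* release the lock */
    if (old > 1) {
        /* Sleepers exist: ask the kernel to make ONE of them runnable.
         * This is a wakeup, not a hand-off; the woken thread still has
         * to win the lock race once it actually gets CPU time. */
        syscall(SYS_futex, futex_word, FUTEX_WAKE, 1, NULL, NULL, 0);
    }
}

So pthread_mutex_unlock does promptly wake a sleeper when one exists, but the unlocking thread keeps its timeslice; if it relocks right away, it usually wins the race, which is exactly the behavior traced above.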
So what happens inside the kernel when you call pthread_mutex_lock again right after pthread_mutex_unlock?
That turns out to be a large topic involving the futex mechanism; to be continued in the next installment. Thanks.