Analysis of the Linux kernel spinlock code

This article walks through the Linux kernel spinlock: its data structure, initialization, and the lock and unlock paths, with a closer look at how ARMv8 achieves atomicity through its exclusive access mechanism. The spinlock is built from layers of macros and function calls that keep concurrent code safe, and on ARMv8 the Load-Exclusive/Store-Exclusive instructions provide the exclusive access that avoids race conditions.

I recently read an introductory operating-systems book (操作系统导论), and the chapter on how locks evolved left an impression on me, so let's now chase the lock code in the Linux kernel.

I. Spinlocks

1. The lock structure

The kernel's spinlock structure looks like this:

typedef struct spinlock {
	union {
		struct raw_spinlock rlock;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map))
		struct {
			u8 __padding[LOCK_PADSIZE];
			struct lockdep_map dep_map;
		};
#endif
	};
} spinlock_t;

We can see that a struct spinlock is essentially just a struct raw_spinlock:

typedef struct raw_spinlock {
	arch_spinlock_t raw_lock;
#ifdef CONFIG_DEBUG_SPINLOCK
	unsigned int magic, owner_cpu;
	void *owner;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} raw_spinlock_t;

Going one level further, struct raw_spinlock boils down to an arch_spinlock_t, which is where the architecture-specific implementation comes in. However, many architectures, such as x86 and arm64, do not provide their own implementation and instead use the generic qspinlock:

typedef struct qspinlock {
	union {
		atomic_t val;

		/*
		 * By using the whole 2nd least significant byte for the
		 * pending bit, we can allow better optimization of the lock
		 * acquisition for the pending bit holder.
		 */
#ifdef __LITTLE_ENDIAN
		struct {
			u8	locked;
			u8	pending;
		};
		struct {
			u16	locked_pending;
			u16	tail;
		};
#else
		struct {
			u16	tail;
			u16	locked_pending;
		};
		struct {
			u8	reserved[2];
			u8	pending;
			u8	locked;
		};
#endif
	};
} arch_spinlock_t;

struct qspinlock wraps a union, so at its core the lock is still a single 32-bit integer (atomic_t val); the union just lays out different views of that word depending on the CPU's endianness. Because the members share the same memory, reading lock->val gives you locked + pending + tail at once (or, through the other view, locked_pending + tail). Locking and unlocking then come down to atomic, instruction-level operations on this word, using whatever the CPU hardware provides.
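To make the layout concrete, here is a hedged user-space sketch (not kernel code) of the little-endian view of the union. It only demonstrates that the byte and halfword views alias the same 32-bit word, which is what lets the fast path compare the whole value against 0:

/* User-space sketch of the little-endian qspinlock layout (illustration only). */
#include <stdint.h>
#include <stdio.h>

union qspinlock_layout {
	uint32_t val;
	struct {
		uint8_t  locked;	/* bits  0-7  */
		uint8_t  pending;	/* bits  8-15 */
		uint16_t tail;		/* bits 16-31 */
	};
};

int main(void)
{
	union qspinlock_layout l = { .val = 0 };

	l.locked = 1;				/* like setting _Q_LOCKED_VAL   */
	printf("val = 0x%08x\n", l.val);	/* 0x00000001 on little-endian  */

	l.pending = 1;				/* a pending waiter arrives     */
	printf("val = 0x%08x\n", l.val);	/* 0x00000101                   */
	return 0;
}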

2. Lock initialization

We initialize a spinlock with spin_lock_init:

#define spin_lock_init(_lock)				\
do {							\
	/* compile-time check that _lock really is a spinlock_t */	\
	spinlock_check(_lock);				\
	/* initialize the underlying raw spinlock */	\
	raw_spin_lock_init(&(_lock)->rlock);		\
} while (0)

spinlock_check is essentially a compile-time type check: it only compiles if its argument really is a spinlock_t, and it returns a pointer to the embedded raw_spinlock (it does not check anything at run time):

static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
	return &lock->rlock;
}

raw_spin_lock_init is what actually initializes the lock:

# define raw_spin_lock_init(lock)				\
	do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)

#define __RAW_SPIN_LOCK_UNLOCKED(lockname)	\
	(raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname)
#define __RAW_SPIN_LOCK_INITIALIZER(lockname)	\
	{					\
	.raw_lock = __ARCH_SPIN_LOCK_UNLOCKED,	\
	/* SPIN_DEBUG_INIT fills in the debug-only members */	\
	SPIN_DEBUG_INIT(lockname)		\
	SPIN_DEP_MAP_INIT(lockname) }

SPIN_DEBUG_INIT initializes the members needed for spinlock debugging; when debugging is not enabled it expands to nothing, so we can ignore it:

#ifdef CONFIG_DEBUG_SPINLOCK
# define SPIN_DEBUG_INIT(lockname)		\
	.magic = SPINLOCK_MAGIC,		\
	.owner_cpu = -1,			\
	.owner = SPINLOCK_OWNER_INIT,
#else
# define SPIN_DEBUG_INIT(lockname)
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define SPIN_DEP_MAP_INIT(lockname)	.dep_map = { .name = #lockname }
#else
# define SPIN_DEP_MAP_INIT(lockname)
#endif

Summary: spin_lock_init type-checks its argument and assigns the unlocked initial value (raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, i.e. the lock word is 0); everything else is debug-only initialization that compiles away when spinlock debugging is disabled.
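As a quick usage note before moving on (a hedged sketch: my_static_lock, my_ctx and my_ctx_create are made-up names), the two common ways to get an initialized spinlock are a static DEFINE_SPINLOCK, or spin_lock_init on a lock embedded in a dynamically allocated object:

/* Hedged usage sketch; the names here are illustrative, not from the sources above. */
#include <linux/spinlock.h>
#include <linux/slab.h>

/* Static initialization: DEFINE_SPINLOCK expands to the same unlocked initializer. */
static DEFINE_SPINLOCK(my_static_lock);

/* Dynamic initialization of a lock embedded in an allocated object. */
struct my_ctx {
	spinlock_t lock;
	unsigned int counter;
};

static struct my_ctx *my_ctx_create(void)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	if (!ctx)
		return NULL;
	spin_lock_init(&ctx->lock);
	return ctx;
}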

3. Locking

The common locking calls are the following (a usage sketch follows the list):

  1. spin_lock(lock) //acquire the lock; returns once the lock is held, otherwise busy-waits
  2. spin_lock_irqsave(lock, flags) //acquire the lock and also disable hard interrupts, saving the IRQ flags
  3. spin_lock_bh(lock) //acquire the lock and also disable soft interrupts (bottom halves)
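Here is the usage sketch promised above (hedged: struct my_dev and both functions are made-up names), showing the classic reason for the irqsave variant, namely data shared between process context and an interrupt handler:

/* Hedged usage sketch; the names are illustrative only. */
#include <linux/spinlock.h>
#include <linux/interrupt.h>

struct my_dev {
	spinlock_t lock;
	unsigned int pending;
};

/* Interrupt handler: hard interrupts are already off on this CPU, so a
 * plain spin_lock is enough here. */
static irqreturn_t my_dev_irq(int irq, void *data)
{
	struct my_dev *dev = data;

	spin_lock(&dev->lock);
	dev->pending++;
	spin_unlock(&dev->lock);
	return IRQ_HANDLED;
}

/* Process context: the same lock is also taken from the IRQ handler, so
 * we must disable local interrupts while holding it; otherwise the IRQ
 * could fire on this CPU and spin forever on the lock we already hold. */
static unsigned int my_dev_drain(struct my_dev *dev)
{
	unsigned long flags;
	unsigned int n;

	spin_lock_irqsave(&dev->lock, flags);
	n = dev->pending;
	dev->pending = 0;
	spin_unlock_irqrestore(&dev->lock, flags);

	return n;
}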

Let's start with spin_lock; it ultimately ends up in __raw_spin_lock:

static __always_inline void spin_lock(spinlock_t *lock)
{
	raw_spin_lock(&lock->rlock);
}

#define raw_spin_lock(lock)	_raw_spin_lock(lock)
#define _raw_spin_lock(lock) __raw_spin_lock(lock)

static inline void __raw_spin_lock(raw_spinlock_t *lock)
{
	preempt_disable();//disable preemption
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);//lockdep annotation; normally a no-op
	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);//the part that actually contends for the lock
}

spin_lock thus lands in __raw_spin_lock, which first disables preemption, then runs the lockdep check, and finally, the most important step, calls LOCK_CONTENDED to take part in the contention for the lock:

#define LOCK_CONTENDED(_lock, try, lock)			\
do {								\
	if (!try(_lock)) {					\
		lock_contended(&(_lock)->dep_map, _RET_IP_);	\
		lock(_lock);					\
	}							\
	lock_acquired(&(_lock)->dep_map, _RET_IP_);			\
} while (0)

The LOCK_CONTENDED macro first attempts the trylock; if that succeeds we are done. If it fails, lock_contended() only records the contention for lockdep/lock statistics, and the lock callback, here do_raw_spin_lock, is what actually acquires the lock by spinning. lock_acquired() is another lockdep bookkeeping hook; I chased it and it is a no-op in the usual configuration, so I won't paste it here.
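For reference, and hedged because I am quoting it from memory of include/linux/lockdep.h rather than from the exact tree this article follows: when CONFIG_LOCK_STAT is disabled the whole macro collapses to a plain call of the lock function, so the trylock/lock_contended/lock_acquired dance above only exists for lock statistics:

/* Roughly what LOCK_CONTENDED looks like without CONFIG_LOCK_STAT
 * (a sketch; check include/linux/lockdep.h in your own kernel tree). */
#define LOCK_CONTENDED(_lock, try, lock) \
	lock(_lock)

Back to the CONFIG_LOCK_STAT flavor shown above: here is what lock_contended records: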

void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
	unsigned long flags;

	if (unlikely(!lock_stat || !debug_locks))
		return;

	if (unlikely(current->lockdep_recursion))
		return;

	raw_local_irq_save(flags);//save the IRQ flags and disable local interrupts
	check_flags(flags);
	current->lockdep_recursion = 1;//mark that we are inside lockdep, to prevent recursion
	trace_lock_contended(lock, ip);
	__lock_contended(lock, ip);//record the contention statistics
	current->lockdep_recursion = 0;//done with the lockdep section
	raw_local_irq_restore(flags);//restore the IRQ flags (interrupts come back on if they were on)
}
EXPORT_SYMBOL_GPL(lock_contended);

lock_contended first saves the IRQ flags and disables interrupts, then calls __lock_contended to record the contention, and finally restores the IRQ flags with raw_local_irq_restore. Note that this is pure lockdep/lockstat bookkeeping, not the actual acquisition. Let's look at __lock_contended:

static void
__lock_contended(struct lockdep_map *lock, unsigned long ip)
{
	struct task_struct *curr = current;
	struct held_lock *hlock;
	struct lock_class_stats *stats;
	unsigned int depth;
	int i, contention_point, contending_point;

	depth = curr->lockdep_depth;//how many locks this task currently holds
	/*
	 * Whee, we contended on this lock, except it seems we're not
	 * actually trying to acquire anything much at all..
	 */
	if (DEBUG_LOCKS_WARN_ON(!depth))
		return;

	hlock = find_held_lock(curr, lock, depth, &i);
	if (!hlock) {
		print_lock_contention_bug(curr, lock, ip);
		return;
	}

	if (hlock->instance != lock)
		return;

	hlock->waittime_stamp = lockstat_clock();

	contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
	contending_point = lock_point(hlock_class(hlock)->contending_point,
				      lock->ip);

	stats = get_lock_stats(hlock_class(hlock));
	if (contention_point < LOCKSTAT_POINTS)
		stats->contention_point[contention_point]++;
	if (contending_point < LOCKSTAT_POINTS)
		stats->contending_point[contending_point]++;
	if (lock->cpu != smp_processor_id())
		stats->bounces[bounce_contended + !!hlock->read]++;
}

I don't fully follow the tail end of __lock_contended, but the gist is that it only updates lock-statistics counters for the held lock (wait-time stamp, contention points, CPU bounces); it does not acquire anything. So let's move on to the lock callback, do_raw_spin_lock:

void do_raw_spin_lock(raw_spinlock_t *lock)
{
	debug_spin_lock_before(lock);//debug sanity checks (magic, owner) before taking the lock
	arch_spin_lock(&lock->raw_lock);//the real lock operation
	debug_spin_lock_after(lock);//record owner and CPU after taking the lock (debug builds)
}

#define arch_spin_lock(l)		queued_spin_lock(l)

static __always_inline void queued_spin_lock(struct qspinlock *lock)
{
	u32 val;

	val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);//if the lock word is 0 (nobody holds it), take it directly
	if (likely(val == 0))
		return;
	queued_spin_lock_slowpath(lock, val);//the lock is busy: fall back to the slow, queued path
}

The atomic_cmpxchg_acquire chain:

val = atomic_cmpxchg_acquire(&lock->val, 0, _Q_LOCKED_VAL);//if the lock word is 0 (nobody holds it), take it directly
 
#define atomic_cmpxchg_acquire(v, old, new)				\
	cmpxchg_acquire(&((v)->counter), (old), (new))
	
#define cmpxchg_acquire(...)	__cmpxchg_wrapper(_acq, __VA_ARGS__)

#define __cmpxchg_wrapper(sfx, ptr, o, n)				\
({									\
	__typeof__(*(ptr)) __ret;					\
	__ret = (__typeof__(*(ptr)))					\
		__cmpxchg##sfx((ptr), (unsigned long)(o),		\
				(unsigned long)(n), sizeof(*(ptr)));	\
	__ret;								\
})

atomic_cmpxchg_acquire compares lock->val with 0; if they are equal it stores _Q_LOCKED_VAL into lock->val, and in either case it returns the old value. Because lock->val overlays all the union members, comparing against 0 checks all three fields at once: nobody holds the lock, nobody holds the pending bit, and nobody is waiting in the queue, so we can take the lock directly by setting the locked byte to 1.
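The fast path is therefore a single compare-and-swap on the whole word. As a hedged user-space analogue (C11 atomics, not kernel code; Q_LOCKED_VAL and the function name are made up), it boils down to this:

#include <stdatomic.h>
#include <stdbool.h>

#define Q_LOCKED_VAL	1u	/* analogous to _Q_LOCKED_VAL */

/* Returns true if the fast path took the lock, false if a slow path
 * (pending/queueing) would be needed. */
static bool qspinlock_fastpath(atomic_uint *val)
{
	unsigned int expected = 0;	/* no owner, no pending, empty queue */

	return atomic_compare_exchange_strong_explicit(val, &expected,
						       Q_LOCKED_VAL,
						       memory_order_acquire,
						       memory_order_relaxed);
}

When the old value is non-zero, the kernel falls into queued_spin_lock_slowpath instead: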

/**
 * queued_spin_lock_slowpath - acquire the queued spinlock
 * @lock: Pointer to queued spinlock structure
 * @val: Current value of the queued spinlock 32-bit word
 *
 * (queue tail, pending bit, lock value)
 *
 *              fast     :    slow                                  :    unlock
 *                       :                                          :
 * uncontended  (0,0,0) -:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                       :       | ^--------.------.             /  :
 *                       :       v           \      \            |  :
 * pending               :    (0,1,1) +--> (0,1,0)   \           |  :
 *                       :       | ^--'              |           |  :
 *                       :       v                   |           |  :
 * uncontended           :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue               :       | ^--'                          |  :
 *                       :       v                               |  :
 * contended             :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue               :         ^--'                             :
 */
void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 old, tail;
	int idx;

	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));

	if (pv_enabled())
		goto pv_queue;

	if (virt_spin_lock(lock))
		return;

	/*
	 * Wait for in-progress pending->locked hand-overs with a bounded
	 * number of spins so that we guarantee forward progress.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	//If the value seen by the fast path shows only the pending bit set (a pending->locked hand-over in progress), spin a bounded number of times and then re-read val
	if (val == _Q_PENDING_VAL) {
		int cnt = _Q_PENDING_LOOPS;
		val = atomic_cond_read_relaxed(&lock->val,
					       (VAL != _Q_PENDING_VAL) || !cnt--);
	}

	/*
	 * If we observe any contention; queue.
	 */
	//Based on the value we just read: if anything other than the locked byte is set (pending or a queued waiter), there is real contention, so go queue; otherwise fall through and try to become the pending waiter
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * trylock || pending
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * 0,0,1 -> 0,1,1 ; pending
	 */
	//Atomically set the pending bit and fetch the old value; the old value tells us whether lock/pending/tail were clear when we did so
	val = queued_fetch_set_pending_acquire(lock);

	/*
	 * If we observe any contention; undo and queue.
	 */
	//Someone else was already pending or queued: undo our pending bit (only if we were the one who set it) and go queue
	if (unlikely(val & ~_Q_LOCKED_MASK)) {
		if (!(val & _Q_PENDING_MASK))
			clear_pending(lock);
		goto queue;
	}

	/*
	 * We're pending, wait for the owner to go away.
	 *
	 * 0,1,1 -> 0,1,0
	 *
	 * this wait loop must be a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because not all
	 * clear_pending_set_locked() implementations imply full
	 * barriers.
	 */
	//We now own the pending bit; if the lock is still held, spin with a load-acquire until the owner clears the locked byte
	if (val & _Q_LOCKED_MASK)
		atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));

	/*
	 * take ownership and clear the pending bit.
	 *
	 * 0,1,0 -> 0,0,1
	 */
	clear_pending_set_locked(lock);//clear our pending bit and set the locked byte: we now own the lock
	qstat_inc(qstat_lock_pending, true);//qspinlock statistics; a no-op unless lock statistics are enabled
	return;

	/*
	 * End of pending bit optimistic spinning and beginning of MCS
	 * queuing.
	 */
queue:
	qstat_inc(qstat_lock_slowpath, true);//qspinlock statistics; a no-op unless lock statistics are enabled
pv_queue:
	node = this_cpu_ptr(&mcs_nodes[0]);
	idx = node->count++;
	tail = encode_tail(smp_processor_id(), idx);

	node += idx;

	/*
	 * Ensure that we increment the head node->count before initialising
	 * the actual node. If the compiler is kind enough to reorder these
	 * stores, then an IRQ could overwrite our assignments.
	 */
	barrier();//compiler barrier: order the count increment before the node initialization

	node->locked = 0;
	node->next = NULL;
	pv_init_node(node);//paravirt-spinlock hook; a no-op without PV spinlocks

	/*
	 * We touched a (possibly) cold cacheline in the per-cpu queue node;
	 * attempt the trylock once more in the hope someone let go while we
	 * weren't watching.
	 */
	if (queued_spin_trylock(lock))//one more try at taking the lock directly
		goto release;

	/*
	 * Ensure that the initialisation of @node is complete before we
	 * publish the updated tail via xchg_tail() and potentially link
	 * @node into the waitqueue via WRITE_ONCE(prev->next, node) below.
	 */
	smp_wmb();//write memory barrier: publish the initialized node before linking it into the queue

	/*
	 * Publish the updated tail.
	 * We have already touched the queueing cacheline; don't bother with
	 * pending stuff.
	 *
	 * p,*,* -> n,*,*
	 */
	old = xchg_tail(lock, tail);//atomically make our node the new queue tail; old is the previous tail
	next = NULL;

	/*
	 * if there was a previous node; link it and wait until reaching the
	 * head of the waitqueue.
	 */
	if (old & _Q_TAIL_MASK) {
		prev = decode_tail(old);

		/* Link @node into the waitqueue. */
		WRITE_ONCE(prev->next, node);

		pv_wait_node(node, prev);
		arch_mcs_spin_lock_contended(&node->locked);

		/*
		 * While waiting for the MCS lock, the next pointer may have
		 * been set by another lock waiter. We optimistically load
		 * the next pointer & prefetch the cacheline for writing
		 * to reduce latency in the upcoming MCS unlock operation.
		 */
		next = READ_ONCE(node->next);
		if (next)
			prefetchw(next);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 *
	 * this wait loop must use a load-acquire such that we match the
	 * store-release that clears the locked bit and create lock
	 * sequentiality; this is because the set_locked() function below
	 * does not imply a full barrier.
	 *
	 * The PV pv_wait_head_or_lock function, if active, will acquire
	 * the lock and return a non-zero value. So we have to skip the
	 * atomic_cond_read_acquire() call. As the next PV queue head hasn't
	 * been designated yet, there is no way for the locked value to become
	 * _Q_SLOW_VAL. So both the set_locked() and the
	 * atomic_cmpxchg_relaxed() calls will be safe.
	 *
	 * If PV isn't active, 0 will be returned instead.
	 *
	 */
	if ((val = pv_wait_head_or_lock(lock, node)))
		goto locked;

	val = atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK));

locked:
	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,*,0 -> *,*,1 : lock, contended
	 *
	 * If the queue head is the only one in the queue (lock value == tail)
	 * and nobody is pending, clear the tail code and grab the lock.
	 * Otherwise, we only need to grab the lock.
	 */

	/*
	 * In the PV case we might already have _Q_LOCKED_VAL set.
	 *
	 * The atomic_cond_read_acquire() call above has provided the
	 * necessary acquire semantics required for locking.
	 */
	if (((val & _Q_TAIL_MASK) == tail) &&
	    atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
		goto release; /* No contention */

	/* Either somebody is queued behind us or _Q_PENDING_VAL is set */
	set_locked(lock);

	/*
	 * contended path; wait for next if not observed yet, release.
	 */
	if (!next)
		next = smp_cond_load_relaxed(&node->next, (VAL));

	arch_mcs_spin_unlock_contended(&next->locked);
	pv_kick_node(lock, next);

release:
	/*
	 * release the node
	 */
	__this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queued_spin_lock_slowpath);

From the comment block at the top of queued_spin_lock_slowpath we know that by the time we get here the lock word is no longer 0: the locked byte, the pending bit or the tail (or some combination) is already set, and the function either becomes the pending waiter or queues up on a per-CPU MCS node. The details are covered by the inline comments above, so I won't expand on them here.
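The queuing part of the slow path is an MCS lock: each waiting CPU spins on its own node instead of on the shared lock word. As a hedged, user-space analogue (C11 atomics, not the kernel code, and without the kernel's trick of packing the tail into the 32-bit lock word), the core idea looks like this:

/* Minimal user-space MCS queue lock, to illustrate the queuing idea
 * behind queued_spin_lock_slowpath(); a teaching sketch, not the kernel
 * implementation. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct mcs_node {
	_Atomic(struct mcs_node *) next;
	atomic_bool locked;		/* true while we must keep spinning */
};

struct mcs_lock {
	_Atomic(struct mcs_node *) tail;
};

static void mcs_lock_acquire(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *prev;

	atomic_store_explicit(&node->next, NULL, memory_order_relaxed);
	atomic_store_explicit(&node->locked, true, memory_order_relaxed);

	/* Publish ourselves as the new tail (like xchg_tail()). */
	prev = atomic_exchange_explicit(&lock->tail, node, memory_order_acq_rel);
	if (prev) {
		/* Queue was not empty: link behind prev and spin on our own
		 * node until the predecessor hands the lock over. */
		atomic_store_explicit(&prev->next, node, memory_order_release);
		while (atomic_load_explicit(&node->locked, memory_order_acquire))
			;	/* spin (the kernel relaxes the CPU here) */
	}
	/* prev == NULL: the queue was empty, we own the lock immediately. */
}

static void mcs_lock_release(struct mcs_lock *lock, struct mcs_node *node)
{
	struct mcs_node *next = atomic_load_explicit(&node->next, memory_order_acquire);

	if (!next) {
		/* No visible successor: try to reset the tail to empty. */
		struct mcs_node *expected = node;

		if (atomic_compare_exchange_strong_explicit(&lock->tail, &expected, NULL,
							    memory_order_release,
							    memory_order_relaxed))
			return;
		/* A successor is in the middle of linking in; wait for it. */
		while (!(next = atomic_load_explicit(&node->next, memory_order_acquire)))
			;
	}
	/* Hand the lock to the successor, like arch_mcs_spin_unlock_contended(). */
	atomic_store_explicit(&next->locked, false, memory_order_release);
}

The kernel version differs mainly in that the tail is encoded into the qspinlock word itself and the nodes live in a small per-CPU array, but the hand-over through node->locked is the same idea.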

Finally, let's compare spin_lock_irqsave with spin_lock:

#define spin_lock_irqsave(lock, flags)				\
do {								\
	raw_spin_lock_irqsave(spinlock_check(lock), flags);	\
} while (0)

#define raw_spin_lock_irqsave(lock, flags)			\
	do {						\
		typecheck(unsigned long, flags);	\
		flags = _raw_spin_lock_irqsave(lock);	\
	} while (0)

#define _raw_spin_lock_irqsave(lock) __raw_spin_lock_irqsave(lock)

static inline unsigned long __raw_spin_lock_irqsave(raw_spinlock_t *lock)
{
	unsigned long flags;

	local_irq_save(flags);//save the IRQ flags and disable local interrupts
	preempt_disable();
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	/*
	 * On lockdep we dont want the hand-coded irq-enable of
	 * do_raw_spin_lock_flags() code, because lockdep assumes
	 * that interrupts are not re-enabled during lock-acquire:
	 */
#ifdef CONFIG_LOCKDEP
	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
#else
	do_raw_spin_lock_flags(lock, &flags);
#endif
	return flags;
}

So the only difference is that local_irq_save is executed before spin_acquire and LOCK_CONTENDED, saving the IRQ flags and disabling local interrupts.
Now let's also compare spin_lock_bh with spin_lock:

static __always_inline void spin_lock_bh(spinlock_t *lock)
{
	raw_spin_lock_bh(&lock->rlock);
}

#define raw_spin_lock_bh(lock)		_raw_spin_lock_bh(lock)
#define _raw_spin_lock_bh(lock) __raw_spin_lock_bh(lock)

static inline void __raw_spin_lock_bh(raw_spinlock_t *lock)
{
	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}

So the difference here is that __local_bh_disable_ip is called before spin_acquire and LOCK_CONTENDED to disable soft interrupts (bottom halves). Let's see what __local_bh_disable_ip does:

void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
	unsigned long flags;

	WARN_ON_ONCE(in_irq());

	raw_local_irq_save(flags);
	/*
	 * The preempt tracer hooks into preempt_count_add and will break
	 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
	 * is set and before current->softirq_enabled is cleared.
	 * We must manually increment preempt_count here and manually
	 * call the trace_preempt_off later.
	 */
	__preempt_count_add(cnt);
	/*
	 * Were softirqs turned off above:
	 */
	if (softirq_count() == (cnt & SOFTIRQ_MASK))
		trace_softirqs_off(ip);
	raw_local_irq_restore(flags);

	if (preempt_count() == cnt) {
#ifdef CONFIG_DEBUG_PREEMPT
		current->preempt_disable_ip = get_lock_parent_ip();
#endif
		trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
	}
}
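To make the _bh variant concrete, here is a hedged usage sketch (rx_lock, rx_count and both functions are made-up names): data shared between process context and a tasklet or softirq is the typical reason to use spin_lock_bh:

/* Hedged usage sketch; the names are illustrative only. */
#include <linux/spinlock.h>
#include <linux/interrupt.h>

static unsigned int rx_count;
static DEFINE_SPINLOCK(rx_lock);

/* Runs in softirq context (a tasklet): softirqs do not preempt each
 * other on the same CPU, so a plain spin_lock is enough here. */
static void my_rx_tasklet(unsigned long data)
{
	spin_lock(&rx_lock);
	rx_count++;
	spin_unlock(&rx_lock);
}

/* Runs in process context: disable bottom halves while holding the lock,
 * otherwise the tasklet could interrupt the holder on this CPU and spin
 * forever. */
static unsigned int my_rx_read_and_reset(void)
{
	unsigned int n;

	spin_lock_bh(&rx_lock);
	n = rx_count;
	rx_count = 0;
	spin_unlock_bh(&rx_lock);

	return n;
}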

4. Unlocking

  1. spin_unlock(lock) //release the lock
  2. spin_unlock_irqrestore(lock, flags) //release the lock and restore the saved hard-interrupt state
  3. spin_unlock_bh(lock) //release the lock and re-enable soft interrupts

There is no contention to resolve when unlocking, so spin_unlock is much simpler:

static __always_inline void spin_unlock(spinlock_t *lock)
{
	raw_spin_unlock(&lock->rlock);
}

#define raw_spin_unlock(lock)		_raw_spin_unlock(lock)
#define _raw_spin_unlock(lock) __raw_spin_unlock(lock)

static inline void __raw_spin_unlock(raw_spinlock_t *lock)
{
	spin_release(&lock->dep_map, 1, _RET_IP_);//lockdep annotation; a no-op without lockdep
	do_raw_spin_unlock(lock);//the real unlock
	preempt_enable();//re-enable preemption
}

static inline void do_raw_spin_unlock(raw_spinlock_t *lock)
{
	arch_spin_unlock(&lock->raw_lock);
	__release(lock);
}

static inline void arch_spin_unlock(arch_spinlock_t *lock)
{
	barrier();
	lock->slock = 1;
}
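One caveat about the last snippet: the arch_spin_unlock that sets lock->slock to 1 belongs to an architecture-specific spinlock, not to the generic qspinlock we followed in section 1. For the qspinlock used by x86 and arm64, arch_spin_unlock maps to queued_spin_unlock, which is simply a store-release of 0 to the locked byte. A hedged sketch (from memory of include/asm-generic/qspinlock.h, so check your own tree):

/* Sketch of the generic qspinlock unlock path. */
static __always_inline void queued_spin_unlock(struct qspinlock *lock)
{
	/* unlock() needs release semantics: clear only the locked byte,
	 * leaving any pending/tail bits for the waiters to sort out. */
	smp_store_release(&lock->locked, 0);
}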

spin_unlock_irqrestore differs from spin_unlock only in calling local_irq_restore after do_raw_spin_unlock:

static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
	raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

#define raw_spin_unlock_irqrestore(lock, flags)		\
	do {							\
		typecheck(unsigned long, flags);		\
		_raw_spin_unlock_irqrestore(lock, flags);	\
	} while (0)

#define _raw_spin_unlock_irqrestore(lock, flags) __raw_spin_unlock_irqrestore(lock, flags)
static inline void __raw_spin_unlock_irqrestore(raw_spinlock_t *lock,
					    unsigned long flags)
{
	spin_release(&lock->dep_map, 1, _RET_IP_);
	do_raw_spin_unlock(lock);
	local_irq_restore(flags);
	preempt_enable();
}

II. How ARMv8 implements spinlocks at the bottom: exclusive access instructions

The ARMv8 architecture provides a mechanism called exclusive access (also known as exclusive monitors, or Load-Exclusive/Store-Exclusive) for implementing atomic operations in multiprocessor (multi-core) systems. It lets a core perform an atomic read-modify-write on shared memory, preventing race conditions and inconsistent data, and it is exactly the building block a spinlock needs most: an atomic read-modify-write on the lock word.

Load-Exclusive (LDXR) loads a value from memory into a register and marks that location as exclusive in the exclusive monitor. Store-Exclusive (STXR) attempts to store a register back to that location, and it succeeds only if the location is still marked exclusive, i.e. no other observer has written it (and nothing else has cleared the monitor) since the matching LDXR; on success the exclusive mark is released. If exclusivity was lost, the store does not modify memory, and the instruction writes a non-zero status into its result register so software can detect the failure.

Together, LDXR and STXR let a core perform a read-modify-write while checking that nobody else touched the location in between. When the store-exclusive fails (for example because another core wrote the same location), software simply retries the LDXR/STXR pair until it succeeds; this retry loop is what makes the whole operation effectively atomic. Because only one core can complete the store-exclusive for a given location, the mechanism avoids the race where several cores try to modify the same shared variable at the same time, and thus avoids data inconsistency.
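To make the retry loop concrete, here is a hedged sketch of an atomic increment written directly with LDXR/STXR using GCC/Clang inline assembly (the function name is made up; in real code you would just use the kernel's atomic_add() or the compiler's __atomic builtins, which generate the same pattern):

/* Hedged sketch: atomic increment on AArch64 via an LDXR/STXR retry loop. */
static inline void atomic_inc_ldxr(unsigned int *addr)
{
	unsigned int tmp, status;

	asm volatile(
	"1:	ldxr	%w0, %2\n"	/* load current value, mark it exclusive  */
	"	add	%w0, %w0, #1\n"	/* modify the value in a register         */
	"	stxr	%w1, %w0, %2\n"	/* try to store back; %w1 == 0 on success */
	"	cbnz	%w1, 1b\n"	/* exclusivity lost: retry from the ldxr  */
	: "=&r" (tmp), "=&r" (status), "+Q" (*addr)
	:
	: "memory");
}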

Back to the source. The Linux code is painful to follow at this level, because the functions are wrapped in layer upon layer of macros, and by the time you finally reach the assembly you have to go back and work out what the arguments were, so I gave up on that route. Instead we can read the ATF or OP-TEE code; the OP-TEE spinlock is very simple. Below is the lowest-level spinlock assembly from OP-TEE 3.21, in core/arch/arm/kernel/spin_lock_a64.S:

/* void __cpu_spin_lock(unsigned int *lock); */
FUNC __cpu_spin_lock , :
	mov	w2, #SPINLOCK_LOCK
	sevl
l1:	wfe
l2:	ldaxr	w1, [x0]
	cbnz	w1, l1
	stxr	w1, w2, [x0]
	cbnz	w1, l2
	ret
END_FUNC __cpu_spin_lock

/* unsigned int __cpu_spin_trylock(unsigned int *lock); */
FUNC __cpu_spin_trylock , :
	mov     x1, x0
	mov     w2, #SPINLOCK_LOCK
.loop:	ldaxr   w0, [x1]
	cbnz    w0, .cpu_spin_trylock_out
	stxr    w0, w2, [x1]
	cbnz    w0, .loop
.cpu_spin_trylock_out:
	ret
END_FUNC __cpu_spin_trylock

/* void __cpu_spin_unlock(unsigned int *lock); */
FUNC __cpu_spin_unlock , :
	stlr	wzr, [x0]
	ret
END_FUNC __cpu_spin_unlock

SEVL (Send Event Locally) only sets the event register of the local core, so that the first WFE that follows falls straight through instead of sleeping; it is not a broadcast to the other cores (that would be SEV).
With that in mind, __cpu_spin_lock reads naturally. It first puts SPINLOCK_LOCK (which is 1) into w2, then SEVL primes the local event register so the first WFE does not block. At l2, LDAXR loads the lock word with acquire semantics and marks it exclusive; if it is non-zero the lock is held, so we go back to l1 and wait in WFE until an event wakes us. If it is zero we attempt STXR to store 1; if the store-exclusive fails because another core got in first, w1 is non-zero and we retry from l2. __cpu_spin_trylock does a single pass of the same loop and returns the value it observed, so 0 means the lock was taken. __cpu_spin_unlock is just STLR WZR, [X0]: a store-release of 0, which also clears the waiters' exclusive monitors and thereby generates the event that wakes them out of WFE.
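For readers more comfortable with C, here is a hedged sketch of the same lock/trylock/unlock logic written with C11 atomics (the names are mine, the WFE/SEV power-saving hints are left out, and this trylock returns a bool instead of the old value returned by the assembly above). On ARMv8 the compiler lowers the compare-exchange to the same LDAXR/STXR loop:

/* Hedged C11 sketch of the OP-TEE style spinlock above (illustration only). */
#include <stdatomic.h>
#include <stdbool.h>

#define SPINLOCK_UNLOCK	0u
#define SPINLOCK_LOCK	1u

static void cpu_spin_lock(atomic_uint *lock)
{
	unsigned int expected;

	do {
		expected = SPINLOCK_UNLOCK;
		/* acquire on success, like LDAXR; keep retrying while held */
	} while (!atomic_compare_exchange_weak_explicit(lock, &expected,
							SPINLOCK_LOCK,
							memory_order_acquire,
							memory_order_relaxed));
}

static bool cpu_spin_trylock(atomic_uint *lock)
{
	unsigned int expected = SPINLOCK_UNLOCK;

	return atomic_compare_exchange_strong_explicit(lock, &expected,
						       SPINLOCK_LOCK,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void cpu_spin_unlock(atomic_uint *lock)
{
	/* store-release of 0, like STLR WZR, [X0] */
	atomic_store_explicit(lock, SPINLOCK_UNLOCK, memory_order_release);
}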
