原文地址:http://locklessinc.com/articles/locks/ 原文太长,本文翻译其中后半部分 read-write 锁
前半部分Spinlock 锁见: http://blog.csdn.net/k_cnoize/article/details/54946030
大部分情况下,使用一个数据结构时并不会对其进行修改,而是只需要一个区段的读取权限来完成工作。如果有多个线程需要读取某一个数据,没有理由不让它们并发地进行读取。Spinlock 锁无法区分只读和读写混合的场景,因此 spinlock 锁无法利用这种潜在的并行性。为了实现该并行操作,我们需要读写锁。
/*
 * Simplest read-write lock in this file: a spinlock that serialises
 * writers, paired with a counter of readers.
 */
typedef struct dumbrwlock {
	spinlock lock;		/* taken by writers (and by an upgrading reader) */
	unsigned readers;	/* readers currently counted in; changed atomically */
} dumbrwlock;
/*
 * Acquire the lock for writing.
 *
 * The writer spinlock is taken first; once it is held, dumb_rdlock()
 * callers back off. The function then spins until the readers that were
 * already counted in have all left.
 */
static void dumb_wrlock(dumbrwlock *l)
{
	/* Claim the writer side. */
	spin_lock(&l->lock);

	/* Drain the readers that got in before us. */
	while (l->readers != 0) {
		cpu_relax();
	}
}
/* Release a write lock by dropping the writer spinlock. */
static void dumb_wrunlock(dumbrwlock *l)
{
	spin_unlock(&l->lock);
}
/*
 * Try to acquire the lock for writing without waiting.
 *
 * Returns 0 on success, EBUSY if readers are present or the writer
 * spinlock could not be taken.
 */
static int dumb_wrtrylock(dumbrwlock *l)
{
	/* Cheap early-out: any reader means we cannot succeed. */
	if (l->readers != 0) {
		return EBUSY;
	}

	/* Try for the writer spinlock. */
	if (spin_trylock(&l->lock) != 0) {
		return EBUSY;
	}

	/* Re-check: a reader may have slipped in before we got the spinlock. */
	if (l->readers == 0) {
		return 0;
	}

	/* A reader started in the meantime - give the spinlock back. */
	spin_unlock(&l->lock);
	return EBUSY;
}
/*
 * Acquire the lock for reading.
 *
 * The reader optimistically counts itself in, then checks whether the
 * writer spinlock is held. If it is, the reader counts itself out again,
 * waits for the spinlock to be released and retries.
 */
static void dumb_rdlock(dumbrwlock *l)
{
	for (;;) {
		/* Speculatively register as a reader. */
		atomic_inc(&l->readers);

		/* No writer holds the spinlock: we are in. */
		if (l->lock == 0) {
			return;
		}

		/* A writer holds the lock - undo the registration ... */
		atomic_dec(&l->readers);

		/* ... and wait for the writer to finish before retrying. */
		while (l->lock != 0) {
			cpu_relax();
		}
	}
}
/* Release a read lock: count this reader out. */
static void dumb_rdunlock(dumbrwlock *l)
{
	atomic_dec(&l->readers);
}
/*
 * Try to acquire the lock for reading without waiting.
 *
 * Returns 0 on success, EBUSY if the writer spinlock is held.
 */
static int dumb_rdtrylock(dumbrwlock *l)
{
	/* Count ourselves in first, then look at the writer side. */
	atomic_inc(&l->readers);

	if (l->lock != 0) {
		/* A writer is active: take the speculative count back. */
		atomic_dec(&l->readers);
		return EBUSY;
	}

	return 0;
}
/*
 * Try to upgrade a held read lock to a write lock.
 *
 * Returns 0 once the caller holds the write lock, or EBUSY (read lock
 * still held) if the writer spinlock could not be taken.
 */
static int dumb_rdupgradelock(dumbrwlock *l)
{
	/* Grab the writer side; fail without waiting if someone else has it. */
	if (spin_trylock(&l->lock) != 0) {
		return EBUSY;
	}

	/* We now count as the writer, not as a reader. */
	atomic_dec(&l->readers);

	/* Let every other reader finish. */
	while (l->readers != 0) {
		cpu_relax();
	}

	return 0;
}
作为评价以上代码的标准,比起 spinlock 我们需要一些额外的信息。读请求的比例是一个很重要的因素。读请求越多,我们应该能够并发更多的线程,代码的速度也应该更快。读请求与写请求的随机分布也很重要,像真实的读写场景一样。因此我们使用了一个并发随机数生成器:随机生成一个字节(共 256 种取值),并把其中 1、25、128 或者 250 种取值当作写请求,这样就能够模拟从大部分请求为读请求到大部分请求为写请求的场景。最后,我们需要观测对于竞争的处理效果。通常情况下,在竞争激烈的场景写锁更容易被用到,所以我们只查看线程数等于处理器核数的场景。
以上 dumb 锁算法在没有竞争的场景表现的非常差,如果只使用一个线程我们有结果:
Writers per 256 | 1 | 25 | 128 | 250 |
---|---|---|---|---|
Time(s) | 3.7 | 3.8 | 4.6 | 5.4 |
如同预期,当写请求比例上升时,读写锁的表现趋近于 spinlock 锁的表现。尽管有竞争,dumb 锁算法实际上表现得非常好,在 4 个线程的情况下:
Writers per 256 | 1 | 25 | 128 | 250 |
---|---|---|---|---|
Time(s) | 1.1 | 1.9 | 4.4 | 5.7 |
一个很明显的改进方法是用 ticketlock 算法来替换很慢的 spinlock 锁。我们有:
/*
 * Same layout as the spinlock-based dumb read-write lock, but the
 * writer side is a ticket lock.
 */
typedef struct dumbtrwlock {
	ticketlock lock;	/* taken by writers (and by an upgrading reader) */
	unsigned readers;	/* readers currently counted in; changed atomically */
} dumbtrwlock;
/*
 * Acquire the lock for writing: take the ticket lock, then spin until
 * the reader count has dropped to zero.
 */
static void dumbt_wrlock(dumbtrwlock *l)
{
	/* Claim the writer side. */
	ticket_lock(&l->lock);

	/* Drain the readers that are still counted in. */
	while (l->readers != 0) {
		cpu_relax();
	}
}
/* Release a write lock by dropping the ticket lock. */
static void dumbt_wrunlock(dumbtrwlock *l)
{
	ticket_unlock(&l->lock);
}
/*
 * Try to acquire the lock for writing without waiting.
 *
 * Returns 0 on success, EBUSY if readers are present or the ticket
 * lock could not be taken.
 */
static int dumbt_wrtrylock(dumbtrwlock *l)
{
	/* Cheap early-out: any reader means we cannot succeed. */
	if (l->readers != 0) {
		return EBUSY;
	}

	/* Try for the writer ticket lock. */
	if (ticket_trylock(&l->lock) != 0) {
		return EBUSY;
	}

	/* Re-check: a reader may have slipped in before we got the lock. */
	if (l->readers == 0) {
		return 0;
	}

	/* A reader started in the meantime - give the ticket lock back. */
	ticket_unlock(&l->lock);
	return EBUSY;
}
/*
 * Acquire the lock for reading.
 *
 * While ticket_lockable() reports true for the writer lock, the reader
 * counts itself in and checks again; if the second check fails it counts
 * itself out, waits until the check passes again and retries.
 */
static void dumbt_rdlock(dumbtrwlock *l)
{
	for (;;) {
		if (ticket_lockable(&l->lock)) {
			/* Speculatively register as a reader. */
			atomic_inc(&l->readers);

			/* Still lockable after registering: we are in. */
			if (ticket_lockable(&l->lock)) {
				return;
			}

			/* Lost the race - undo the registration. */
			atomic_dec(&l->readers);
		}

		/* Wait until the writer lock looks free before trying again. */
		while (!ticket_lockable(&l->lock)) {
			cpu_relax();
		}
	}
}
/* Release a read lock: count this reader out. */
static void dumbt_rdunlock(dumbtrwlock *l)
{
	atomic_dec(&l->readers);
}
/*
 * Try to acquire the lock for reading without waiting.
 *
 * Returns 0 on success, EBUSY if ticket_lockable() reports false for
 * the writer lock after the reader has counted itself in.
 */
static int dumbt_rdtrylock(dumbtrwlock *l)
{
	/* Count ourselves in first, then look at the writer side. */
	atomic_inc(&l->readers);

	if (!ticket_lockable(&l->lock)) {
		/* Writer side is busy: take the speculative count back. */
		atomic_dec(&l->readers);
		return EBUSY;
	}

	return 0;
}
/*
 * Try to upgrade a held read lock to a write lock.
 *
 * Returns 0 once the caller holds the write lock, or EBUSY (read lock
 * still held) if the ticket lock could not be taken.
 */
static int dumbt_rdupgradelock(dumbtrwlock *l)
{
	/* Grab the writer side; fail without waiting if someone else has it. */
	if (ticket_trylock(&l->lock) != 0) {
		return EBUSY;
	}

	/* We now count as the writer, not as a reader. */
	atomic_dec(&l->readers);

	/* Let every other reader finish. */
	while (l->readers != 0) {
		cpu_relax();
	}

	return 0;
}
这个算法在无竞争的场景表现得更好,所有情况下都只花费 3.7s。然而在有竞争的场景(4 个线程)却没有明显优势:
Writers per 256 | 1 | 25 | 128 | 250 |
---|---|---|---|---|
Time(s) | 2.0 | 2.5 | 3.7 | 4.5 |
在低写请求比例的场景该算法更慢,在高写请求比例场景更快。而我们使用读写锁的大部分场景写的比例都是低的,这是对该算法不利的地方。此时它的耗时大约是其竞争者的两倍。
为了减少冲突以获取速度,我们来探究下一个很复杂的算法实现:ReactOS 中用来模拟 Microsoft Windows 的 slim read-write(SRW)锁。其使用一个等待队列,和一个位锁来控制对等待队列的处理。它的设计是让等待者在不同的内存位置上空转,以便能够扩展到更多线程的场景。
/*
 * Flag bits stored in the low three bits of srwlock.p. The remaining bits
 * hold either a user count (in units of SRWLOCK_USERS) or, when
 * SRWLOCK_WAIT is set, the address of the first wait block.
 */

/* Bit index of SRWLOCK_LOCK, for the atomic bit operations */
#define SRWLOCK_LOCK_BIT	2

/* A wait block is installed */
#define SRWLOCK_WAIT		(1 << 0)
/* Current users are readers */
#define SRWLOCK_SHARED		(1 << 1)
/* Bit-lock guarding edits of the wait block chain */
#define SRWLOCK_LOCK		(1 << SRWLOCK_LOCK_BIT)

/* All three flag bits together */
#define SRWLOCK_MASK		(SRWLOCK_WAIT | SRWLOCK_SHARED | SRWLOCK_LOCK)

/* One user; the user count is stored multiplied by this value */
#define SRWLOCK_USERS		(SRWLOCK_MASK + 1)
/*
 * The SRW lock itself: a single pointer-sized word. The low bits carry
 * the SRWLOCK_* flags; the rest is a user count or a wait block address.
 */
typedef struct srwlock {
	uintptr_t p;
} srwlock;
/*
 * Per-reader wait node. A waiting reader spins on `spin` until another
 * thread sets it; `next` links the nodes of one shared wait block.
 */
typedef struct srw_sw {
	uintptr_t spin;		/* becomes non-zero when the waiter may proceed */
	struct srw_sw *next;	/* next waiting reader, or NULL */
} srw_sw;
/*
 * Wait block: one entry in the chain of waiters hanging off srwlock.p.
 * A block is either exclusive (one writer) or shared (a group of readers).
 */
typedef struct srw_wb {
	/* Number of shared acquirers, multiplied by SRWLOCK_USERS. */
	uintptr_t s_count;

	/* Last wait block in the chain. Only meaningful when read from
	   the first wait block. */
	struct srw_wb *last;

	/* Following wait block in the chain, or NULL. */
	struct srw_wb *next;

	/* For shared blocks: head and tail of the list of reader wait nodes.
	   Exclusive waiters instead spin on `wake` until it becomes non-NULL. */
	srw_sw *wake;
	srw_sw *last_shared;

	/* Non-zero when this block belongs to an exclusive (writer) waiter. */
	int ex;
} srw_wb;
/*
 * Take the SRWLOCK_LOCK bit-lock and return the first wait block.
 *
 * Spins until the bit-lock is acquired. If no wait block is installed
 * (SRWLOCK_WAIT clear) the bit-lock is released again and NULL is
 * returned; otherwise the caller owns the bit-lock and must clear it.
 */
static srw_wb *lock_wb(srwlock *l)
{
	uintptr_t word;

	/* Spin until we are the one who set the lock bit. */
	while (atomic_bitsetandtest(&l->p, SRWLOCK_LOCK_BIT)) {
		cpu_relax();
	}

	word = l->p;
	barrier();

	if (word & SRWLOCK_WAIT) {
		/* Strip the flag bits to recover the wait block address. */
		return (srw_wb *)(word & ~SRWLOCK_MASK);
	}

	/* The wait block went away in the meantime: back out. */
	atomic_clear_bit(&l->p, SRWLOCK_LOCK_BIT);
	return NULL;
}
/* Put the lock into its unlocked state: no flags, no users, no waiters. */
static void srwlock_init(srwlock *l)
{
	l->p = (uintptr_t)0;
}
/*
 * Acquire the SRW lock for reading (shared), blocking by spinning.
 *
 * Fast paths: an unlocked word is switched to "one shared user", and a
 * shared word without waiters gets its user count bumped, both via cmpxchg.
 * Otherwise the caller queues itself: `swblock` and `sw` live on this
 * function's stack, their addresses are published into the wait chain, and
 * the function spins on sw.spin until another thread sets it
 * (srwlock_wrunlock does so when it walks a shared block's wake list).
 */
static void srwlock_rdlock(srwlock *l)
{
srw_wb swblock;
srw_sw sw;
uintptr_t p;
srw_wb *wb, *shared;
while (1)
{
barrier();
p = l->p;
cpu_relax();
if (!p)
{
/* This is a fast path, we can simply try to set the shared count to 1 */
if (!cmpxchg(&l->p, 0, SRWLOCK_USERS | SRWLOCK_SHARED)) return;
continue;
}
/* Don't interfere with locking */
if (p & SRWLOCK_LOCK) continue;
if (p & SRWLOCK_SHARED)
{
if (!(p & SRWLOCK_WAIT))
{
/* This is a fast path, just increment the number of current shared locks */
if (cmpxchg(&l->p, p, p + SRWLOCK_USERS) == p) return;
}
else
{
/* There's other waiters already, lock the wait blocks and increment the shared count */
wb = lock_wb(l);
/* With the bit-lock held, leave the loop for the "contended and shared" code below */
if (wb) break;
}
continue;
}
/* From here on the lock is not shared, i.e. it is held exclusively */
/* Initialize wait block */
swblock.ex = FALSE;
swblock.next = NULL;
swblock.last = &swblock;
swblock.wake = &sw;
sw.next = NULL;
sw.spin = 0;
if (!(p & SRWLOCK_WAIT))
{
/*
* We need to setup the first wait block.
* Currently an exclusive lock is held, change the lock to contended mode.
*/
swblock.s_count = SRWLOCK_USERS;
swblock.last_shared = &sw;
if (cmpxchg(&l->p, p, (uintptr_t)&swblock | SRWLOCK_WAIT) == p)
{
/* Our block is installed; wait to be woken */
while (!sw.spin) cpu_relax();
return;
}
continue;
}
/* Handle the contended but not shared case */
/*
* There's other waiters already, lock the wait blocks and increment the shared count.
* If the last block in the chain is an exclusive lock, add another block.
*/
swblock.s_count = 0;
wb = lock_wb(l);
if (!wb) continue;
shared = wb->last;
if (shared->ex)
{
/* Append our own shared block after the trailing exclusive block */
shared->next = &swblock;
wb->last = &swblock;
shared = &swblock;
}
else
{
/* Join the trailing shared block by linking our wait node onto its wake list */
shared->last_shared->next = &sw;
}
shared->s_count += SRWLOCK_USERS;
shared->last_shared = &sw;
/* Unlock */
barrier();
l->p &= ~SRWLOCK_LOCK;
/* Wait to be woken */
while (!sw.spin) cpu_relax();
return;
}
/* The contended and shared case */
/* Reached only via the break above, with the bit-lock held and wb the first wait block */
sw.next = NULL;
sw.spin = 0;
if (wb->ex)
{
/*
* We need to setup a new wait block.
* Although we're currently in a shared lock and we're acquiring
* a shared lock, there are exclusive locks queued in between.
* We need to wait until those are released.
*/
shared = wb->last;
if (shared->ex)
{
swblock.ex = FALSE;
swblock.s_count = SRWLOCK_USERS;
swblock.next = NULL;
swblock.last = &swblock;
swblock.wake = &sw;
swblock.last_shared = &sw;
shared->next = &swblock;
wb->last = &swblock;
}
else
{
shared->last_shared->next = &sw;
shared->s_count += SRWLOCK_USERS;
shared->last_shared = &sw;
}
}
else
{
/* First wait block is itself shared: join it directly */
wb->last_shared->next = &sw;
wb->s_count += SRWLOCK_USERS;
wb->last_shared = &sw;
}
/* Unlock */
barrier();
l->p &= ~SRWLOCK_LOCK;
/* Wait to be woken */
while (!sw.spin) cpu_relax();
}
/*
 * Release a read (shared) hold on the SRW lock.
 *
 * Without waiters the user count in l->p is decremented with cmpxchg,
 * and the word becomes 0 when the last reader leaves. With a wait block
 * installed, the shared count kept in the first wait block is decremented
 * under the bit-lock; the reader that brings it to zero hands the lock to
 * the waiter owning that block by setting its `wake` field.
 */
static void srwlock_rdunlock(srwlock *l)
{
uintptr_t p, np;
srw_wb *wb;
srw_wb *next;
while (1)
{
barrier();
p = l->p;
cpu_relax();
if (p & SRWLOCK_WAIT)
{
/*
* There's a wait block, we need to wake a pending exclusive acquirer,
* if this is the last shared release.
*/
wb = lock_wb(l);
if (wb) break;
continue;
}
/* Don't interfere with locking */
if (p & SRWLOCK_LOCK) continue;
/*
* This is a fast path, we can simply decrement the shared
* count and store the pointer
*/
np = p - SRWLOCK_USERS;
/* If we are the last reader, then the lock is unused */
if (np == SRWLOCK_SHARED) np = 0;
/* Try to release the lock */
if (cmpxchg(&l->p, p, np) == p) return;
}
/* Slow path: bit-lock held, wb is the first wait block */
wb->s_count -= SRWLOCK_USERS;
if (wb->s_count)
{
/* Other readers still hold the lock: just drop the bit-lock */
/* Unlock */
barrier();
l->p &= ~SRWLOCK_LOCK;
return;
}
/* We were the last reader: pass ownership to the waiter that owns wb */
next = wb->next;
if (next)
{
/*
* There's more blocks chained, we need to update the pointers
* in the next wait block and update the wait block pointer.
*/
np = (uintptr_t)next | SRWLOCK_WAIT;
next->last = wb->last;
}
else
{
/* Convert the lock to a simple exclusive lock. */
np = SRWLOCK_USERS;
}
barrier();
/* This also unlocks wb lock bit */
l->p = np;
barrier();
/*
* Wake the waiter spinning on this field (see srwlock_wrlock). wb->next
* was read above and wb is not touched after this store: in srwlock_wrlock
* the block is a local variable and the waiter returns once it is woken.
*/
wb->wake = (void *) 1;
barrier();
/* We released the lock */
}
/*
 * Try to acquire the SRW lock for reading without waiting.
 *
 * Succeeds only on the two fast paths (lock unused, or shared with no
 * wait block) and only if the corresponding cmpxchg wins.
 * Returns 0 on success, EBUSY otherwise.
 */
static int srwlock_rdtrylock(srwlock *s)
{
	uintptr_t seen = s->p;

	barrier();

	if (seen == 0) {
		/* Unused lock: try to become the first shared user. */
		if (cmpxchg(&s->p, 0, SRWLOCK_USERS | SRWLOCK_SHARED) == 0) {
			return 0;
		}
		return EBUSY;
	}

	if ((seen & (SRWLOCK_SHARED | SRWLOCK_WAIT)) != SRWLOCK_SHARED) {
		/* Held exclusively, or waiters are queued. */
		return EBUSY;
	}

	/* Shared and uncontended: try to add one more user. */
	if (cmpxchg(&s->p, seen, seen + SRWLOCK_USERS) == seen) {
		return 0;
	}

	return EBUSY;
}
/*
 * Acquire the SRW lock for writing (exclusive), blocking by spinning.
 *
 * An unused lock is taken directly with cmpxchg. Otherwise the caller
 * queues an exclusive wait block (`swblock`, on this function's stack),
 * either appended to the existing chain under the bit-lock or installed
 * as the first wait block, and spins until another thread sets
 * swblock.wake (srwlock_rdunlock and srwlock_wrunlock both do this).
 */
static void srwlock_wrlock(srwlock *l)
{
srw_wb swblock;
uintptr_t p, np;
/* Fastpath - no other readers or writers */
if (!l->p && (!cmpxchg(&l->p, 0, SRWLOCK_USERS))) return;
/* Initialize wait block */
swblock.ex = TRUE;
swblock.next = NULL;
swblock.last = &swblock;
/* NULL here means "not woken yet"; the wait loops below spin on this field */
swblock.wake = NULL;
while (1)
{
barrier();
p = l->p;
cpu_relax();
if (p & SRWLOCK_WAIT)
{
/* A chain already exists: append our block at its tail */
srw_wb *wb = lock_wb(l);
if (!wb) continue;
/* Complete Initialization of block */
swblock.s_count = 0;
wb->last->next = &swblock;
wb->last = &swblock;
/* Unlock */
barrier();
l->p &= ~SRWLOCK_LOCK;
/* Has our wait block became the first one in the chain? */
while (!swblock.wake) cpu_relax();
return;
}
/* Fastpath - no other readers or writers */
if (!p)
{
if (!cmpxchg(&l->p, 0, SRWLOCK_USERS)) return;
continue;
}
/* Don't interfere with locking */
if (p & SRWLOCK_LOCK) continue;
/* There are no wait blocks so far, we need to add ourselves as the first wait block. */
if (p & SRWLOCK_SHARED)
{
/* Carry the current reader count over into our block, since l->p now holds a pointer */
swblock.s_count = p & ~SRWLOCK_MASK;
np = (uintptr_t)&swblock | SRWLOCK_SHARED | SRWLOCK_WAIT;
}
else
{
swblock.s_count = 0;
np = (uintptr_t)&swblock | SRWLOCK_WAIT;
}
/* Try to make change */
if (cmpxchg(&l->p, p, np) == p) break;
}
/* Has our wait block became the first one in the chain? */
while (!swblock.wake) cpu_relax();
}
/*
 * Release a write (exclusive) hold on the SRW lock.
 *
 * If l->p is exactly SRWLOCK_USERS (no waiters) it is reset to 0 with
 * cmpxchg. Otherwise the first wait block is taken under the bit-lock,
 * l->p is rewritten to describe the new owner(s), and the owner of that
 * block is woken: an exclusive waiter through wb->wake, or every reader
 * on a shared block's wake list through its `spin` field.
 */
static void srwlock_wrunlock(srwlock *l)
{
uintptr_t p, np;
srw_wb *wb;
srw_wb *next;
srw_sw *wake, *wake_next;
while (1)
{
barrier();
p = l->p;
cpu_relax();
if (p == SRWLOCK_USERS)
{
/*
* This is the fast path, we can simply clear the SRWLOCK_USERS bit.
* All other bits should be 0 now because this is a simple exclusive lock,
* and no one else is waiting.
*/
if (cmpxchg(&l->p, SRWLOCK_USERS, 0) == SRWLOCK_USERS) return;
continue;
}
/* There's a wait block, we need to wake the next pending acquirer */
wb = lock_wb(l);
if (wb) break;
}
/* Bit-lock held; wb is the first wait block, whose owner(s) get the lock next */
next = wb->next;
if (next)
{
/*
* There's more blocks chained, we need to update the pointers
* in the next wait block and update the wait block pointer.
*/
np = (uintptr_t)next | SRWLOCK_WAIT;
if (!wb->ex)
{
/* Save the shared count */
next->s_count = wb->s_count;
np |= SRWLOCK_SHARED;
}
next->last = wb->last;
}
else
{
/* Convert the lock to a simple lock. */
if (wb->ex)
{
np = SRWLOCK_USERS;
}
else
{
np = wb->s_count | SRWLOCK_SHARED;
}
}
barrier();
/* Also unlocks lock bit */
l->p = np;
barrier();
if (wb->ex)
{
barrier();
/* Notify the next waiter */
wb->wake = (void *) 1;
barrier();
return;
}
/* We now need to wake all others required. */
for (wake = wb->wake; wake; wake = wake_next)
{
barrier();
/*
* Fetch the link before setting spin: in srwlock_rdlock the wait node is
* a local variable and the waiter returns as soon as it sees spin set.
*/
wake_next = wake->next;
barrier();
wake->spin = 1;
barrier();
}
}
/*
 * Try to acquire the SRW lock for writing without waiting.
 *
 * Returns 0 if the lock was unused and the cmpxchg to "one exclusive
 * user" succeeded, EBUSY otherwise.
 */
static int srwlock_wrtrylock(srwlock *s)
{
	/* Any readers, writers or waiters present? */
	if (s->p != 0) {
		return EBUSY;
	}

	if (cmpxchg(&s->p, 0, SRWLOCK_USERS) != 0) {
		return EBUSY;
	}

	return 0;
}
这并不是在 Reactos 中的真正实现,对其进行了一些简化和清理。一个位标志被移除了,那么它的表现如何呢,在非竞争场景,其和基于 ticket 的读写锁差不多。在 4 个线程竞争的场景表现为:
Writers per 256 | 1 | 25 | 128 | 250 |
---|---|---|---|---|
Time(s) | 2.2 | 3.2 | 5.7 | 6.4 |
这表现的很差,在竞争场景比 dumb 算法更慢。其获得的性能改进并不值得这样复杂的实现。
另外一个可能是用一些位结合读者数目来描述写者的状态。一个类似的技巧在 Linux 内核中被使用来实现其读写锁。为了避免写者被饿死(有写者在等待时,新的读者必须让路),我们有了以下实现:
/*
 * Layout of the rwlock word:
 *   bit 0   - a writer is waiting
 *   bit 1   - a writer holds the lock
 *   bit 2.. - reader count, in units of RW_READ
 */
#define RW_WAIT_BIT	0
#define RW_WRITE_BIT	1
#define RW_READ_BIT	2

#define RW_WAIT		(1 << RW_WAIT_BIT)
#define RW_WRITE	(1 << RW_WRITE_BIT)
#define RW_READ		(1 << RW_READ_BIT)

typedef unsigned int rwlock;
/*
 * Acquire the lock for writing, spinning until it is available.
 *
 * While the lock is busy the writer sets RW_WAIT, which makes rdlock()
 * and rdtrylock() back off. Taking the lock stores RW_WRITE, which also
 * clears RW_WAIT.
 */
static void wrlock(rwlock *l)
{
	for (;;) {
		unsigned seen = *l;

		/* No readers and no writer (at most the wait bit is set)? */
		if (seen < RW_WRITE) {
			/* Install the write bit; this drops the wait bit too. */
			if (cmpxchg(l, seen, RW_WRITE) == seen) {
				return;
			}

			/* Somebody else changed the lock first - look again. */
			seen = *l;
		}

		/* Announce that a writer is waiting. */
		if ((seen & RW_WAIT) == 0) {
			atomic_set_bit(l, RW_WAIT_BIT);
		}

		/* Spin until nothing but the wait bit may be left. */
		while (*l > RW_WAIT) {
			cpu_relax();
		}
	}
}
/* Release a write lock by atomically subtracting the write bit. */
static void wrunlock(rwlock *l)
{
	atomic_add(l, -RW_WRITE);
}
/*
 * Try to acquire the lock for writing without waiting.
 *
 * Unlike wrlock(), a set RW_WAIT bit is preserved. Returns 0 on
 * success, EBUSY if the lock is held or the cmpxchg lost a race.
 */
static int wrtrylock(rwlock *l)
{
	unsigned seen = *l;

	/* Readers or a writer present. */
	if (seen >= RW_WRITE) {
		return EBUSY;
	}

	if (cmpxchg(l, seen, seen + RW_WRITE) != seen) {
		return EBUSY;
	}

	return 0;
}
/*
 * Acquire the lock for reading, spinning while a writer holds the lock
 * or is waiting for it.
 */
static void rdlock(rwlock *l)
{
	for (;;) {
		unsigned before;

		/* Wait while a writer is active or queued. */
		while (*l & (RW_WAIT | RW_WRITE)) {
			cpu_relax();
		}

		/* Count ourselves in and look at the state we replaced. */
		before = atomic_xadd(l, RW_READ);
		if ((before & (RW_WAIT | RW_WRITE)) == 0) {
			return;
		}

		/* A writer got there first - count ourselves out and retry. */
		atomic_add(l, -RW_READ);
	}
}
/* Release a read lock by atomically removing one reader from the count. */
static void rdunlock(rwlock *l)
{
	atomic_add(l, -RW_READ);
}
/*
 * Try to acquire the lock for reading without waiting.
 *
 * Returns 0 on success, EBUSY if a writer holds the lock or is waiting.
 */
static int rdtrylock(rwlock *l)
{
	/* Count ourselves in and inspect the state from just before. */
	unsigned before = atomic_xadd(l, RW_READ);

	if (before & (RW_WAIT | RW_WRITE)) {
		/* Writer active or queued: count ourselves out again. */
		atomic_add(l, -RW_READ);
		return EBUSY;
	}

	return 0;
}
/*
 * Try to acquire a read lock, ignoring waiting writers.
 *
 * Only an active writer (RW_WRITE set) makes this fail; the RW_WAIT bit
 * is not consulted. Returns 0 on success, EBUSY otherwise.
 */
static int rdforcelock(rwlock *l)
{
	/* Count ourselves in and inspect the state from just before. */
	unsigned before = atomic_xadd(l, RW_READ);

	if (before & RW_WRITE) {
		/* A writer holds the lock: count ourselves out again. */
		atomic_add(l, -RW_READ);
		return EBUSY;
	}

	return 0;
}
/*
 * Try to turn a held read lock into a write lock.
 *
 * Returns EBUSY (read lock still held) if the write bit was already set
 * by someone else; otherwise waits for the remaining readers to leave
 * and returns 0 with the write lock held.
 */
static int rdtryupgradelock(rwlock *l)
{
	/* Claim the write bit; if it was already set, somebody else owns it. */
	if (atomic_bitsetandtest(l, RW_WRITE_BIT)) {
		return EBUSY;
	}

	/* We are the writer now, so stop counting as a reader. */
	atomic_add(l, -RW_READ);

	/* Spin until only the write bit (and possibly the wait bit) is left. */
	while (*l > (RW_WAIT | RW_WRITE)) {
		cpu_relax();
	}

	return 0;
}
该锁实现,和使用 ticket 锁作为 spinlock 锁的 dumb 锁算法表现差不多。
Writers per 256 | 1 | 25 | 128 | 250 |
---|---|---|---|---|
Time(s) | 2.0 | 3.4 | 3.9 | 4.6 |
在 Linux 内核中实现的版本是用汇编语言写的,或许会快一些。它使用了一个事实是原子增操作可以用来设置零标志。也就意味着很慢的 add-and-test 方法是不需要的,可以用一个快的两条指令替代。
使用半可移植的 C 代码,我们能做得更好。存在一种为读写锁设计的 ticket 锁。RedHat 的 David Howells 在 2002 年给 Linux 内核提供了一个实现,其大幅优化了 IBM 的 Joseph Seigh 在 90 年代初提出的版本。Mellor-Crummey 和 Michael Scott 在他们里程碑式的论文 “Scalable Reader-Writer Synchronization for Shared-Memory Multiprocessors” 中也描述了一种类似的算法。将其转化为 C 代码实现如下:
/*
 * Read-write ticket lock.
 *
 * Three byte-sized counters share one word so they can be updated
 * together: `u` views the whole word and `us` starts at the same address
 * as `s.write`. The lock functions shift by 8 and 16 to address `read` and
 * `users`, which presumably assumes a little-endian layout - verify before
 * porting.
 */
typedef union rwticket {
	unsigned u;		/* whole lock word */
	unsigned short us;	/* view used to store write and read together */
	__extension__ struct {
		unsigned char write;	/* ticket that may take the write lock */
		unsigned char read;	/* ticket that may take a read lock */
		unsigned char users;	/* next ticket to hand out */
	} s;
} rwticket;
/*
 * Acquire the ticket lock for writing.
 *
 * Draws a ticket by atomically adding 1<<16 to the lock word, then
 * spins until the `write` counter equals that ticket.
 */
static void rwticket_wrlock(rwticket *l)
{
	unsigned prev = atomic_xadd(&l->u, (1<<16));
	unsigned char ticket = prev >> 16;

	/* The lock is ours once `write` reaches our ticket number. */
	while (l->s.write != ticket) {
		cpu_relax();
	}
}
/*
 * Release a write lock.
 *
 * Advances both the `write` and `read` counters by one and stores them
 * back with a single unsigned short store through the `us` member, which
 * starts at the same address as `write`.
 */
static void rwticket_wrunlock(rwticket *l)
{
	rwticket snap = *l;

	barrier();

	snap.s.write++;
	snap.s.read++;

	/* Publish write and read together through the short-sized view. */
	l->us = snap.us;
}
/*
 * Try to acquire the ticket lock for writing without waiting.
 *
 * The lock is free for a writer when `write` equals `users`. The expected
 * word is built from that condition and swapped for one where `users` is
 * one higher, i.e. we take the next ticket and are immediately served.
 *
 * The blocking lock functions draw tickets with atomic_xadd(&l->u, 1<<16).
 * When the 8-bit `users` field wraps, that addition carries into the bits
 * of `u` above the three counters. Those spare bits have no meaning, but
 * they must be part of the compared value: with them fixed at zero the
 * cmpxchg could never match after the first wrap, and this function would
 * return EBUSY on a free lock.
 *
 * Returns 0 on success, EBUSY if the lock is held or changed concurrently.
 */
static int rwticket_wrtrylock(rwticket *l)
{
	unsigned me = l->s.users;
	unsigned char menew = me + 1;
	unsigned read = l->s.read << 8;
	/* Bits above the three counters; carried over unchanged */
	unsigned spare = l->u & ~0xffffffu;
	unsigned cmp = spare + (me << 16) + read + me;
	unsigned cmpnew = spare + ((unsigned) menew << 16) + read + me;

	if (cmpxchg(&l->u, cmp, cmpnew) == cmp) return 0;

	return EBUSY;
}
/*
 * Acquire the ticket lock for reading.
 *
 * Draws a ticket by atomically adding 1<<16 to the lock word and spins
 * until the `read` counter equals it. The reader then bumps `read`
 * itself, so the holder of the following ticket can stop waiting on
 * `read` as well.
 */
static void rwticket_rdlock(rwticket *l)
{
	unsigned prev = atomic_xadd(&l->u, (1<<16));
	unsigned char ticket = prev >> 16;

	while (l->s.read != ticket) {
		cpu_relax();
	}

	/* Advance `read` for the holder of the following ticket. */
	++l->s.read;
}
/*
 * Release a read lock by atomically advancing the `write` counter.
 * rwticket_wrlock() spins until `write` equals its ticket.
 */
static void rwticket_rdunlock(rwticket *l)
{
	atomic_inc(&l->s.write);
}
/*
 * Try to acquire the ticket lock for reading without waiting.
 *
 * A reader can enter at once when `read` equals `users`. The expected
 * word is built from that condition and swapped for one where both
 * `users` and `read` are one higher, i.e. we take the next ticket and
 * pass the read turn on in one step.
 *
 * The blocking lock functions draw tickets with atomic_xadd(&l->u, 1<<16).
 * When the 8-bit `users` field wraps, that addition carries into the bits
 * of `u` above the three counters. Those spare bits have no meaning, but
 * they must be part of the compared value: with them fixed at zero the
 * cmpxchg could never match after the first wrap, and this function would
 * return EBUSY on a lock that is free for readers.
 *
 * Returns 0 on success, EBUSY if a reader cannot enter right now or the
 * lock word changed concurrently.
 */
static int rwticket_rdtrylock(rwticket *l)
{
	unsigned me = l->s.users;
	unsigned write = l->s.write;
	unsigned char menew = me + 1;
	/* Bits above the three counters; carried over unchanged */
	unsigned spare = l->u & ~0xffffffu;
	unsigned cmp = spare + (me << 16) + (me << 8) + write;
	unsigned cmpnew = spare + ((unsigned) menew << 16) + ((unsigned) menew << 8) + write;

	if (cmpxchg(&l->u, cmp, cmpnew) == cmp) return 0;

	return EBUSY;
}
以上读写锁表现得很好,在低写请求比例时和 dumb spinlock 读写锁一样快,在高写请求比例时和 dumb ticketlock 读写锁也几乎一样快。当没有竞争时其也没有性能的下降,在所有的例子中都只花费了 3.7s。
Writers per 256 | 1 | 25 | 128 | 250 |
---|---|---|---|---|
Time(s) | 1.1 | 1.8 | 3.9 | 4.7 |
在主要为读请求的场景,该算法比简单的 spinlock 锁快 5 倍,它的唯一缺点在于不能原子地将读锁升级为写锁(可以做,但是 rwticket_wrunlock 需要使用一个原子操作,会导致其变慢一些)。这个缺点也正是 Linux 内核没有采用该算法的原因之一。另一部分原因在于,内核中有些代码依赖这样的行为:如果你已经拥有一个读锁,那么递归地获取读锁总是会成功。然而,如果你不需要这些特性的话,那么这个算法是一个很好的选择。
最后需要注意的是该读写 ticket 锁算法并不是最优的。当读者和写者交替地在等待队列中出现时,例如:写者(执行中),读者1,写者,读者2,这两个读线程本可以被重新排到一起,从而并发地执行。也就是说,第二个读者本可以不必等待第二个写者完成。幸运的是,当线程数少时很少出现该场景。对于 4 个线程,当读者和写者一样多时其发生概率为 1/16,其余场景则会更小。不幸的是,当线程数增加时,相比最优顺序的执行,其速度最多会下降到原来的一半。
修复这个问题一个明显需要做的事是,检测读者需要在等待链表中排队。然而,因为在4个线程并发的场景效果是如此不显著。我们很难以一个很低的代价来做该检查。因此当多核机器变得很常见时,该使用哪种算法将会是一个值得考虑的问题。