没多少行代码,先给出最终实现:
// RAII spin-guard built on the GCC __sync atomic builtins.
//
// The constructor busy-waits until it atomically flips *pt from
// old_val to new_val (acquiring the "lock"); the destructor flips it
// back, reporting on stderr if the word no longer held new_val
// (i.e. someone else modified the guarded word while held).
// Each __sync CAS implies a full memory barrier.
template<typename T>
class CAS {
public:
  CAS(T *pt, T old_val, T new_val)
      : pt_(pt), old_val_(old_val), new_val_(new_val) {
    // Spin until this thread wins the old_val -> new_val transition.
    while (!__sync_bool_compare_and_swap(pt_, old_val_, new_val_)) {
    }
  }
  ~CAS() {
    // Release: restore old_val. The swap fails exactly when *pt_ is
    // not new_val_ anymore, which the original reported as an error.
    if (!__sync_bool_compare_and_swap(pt_, new_val_, old_val_)) {
      fprintf(stderr, "~CAS Error!\n");
    }
  }
private:
  T *pt_;      // guarded word
  T old_val_;  // "unlocked" value, restored on destruction
  T new_val_;  // "locked" value, installed on construction
};
// Ring-buffer queue where each end (enqueue side / dequeue side) is
// serialized by a CAS spin-guard, while the two ends may run
// concurrently. Capacity is size - 1 elements.
//
// Cached-count invariants (all arithmetic modulo mask_ + 1):
//   head_count_ = |[head_pos_, tail_pos_)|   free slots seen by Enqueue
//   tail_count_ = |(tail_pos_, head_pos_)|   filled slots seen by Dequeue
template<typename T>
class LockFreeQueue {
private:
  T *queue_ptr_;  // backing array of `size` slots; usable capacity is size-1
  int64_t mask_;  // size - 1; valid as a modulo mask because size is a power of two
  // The head_* and tail_* groups are pinned to separate 64-byte (0x40)
  // cache lines so the two ends do not false-share.
  int64_t __attribute((aligned (0x40))) head_;  // spin word serializing Enqueue callers
  int64_t head_count_;                          // cached count of free slots
  int64_t head_pos_;                            // index of the next slot to write
  int64_t __attribute((aligned (0x40))) tail_;  // spin word serializing Dequeue callers
  int64_t tail_count_;                          // cached count of filled slots
  int64_t tail_pos_;                            // index just before the next slot to read

  // Copying would double-free queue_ptr_ and duplicate the spin words;
  // declared private and left undefined (pre-C++11 noncopyable idiom).
  LockFreeQueue(const LockFreeQueue &);
  LockFreeQueue &operator=(const LockFreeQueue &);

public:
  // size must be a power of two greater than 1; otherwise the process
  // aborts, because the mask arithmetic below would be wrong.
  LockFreeQueue(int64_t size) {
    // FIX: the original check `size & (size - 1) != 0` parsed as
    // `size & ((size - 1) != 0)` -- `!=` binds tighter than `&` -- which
    // reduces to `size & 1` and let non-power-of-two sizes like 6 pass.
    if (size <= 1 || (size & (size - 1)) != 0) {
      fprintf(stderr, "LockFreeQueue Error!\n");
      exit(1);
    }
    mask_ = size - 1;
    queue_ptr_ = new T[size];
    head_ = 0;
    tail_ = 0;
    head_pos_ = 0;
    tail_pos_ = size - 1;
    head_count_ = (tail_pos_ - head_pos_) & mask_;
    tail_count_ = (head_pos_ - tail_pos_ - 1) & mask_;
  }
  // FIX: release the backing array (it was leaked before).
  ~LockFreeQueue() {
    delete[] queue_ptr_;
  }
  // Append el to the queue.
  // Returns false if the queue is full, true otherwise.
  // The CAS guard serializes concurrent Enqueue callers; its implied
  // full memory barrier is part of the correctness argument.
  bool Enqueue(T el) {
    CAS<int64_t> cas(&head_, 0, 1);
    if (head_count_ == 0) {
      // Cached free-slot count exhausted: resync from the consumer's position.
      head_count_ = (tail_pos_ - head_pos_) & mask_;
      if (head_count_ == 0) return false;  // genuinely full
    }
    head_count_--;
    queue_ptr_[head_pos_] = el;
    // Publish the element before advancing head_pos_, so a Dequeue that
    // observes the new position is guaranteed to see the stored value.
    __sync_synchronize();
    head_pos_ = (head_pos_ + 1) & mask_;
    return true;
  }
  // Pop the oldest element into el.
  // Returns false if the queue is empty, true otherwise.
  bool Dequeue(T &el) {
    CAS<int64_t> cas(&tail_, 0x00, 0x01);
    if (tail_count_ == 0) {
      // Cached filled-slot count exhausted: resync from the producer's position.
      tail_count_ = (head_pos_ - tail_pos_ - 1) & mask_;
      if (tail_count_ == 0) return false;  // genuinely empty
    }
    tail_count_--;
    el = queue_ptr_[(tail_pos_ + 1) & mask_];
    // Copy the element out before advancing tail_pos_, so Enqueue cannot
    // overwrite a slot whose value has not been read yet.
    __sync_synchronize();
    tail_pos_ = (tail_pos_ + 1) & mask_;
    return true;
  }
};
这个实现的特点是两头分别是串行控制,但可以同步进行。
另外加入head_count_,tail_count_优化,可以减少两头信息同步带来的开销。
实现中几个注意的点:
1、
不大相关的变量组放在不同的cache line中,
这样可以避免它们所在的cache line作无谓的同步。
例如head_,tail_。
2、
一次调用中相关的,甚至是相互依赖的变量可以放在一个cache line中,
把数据集中起来更方便cpu处理,例如head_,head_pos_,head_count_。
3、
把常量性质的变量跟经常需要线程间同步的变量,放在不同的cache line中。
例如queue_ptr_, mask_,它们被读的时候,所在的cache line是不需要去同步的。
以上3点根据测试知是可以提升性能的。
4、
el = queue_ptr_[(tail_pos_ + 1) & mask_];
__sync_synchronize();
tail_pos_ = (tail_pos_ + 1) & mask_;
确保Enqueue线程看到数据出队操作是先于位置标志变化的,
不然标志变化先被Enqueue感知了,就可能导致原先的数据还没出队就被覆盖了。
5、
CAS操作(__sync系列是编译器内建的原子指令,并非系统调用)隐含一个full memory barrier,这也是正确性的保证。
实例化一个无锁队列(LockFreeQueue<uint64_t> lfq(1 << 10);)
测试1000W个数据进出队列的所用的时间:
1 en-threads, 1 de-threads:
[test0] time consume: 1393ms
[test1] time consume: 1270ms
[test2] time consume: 1374ms
2 en-threads, 2 de-threads:
[test0] time consume: 1366ms
[test1] time consume: 1557ms
[test2] time consume: 1353ms
3 en-threads, 3 de-threads:
[test0] time consume: 3899ms
[test1] time consume: 4409ms
[test2] time consume: 4614ms
4 en-threads, 4 de-threads:
[test0] time consume: 8159ms
[test1] time consume: 8664ms
[test2] time consume: 11048ms
5 en-threads, 5 de-threads:
[test0] time consume: 11903ms
[test1] time consume: 11153ms
[test2] time consume: 12082ms
不难看出随着线程数的增加耗时也大概呈线性增加。
另外把CAS换成MutexLock,测试结果是:
1 en-threads, 1 de-threads:
[test0] time consume: 2218ms
[test1] time consume: 2100ms
[test2] time consume: 2159ms
2 en-threads, 2 de-threads:
[test0] time consume: 2305ms
[test1] time consume: 2301ms
[test2] time consume: 2338ms
3 en-threads, 3 de-threads:
[test0] time consume: 2449ms
[test1] time consume: 2631ms
[test2] time consume: 2550ms
4 en-threads, 4 de-threads:
[test0] time consume: 2526ms
[test1] time consume: 2527ms
[test2] time consume: 2524ms
5 en-threads, 5 de-threads:
[test0] time consume: 2523ms
[test1] time consume: 2550ms
[test2] time consume: 2528ms
MutexLock随着线程数的增加,耗时增加没有CAS那么陡,相对比较平缓,
所以MutexLock开到5进5出线程时仍然保持在2600ms以内;当然线程开得更多,耗时还是会越来越大。
可以得出的结论是当线程总数不大于cpu总核数的时候,无锁有一定的优势,
但是线程开多了,线程之间的cpu竞争将带来越来越多的开销。
这个实现比较简单,下来想一种两头都可以并行的做法。