【最终版 & 总结】自实现自旋锁与 mutex，spinlock比较（结果令人吃惊）

http://www.9php.com/FAQ/cxsjl/c/2011/08/9244324162620.html

总结一下：
最后修改了一下测试代码，比较方便的设置线程数，选择测试对象，设置delay的参数。
经过测试发现，我的实现的测试耗时非常稳定：2个、10个、20个乃至40个线程时差别都很小，几乎与线程数无关；mutex 同样和线程数关系不大；但是 pthread_spinlock_t 对线程数敏感，线程多的情况下，效率会降低很多。
同样，增大加锁的粒度对测试结果也有影响：当粒度很小时，我的实现速度是 mutex 的4-5倍；当粒度很大时，比如把 delay 的循环次数设置为1000，我的实现速度仍是 mutex 的两倍多，但是 cpu 更忙些。spinlock 的速度就不提了，很低。
结论：
如果是比较大粒度的加锁肯定是mutex首选，虽然性能中庸，但是它因为会休眠挂起，不占用cpu，对系统影响小。
如果是比较细粒度的加锁，可以用我实现的lock：它的性能几乎与线程数量无关，效率极高，可能是因为实现简单，最大效率地做到了切换cpu，保证一个线程执行，减少了多余环节。
pthread_spinlock_t的局限性太大，如果线程多的情况下，会造成性能的很大程度的损失。同时还仅限于小粒度的加锁情况。

给出测试数据：
1）20线程，0delay
我的：time ./myspinlock_O3.out 20 0 0
real 0m0.323s
user 0m0.364s
sys    0m0.276s

mutex：time ./myspinlock_O3.out 20 1 0
real 0m1.634s
user 0m1.972s
sys    0m1.264s

spinlock: time ./myspinlock_O3.out 20 2 0
real 0m6.259s
user 0m12.477s
sys    0m0.004s

2）20线程，100 delay
我的：time ./myspinlock_O3.out 20 0 100
real 0m2.965s
user 0m3.268s
sys    0m2.636s

mutex：time ./myspinlock_O3.out 20 1 100
real 0m6.493s
user 0m6.344s
sys    0m6.604s

spinlock： time ./myspinlock_O3.out 20 2 100
real 0m15.760s
user 0m31.378s
sys    0m0.004s

3）10线程，0delay
我的：time ./myspinlock_O3.out 10 0 0
real 0m0.318s
user 0m0.372s
sys    0m0.248s
mutex：time ./myspinlock_O3.out 10 1 0
real 0m1.511s
user 0m1.808s
sys    0m1.200s
spinlock：time ./myspinlock_O3.out 10 2 0
real 0m3.625s
user 0m7.224s
sys    0m0.004s

4）2线程，0delay
我的：time ./myspinlock_O3.out 2 0 0
real 0m0.323s
user 0m0.376s
sys    0m0.184s
mutex：time ./myspinlock_O3.out 2 1 0
real 0m1.453s
user 0m1.688s
sys    0m1.136s
spinlock：time ./myspinlock_O3.out 2 2 0
real 0m0.819s
user 0m1.624s
sys    0m0.004s

最终版的实现

#include<sched.h>
#include<stdint.h>
#include<unistd.h>

/*
 * Minimal yielding spinlock built on GCC's __sync builtins.
 *
 * The lock word is 0 when the lock is free and 1 when it is held.
 */
typedef volatile uint32_t spinlock_t;

/* Alias for the same type: the benchmark program declares its lock as
 * my_spinlock_t. */
typedef spinlock_t my_spinlock_t;

#define MY_SPINLOCK_INITIALIZER 0

/* Scratch word that the benchmark's delay loops read and write through
 * `foo`; the earlier revision of this header defined the same pair. */
static uint32_t bar = 13;
static uint32_t *foo = &bar;

/*
 * Acquire: atomically flip the lock word from 0 to 1. While another thread
 * holds the lock, give up the CPU with sched_yield() and try again.
 * `lock` is a pointer to the lock word and is evaluated on every retry.
 */
#define spinlock_lock(lock) do{  \
    while(!__sync_bool_compare_and_swap((lock), 0, 1)) \
        sched_yield();  \
}while(0)

/*
 * Release: write 0 to the lock word through __sync_lock_release, which is a
 * release barrier. A plain `*lock = 0` only orders volatile accesses, so the
 * compiler could move ordinary stores made inside the critical section past
 * the unlock.
 */
#define spinlock_unlock(lock) do{ \
    __sync_lock_release(lock);  \
}while(0)

最终版的测试代码

#include"myspinlock.h"

// gcc -Wall -g -O3 -o myspinlock.out myspinlock.c -lpthread

// test

/* Custom spinlock under test; fun1 takes and releases it, and every worker
 * prints its value when it finishes. */
my_spinlock_t lock =  MY_SPINLOCK_INITIALIZER;
/* Shared counter bumped once per critical section; main prints it next to
 * the expected total N * NR. */
volatile int cnt = 0;

#include<pthread.h>
#include<stdio.h>
#include <stdlib.h>

/* Total number of critical sections across all threads. Parenthesised so the
 * macro acts as one value in any expression (e.g. `x / TOTAL`). */
#define  TOTAL (1000000 * 20)
/* Iterations each worker performs; main sets it to TOTAL / N before the
 * workers start. */
int NR;
/* Delay-loop iterations run inside every critical section; main may replace
 * it with the third command-line argument. */
int DELAY_CNT = 100;

/*
 * Worker that exercises the custom spinlock.
 *
 * Runs NR critical sections guarded by `lock`. Each one increments cnt and
 * then performs DELAY_CNT rounds of arithmetic on *foo as a synthetic
 * workload. (`foo` is not defined in this file; it is expected to come from
 * myspinlock.h.)
 *
 * arg: pointer to an int holding this worker's id (used only for printing).
 * Returns a null pointer.
 */
void * fun1(void * arg)
{
    const int tid = *(int *)arg;
    int round;

    printf("thread:%d\n", tid);

    round = 0;
    while (round < NR) {
        spinlock_lock(&lock);
        cnt++;
        for (int spin = 0; spin < DELAY_CNT; spin++)
            *foo = *foo * 33 + 17;
        spinlock_unlock(&lock);
        round++;
    }

    printf("thread:%d over, lock:%d\n", tid, lock);
    return NULL;
}

/* Statically initialised pthread mutex used by fun2. */
pthread_mutex_t mlock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Worker that exercises the pthread mutex.
 *
 * Runs NR critical sections guarded by mlock. Each one increments cnt and
 * then performs DELAY_CNT rounds of arithmetic on *foo as a synthetic
 * workload.
 *
 * arg: pointer to an int holding this worker's id (used only for printing).
 * Returns a null pointer. The final message prints the custom spinlock word
 * `lock`, not mlock.
 */
void * fun2(void * arg)
{
    const int tid = *(int *)arg;
    int round;

    printf("thread:%d\n", tid);

    round = 0;
    while (round < NR) {
        pthread_mutex_lock(&mlock);
        cnt++;
        for (int spin = 0; spin < DELAY_CNT; spin++)
            *foo = *foo * 33 + 17;
        pthread_mutex_unlock(&mlock);
        round++;
    }

    printf("thread:%d over, lock:%d\n", tid, lock);
    return NULL;
}

/* pthread spinlock used by fun3; main initialises it with pthread_spin_init. */
pthread_spinlock_t splock;

/*
 * Worker that exercises the pthread spinlock.
 *
 * Runs NR critical sections guarded by splock. Each one increments cnt and
 * then performs DELAY_CNT rounds of arithmetic on *foo as a synthetic
 * workload.
 *
 * arg: pointer to an int holding this worker's id (used only for printing).
 * Returns a null pointer. The final message prints the custom spinlock word
 * `lock`, not splock.
 */
void * fun3(void * arg)
{
    const int tid = *(int *)arg;
    int round;

    printf("thread:%d\n", tid);

    round = 0;
    while (round < NR) {
        pthread_spin_lock(&splock);
        cnt++;
        for (int spin = 0; spin < DELAY_CNT; spin++)
            *foo = *foo * 33 + 17;
        pthread_spin_unlock(&splock);
        round++;
    }

    printf("thread:%d over, lock:%d\n", tid, lock);
    return NULL;
}

/* Number of worker threads; the first command-line argument may replace it. */
int N = 20;

/*
 * Benchmark driver.
 *
 * Usage: prog [threads] [which] [delay]
 *   threads  worker count; values outside 2..20 fall back to 10
 *   which    0 = custom spinlock (fun1), 1 = pthread mutex (fun2),
 *            2 = pthread spinlock (fun3); out-of-range values fall back to 0
 *   delay    delay-loop iterations per critical section; values outside
 *            0..10000 fall back to 100
 *
 * Starts N workers that each run NR = TOTAL / N critical sections, joins
 * them, then prints the shared counter next to the expected value.
 *
 * Returns 0 on success, 1 if a worker thread could not be created.
 */
int main(int c, char * s[])
{
    int which = 0;
    if (c > 1) {
        /* thread count */
        N = atoi(s[1]);
        if (N > 20 || N <= 1) N = 10;
    }
    if (c > 2) {
        /* which worker function? */
        which = atoi(s[2]);
        if (which > 2 || which < 0) which = 0;
    }
    if (c > 3) {
        /* delay parameter */
        DELAY_CNT = atoi(s[3]);
        if (DELAY_CNT > 10000 || DELAY_CNT < 0) DELAY_CNT = 100;
    }

    pthread_t id[N];
    int args[N];
    int i;
    /* Dispatch table indexed by `which`. */
    void * (*fun[])(void *) = { fun1, fun2, fun3 };

    pthread_spin_init(&splock, 0);
    NR = TOTAL / N;

    for (i = 0; i < N; ++i) {
        /* Each worker gets a pointer to its own slot, so the id it reads
         * cannot change while later threads are being created. */
        args[i] = i;
        int rc = pthread_create(&id[i], NULL, fun[which], &args[i]);
        if (rc != 0) {
            /* Joining a never-created thread would be undefined; bail out. */
            fprintf(stderr, "pthread_create failed for thread %d: %d\n", i, rc);
            return 1;
        }
    }

    for (i = 0; i < N; ++i) {
        printf("join thread:%d\n", i);
        pthread_join(id[i], NULL);
        printf("join thread:%d done\n", i);
    }

    printf("cnt = %d, should be %d\n", cnt, N * NR);
    return 0;
}

===============================================
先前的更新仅仅做为参考

更新
重新修改了我的实现，加入了放弃时间片的情况，测试结果，几乎是mutex的2-3倍效率
real 0m0.431s
user 0m0.604s
sys 0m0.240s

我想这个应该就会是我理想中的最终版本了，起码可以抛弃 pthread 库的mutex实现一些简单的加锁的功能。

代码：

#ifndef MY_SPINLOCK_H
#define MY_SPINLOCK_H

#include<stdint.h>
#include<unistd.h>

/* Lock word: 0 when free, 1 when held. */
typedef volatile uint32_t my_spinlock_t;

#define MY_SPINLOCK_INITIALIZER 0
#define DELAY_NR 10000

/* Scratch word hashed by waiting threads to decide when to yield. It is a
 * plain per-translation-unit static and is updated without holding the lock. */
static uint32_t bar = 13;
static uint32_t *foo = &bar;

/*
 * In-place 32-bit integer mixing function. `a` must be a modifiable lvalue
 * and is evaluated several times.
 */
#define do_hash(a) do{  \
    (a) = ((a)+0x7ed55d16) + ((a)<<12); \
    (a) = ((a)^0xc761c23c) ^ ((a)>>19); \
    (a) = ((a)+0x165667b1) + ((a)<<5);  \
    (a) = ((a)+0xd3a2646c) ^ ((a)<<9);  \
    (a) = ((a)+0xfd7046c5) + ((a)<<3);  \
    (a) = ((a)^0xb55a4f09) ^ ((a)>>16); \
}while(0)

/*
 * Acquire: try to flip the lock word from 0 to 1 with a compare-and-swap.
 * On failure, spin reading the word until it looks free, re-hashing *foo on
 * every pass and calling sched_yield() whenever the hash is congruent to 1
 * modulo 11; then retry the compare-and-swap.
 * `lock` is a pointer to the lock word and is evaluated on every pass.
 */
#define my_spinlock_lock(lock) do{  \
    while(!__sync_bool_compare_and_swap((lock), 0, 1)) \
    { \
        while(*(lock)) \
        { \
            do_hash(*foo);  \
            if((*foo % 11) == 1) \
                sched_yield();  \
        } \
    } \
}while(0)

/*
 * Release: write 0 to the lock word through __sync_lock_release, which is a
 * release barrier. A plain `*lock = 0` only orders volatile accesses, so the
 * compiler could move ordinary stores made inside the critical section past
 * the unlock.
 */
#define my_spinlock_unlock(lock) do{ \
    __sync_lock_release(lock);  \
}while(0)

#endif

=======================================

最近在研究原子操作，按网上一些资料实现了个自旋锁
拿来和 posix 的mutex，spinlock 一起测，结果出乎我意料。
mutex的成绩非常好，我自己实现的稍微差点，posix 的pthread_spinlock_t的结果比较差。
这个真没想到，mutex的效率这么高，看到这个结果我都觉得不相信自己的眼睛了
还是印证了，不要靠自己感觉，实际数据才是最真实的。

谁能解释一下，谢谢~

环境：
uname -a
Linux bsd02 2.6.35.9 #1 SMP Tue Jan 11 02:09:50 EST 2011 x86_64 GNU/Linux
双核 Pentium(R) Dual-Core  CPU    E5400  @ 2.70GHz

并发20个线程测试，结果:
我的实现：
real 0m1.659s
user 0m3.276s
sys    0m0.000s

mutex：
real 0m1.481s
user 0m1.164s
sys    0m1.764s

pthread spinlock:
real 0m6.171s
user 0m12.301s
sys    0m0.004s

本篇来自: 百家学院 ( http://www.9php.com ),获取更多同类文章请访问以上网站.