Posix的线程终止有两种情况:正常终止和非正常终止。线程主动调用pthread_exit()或者从线程函数中return都将使线程正常退出,这是可预见的退出方式;非正常终止是线程在其他线程的干预下(比如被其他线程调用pthread_cancel取消),或者由于自身运行出错(比如访问非法地址)而退出,这种退出方式是不可预见的。不论是可预见的线程终止还是异常终止,都会存在资源释放的问题,在不考虑因运行出错而退出的前提下,如何保证线程终止时能顺利地释放掉自己所占用的资源,特别是锁资源,就是一个必须考虑解决的问题。
最经常出现的情形是资源独占锁的使用:线程为了访问临界资源而为其加上锁,但在访问过程中被外界取消,如果线程处于响应取消状态,且采用异步方式响应,或者在打开独占锁以前的运行路径上存在取消点,则该临界资源将永远处于锁定状态得不到释放。外界取消操作是不可预见的,因此的确需要一个机制来简化用于资源释放的编程。
POSIX中属于cancellation点(取消点)的函数:
pthread_join
pthread_cond_wait
pthread_cond_timedwait
pthread_testcancel
sem_wait
sigwait 都是cancellation点.
下面的这些系统函数也是cancellation点:
accept
fcntl
open
read
write
lseek
close
send
sendmsg
sendto
connect
recv
recvfrom
recvmsg
system
tcdrain
fsync
msync
pause
wait
waitpid
nanosleep
当其他线程调用pthread_cancel时,本线程会在执行到这些函数(取消点)时响应取消请求并退出线程。
默认测试代码如下:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
#include <sys/syscall.h>
/* Mutex that thread_function1 and thread_function2 both lock and unlock. */
pthread_mutex_t mutexA;
/* Set to 1 by main after its counting loop; thread_function2 then stays in
 * its inner wait loop while still holding mutexA. */
int thStop = 0;
/* Mode 2: main asks thread_function2 to exit via is_safe_exit instead of
 * calling pthread_cancel. */
int is_safemode = 0;
/* Set to 1 by main in safe mode; thread_function2 leaves its loops when both
 * is_safemode and is_safe_exit are non-zero. */
int is_safe_exit = 0;
/* Mode 3: thread_function2 disables cancellation while it holds mutexA. */
int is_setcancle = 0;
/* Mode 1: clean_function2_res only does its work when this is non-zero. */
int is_notify = 0;
/*
 * Worker 1: forever takes mutexA, reports the mutex state, holds the lock for
 * one second, releases it and rests for another second.
 *
 * arg is unused. The function never returns.
 */
void *thread_function1(void *arg)
{
    const long int process_id = getpid();
    const long int kernel_tid = syscall(SYS_gettid);
    const pthread_t self = (pthread_t)pthread_self();

    (void)arg;
    printf("thread[0x%lx][%ld][%ld] in function1\n", self, kernel_tid, process_id);

    for (;;) {
        /* Show who currently owns the mutex before blocking on it. */
        printf("function1 owner:%ld waiting lock owner:%d ...\n",
               kernel_tid, mutexA.__data.__owner);
        pthread_mutex_lock(&mutexA);

        printf("function1 mutex:owner::%d;count::%d;lock:%d\n",
               mutexA.__data.__owner,
               mutexA.__data.__count,
               mutexA.__data.__lock);
        printf("I an thread[0x%lx][%ld] function1\n", self, kernel_tid);
        sleep(1);

        pthread_mutex_unlock(&mutexA);
        sleep(1);
    }
}
/*
 * Cleanup handler registered by thread_function2.
 *
 * arg points to an int holding the registering thread's kernel thread id.
 * Nothing happens unless is_notify is set; when it is, mutexA is unlocked
 * only if its recorded owner equals that thread id.
 */
void clean_function2_res(void *arg)
{
    const int owner_tid = *(int *)arg;

    if (is_notify) {
        printf("clean function2 res lwpid:%d\n", owner_tid);
        if (owner_tid == mutexA.__data.__owner) {
            pthread_mutex_unlock(&mutexA);
            printf("clean function2 res lock\n");
        }
    }
}
/*
 * Worker 2: the thread that main cancels (or asks to exit) to demonstrate
 * what happens to mutexA when its holder terminates.
 *
 * While thStop is 0 it behaves like thread_function1: lock, print, sleep,
 * unlock, sleep. Once thStop is non-zero it keeps mutexA locked and waits in
 * an inner loop:
 *   - is_safemode && is_safe_exit: leave the inner loop, unlock, and exit
 *     through pthread_exit().
 *   - is_setcancle: disable cancellation while the lock is held; after more
 *     than 10 polls, unlock, re-enable cancellation and call
 *     pthread_testcancel().
 *   - otherwise: keep polling with the lock held. If the thread is cancelled
 *     here, mutexA is only released when clean_function2_res does it.
 *
 * arg is unused. Returns NULL if the function ever runs to its end.
 */
void *thread_function2(void *arg)
{
    int oldstate = 0;
    int waitCount = 0;
    long int pid = getpid();
    int lwpId = syscall(SYS_gettid);
    pthread_t threadId = (pthread_t)pthread_self();

    (void)arg;
    printf("thread[0x%lx][%d][%ld] in function2\n", threadId, lwpId, pid);

    /* push/pop are macros and must stay paired in this lexical scope. */
    pthread_cleanup_push(clean_function2_res, (void *)&lwpId);
    while (1)
    {
        printf("function2 owner:%d waiting lock owner:%d ...\n", lwpId, mutexA.__data.__owner);
        pthread_mutex_lock(&mutexA);
        printf("function2 mutex:owner::%d;count::%d;lock:%d\n",
               mutexA.__data.__owner, mutexA.__data.__count, mutexA.__data.__lock);
        if (thStop)
        {
            /* Hold the lock and wait to be cancelled or told to exit. */
            while (1)
            {
                if ((is_safemode) && (is_safe_exit))
                {
                    break;
                }
                /* Fixed: the thread id is printed in hex to match the "0x" prefix. */
                printf("waiting thread[0x%lx] cancel...\n", threadId);
                usleep(500000);
                if (is_setcancle)
                {
                    waitCount++;
                    /* Ignore cancel requests while the lock is held. */
                    pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &oldstate);
                    printf("pthread cancel oldstatue:%d;[%d]:[%d]\n", oldstate, PTHREAD_CANCEL_DISABLE, PTHREAD_CANCEL_ENABLE);
                    if (waitCount > 10)
                    {
                        printf("it will into cancel pthread point\n");
                        /* Release the lock first, then allow cancellation again. */
                        pthread_mutex_unlock(&mutexA);
                        sleep(1);
                        pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
                        printf("waiting cancel testcancel point\n");
                        pthread_testcancel();
                        /* Only reached when no cancel request was pending. */
                        printf("test cancel point\n");
                        while (1)
                        {
                            printf("waiting cancel pthread...\n");
                            usleep(500000);
                        }
                    }
                }
            }
        }
        else
        {
            printf("I an thread[0x%lx][%d] function2\n", threadId, lwpId);
            sleep(1);
        }
        pthread_mutex_unlock(&mutexA);
        sleep(1);
        if ((is_safemode) && (is_safe_exit))
        {
            break;
        }
    }
    if (is_safemode)
    {
        printf("exit pthread by safe mode\n");
        pthread_exit(NULL);
    }
    pthread_cleanup_pop(0);
    /* Fixed: a void * function must not fall off its end; pthread_join reads this value. */
    return NULL;
}
/*
 * Demo driver.
 *
 * The optional first argument selects the mode:
 *   0 (default) - cancel thread_function2 with no cleanup in effect
 *   1           - is_notify: the cleanup handler releases the lock
 *   2           - is_safemode: ask the thread to exit instead of cancelling it
 *   3           - is_setcancle: the thread disables cancellation while locked
 *
 * Starts both workers, counts for 10 seconds, sets thStop, waits 3 seconds,
 * then either sets is_safe_exit or cancels thread_function2, joins it, and
 * finally prints forever.
 *
 * Returns 1 if a worker thread cannot be created; otherwise it never returns.
 */
int main(int avgc,char **pp_argv)
{
    pthread_t mthid;
    unsigned int count = 0;
    void *thread_ret = NULL;
    int err = 0;
    int mode = 0;

    if (avgc >= 2)
    {
        mode = atoi(pp_argv[1]);
    }
    switch (mode)
    {
    case 1:
        is_notify = 1;
        break;
    case 2:
        is_safemode = 1;
        break;
    case 3:
        is_setcancle = 1;
        break;
    case 0:
    default:
        break;
    }
    printf("notify clean mode:%d\n", is_notify);
    printf("safe mode:%d\n", is_safemode);
    printf("set cancle mode:%d\n", is_setcancle);
    is_safe_exit = 0;
    thStop = 0;
    pthread_mutex_init(&mutexA, NULL);

    err = pthread_create(&mthid, NULL, thread_function1, NULL);
    if (err != 0)
    {
        fprintf(stderr, "pthread_create thread_function1 failed: %s\n", strerror(err));
        return 1;
    }
    printf("create thread:0x%lx\n", mthid);

    /* mthid is deliberately reused: only thread_function2 is cancelled and joined. */
    err = pthread_create(&mthid, NULL, thread_function2, NULL);
    if (err != 0)
    {
        fprintf(stderr, "pthread_create thread_function2 failed: %s\n", strerror(err));
        return 1;
    }
    printf("create thread:0x%lx\n", mthid);

    do {
        sleep(1);
        count++;
        /* Fixed: count is unsigned, so it is printed with %u. */
        printf("main thread count:%u...\n", count);
    } while (count < 10);

    thStop = 1;
    sleep(3);
    if (is_safemode)
    {
        is_safe_exit = 1;
    }
    else
    {
        pthread_cancel(mthid);
    }

    /* Fixed: pthread_join stores a void *, so the target must be a void *
     * object; writing it into an int overflows the int on LP64 targets. */
    err = pthread_join(mthid, &thread_ret);
    if (err != 0)
    {
        fprintf(stderr, "pthread_join failed: %s\n", strerror(err));
    }

    while (1)
    {
        printf("main thread function...\n");
        sleep(1);
    }

    /* Not reached: the loop above never exits. */
    pthread_mutex_destroy(&mutexA);
    return 0;
}
编译:gcc -g mylock.c -lpthread -o mylock
复现问题:./mylock 0 强制进入死锁环境;
主线程设置thStop = 1,让thread_function2在持有互斥锁的状态下等待,然后调用pthread_cancel(mthid)终止线程thread_function2;由于thread_function2退出时没有释放互斥锁,thread_function1无法再获取该互斥锁,从而发生死锁、停止运行;
解决方案1,注册线程清理回调
void pthread_cleanup_push(void (*routine) (void *), void *arg)
void pthread_cleanup_pop(int execute)
pthread_cleanup_push()/pthread_cleanup_pop()采用先入后出的栈结构管理,void routine(void *arg)函数在调用pthread_cleanup_push()时压入清理函数栈,多次对pthread_cleanup_push() 的调用将在清理函数栈中形成一个函数链;从pthread_cleanup_push的调用点到pthread_cleanup_pop之间的程序段中的终止动作(包括调用pthread_exit()、pthread_cancel和异常终止,不包括return)都将执行pthread_cleanup_push()所指定的清理函数。
运行结果参考 ./mylock 1
解决方案2,线程安全退出,外部线程不要采用pthread_cancel结束线程,而是采用通知方法,由本线程接受到消息或参数后释放资源安全退出,
运行结果参考 ./mylock 2
解决方案3,在访问公共资源(持有锁)期间,暂时禁止线程对pthread_cancel的响应。
设置本线程对Cancel信号的反应,state有两种值:PTHREAD_CANCEL_ENABLE(缺省)和 PTHREAD_CANCEL_DISABLE,分别表示收到信号后设为CANCELED状态和忽略CANCEL信号继续运行;old_state如果不为 NULL则存入原来的Cancel状态以便恢复。
pthread_setcancelstate(PTHREAD_CANCEL_DISABLE,&oldstate);
/***free resource安全执行完代码***/
pthread_setcancelstate(PTHREAD_CANCEL_ENABLE,NULL);
设置取消点 pthread_testcancel,
运行结果参考 ./mylock 3
gdb 调试mutexA 数据:
#gdb ./mylock
(gdb) b thread_function1
(gdb) n
(gdb) ptype pthread_mutex_t
(gdb) p &mutexA
$4 = (pthread_mutex_t *) 0x602100 <mutexA>
(gdb) p {pthread_mutex_t} 0x602100
$5 = {__data = {__lock = 1, __count = 0, __owner = 10237, __nusers = 1, __kind = 0, __spins = 0, __elision = 0, __list = {__prev = 0x0,
__next = 0x0}}, __size = "\001\000\000\000\000\000\000\000\375'\000\000\001", '\000' <repeats 26 times>, __align = 1}
(gdb) p mutexA
(gdb) b thread_function2