死锁是什么
两个或多个进程/线程在等待对方占用的资源,结果都无法往下继续执行。
死锁的模拟
启动四个线程t1,t2,t3,t4,各自锁上资源A,B,C,D,然后t1锁B,t2锁C,t3锁D,t4锁A。
死锁的检测
由上图可以看出,线程对资源的请求形成了一个环路,所以判断是否存在死锁可以转化为判断是否有资源环路。这里将互斥量作为竞争的资源。
这里数据结构使用有向邻接表。
在将死锁检测转换为有向邻接表判断有环前,先介绍一下邻接表。
邻接表
当边数相对于顶点数较少时(稀疏图),邻接矩阵会浪费很多存储空间,因此这里使用邻接表。
然后用DFS判断有向邻接表是否有环。
将死锁检测转换为有向邻接表判断有环
邻接表顶点数组里存储的是线程资源:包括线程ID。
线程t1想要对已经被t2占用的互斥量mtx2加锁时,就添加一条t1->t2的弧到t1的弧链表。线程t1获取到mtx2时(此时t2早已解锁),则删除这条弧。
然后新开一个线程定时检测邻接表是否有环,有则将环打印出来。
根据上面的流程提出三个原语操作:加锁前,加锁后,解锁后。
- 加锁前
上锁前如果有其它线程在占用锁,则添加1条弧
- 加锁后
加锁后如果锁数组没有mtx,则添加
如果有mtx,说明之前一直阻塞等待锁,其它线程刚刚释放锁,本线程获取锁,删除弧,修改锁线程ID
- 解锁后
解锁后如果mtx没有线程在等待,则从锁数组移除
hook
用hook来在加锁,解锁调用前后添加自定义代码。
// Function-pointer type matching the signature of pthread_mutex_lock;
// pthread_mutex_unlock has the same signature, so the type is reused for it.
typedef int (*pthread_mutex_lock_symbol)(pthread_mutex_t *mtx);
// Assigned by init_hook() from dlsym(RTLD_NEXT, "pthread_mutex_lock").
pthread_mutex_lock_symbol pthread_mutex_lock_s;
// Assigned by init_hook() from dlsym(RTLD_NEXT, "pthread_mutex_unlock").
pthread_mutex_lock_symbol pthread_mutex_unlock_s;
/**
 * Look up the next definitions of pthread_mutex_lock and
 * pthread_mutex_unlock with dlsym(RTLD_NEXT, ...) and store them in
 * pthread_mutex_lock_s / pthread_mutex_unlock_s.
 * A failed lookup is only reported on stdout; the pointer stays NULL.
 */
void init_hook() {
    const char *lock_name = "pthread_mutex_lock";
    const char *unlock_name = "pthread_mutex_unlock";

    pthread_mutex_lock_s = dlsym(RTLD_NEXT, lock_name);
    if (NULL == pthread_mutex_lock_s) {
        printf("dlsym failed: %s\n", dlerror());
    }

    pthread_mutex_unlock_s = dlsym(RTLD_NEXT, unlock_name);
    if (NULL == pthread_mutex_unlock_s) {
        printf("dlsym 2 failed: %s\n", dlerror());
    }
}
/**
 * Interposed pthread_mutex_lock.
 * Logs the call, runs the pre-lock bookkeeping, forwards to the function
 * stored in pthread_mutex_lock_s, then runs the post-lock bookkeeping.
 *
 * @param mtx mutex being locked
 * @return the value returned by the forwarded pthread_mutex_lock call
 *         (the original fell off the end without returning anything)
 */
int pthread_mutex_lock(pthread_mutex_t *mtx) {
    // %p expects void *, %lu expects unsigned long: cast explicitly.
    printf("lock %p, thread %lu\n", (void *)mtx, (unsigned long)pthread_self());
    do_before_lock(mtx);
    int rc = pthread_mutex_lock_s(mtx);
    do_after_lock(mtx);
    return rc;
}
/**
 * Interposed pthread_mutex_unlock.
 * Logs the call, forwards to the function stored in pthread_mutex_unlock_s,
 * then runs the post-unlock bookkeeping.
 *
 * @param mtx mutex being unlocked
 * @return the value returned by the forwarded pthread_mutex_unlock call
 *         (the original fell off the end without returning anything)
 */
int pthread_mutex_unlock(pthread_mutex_t *mtx) {
    // %p expects void *, %lu expects unsigned long: cast explicitly.
    printf("unlock %p, thread %lu\n", (void *)mtx, (unsigned long)pthread_self());
    int rc = pthread_mutex_unlock_s(mtx);
    do_after_unlock(mtx);
    return rc;
}
三个锁原语操作
/**
 * Pre-lock hook.
 * If mtx is already present in the lock table (another thread is recorded
 * as holding it), make sure both threads are vertices, add one arc
 * "calling thread -> recorded owner" unless it already exists, and
 * increment the lock's use_count.
 *
 * @param mtx mutex the calling thread is about to lock
 */
void do_before_lock(pthread_mutex_t *mtx) {
    int lindex = get_lock_data_index(mtx);
    if (-1 == lindex) {
        // Lock is not recorded as held: nothing to track.
        return;
    }

    // Zero-initialize so lock_addr/use_count are not indeterminate when the
    // whole struct is copied into the vertex table by add_vertex().
    data_t head = {0};
    head.tid = pthread_self();
    add_vertex(head);

    data_t tail = {0};
    tail.tid = al.ldarr[lindex].tid;
    add_vertex(tail);

    if (!is_arc_exist(head, tail)) {
        add_arc(head, tail);
        // TODO: a cycle check could be run right here and a deadlock reported
    }
    ++al.ldarr[lindex].use_count;
}
/**
* 加锁后如果锁数组没有mtx,则添加
* 如果有mtx,说明之前一直阻塞等待锁,其它线程刚刚释放锁,本线程获取锁,删除弧,修改锁线程ID
*/
void do_after_lock(pthread_mutex_t *mtx) {
int lindex = get_lock_data_index(mtx);
if (-1 == lindex) {
data_t tail;
tail.tid = pthread_self();
tail.lock_addr = mtx;
tail.use_count = 0;
al.ldarr[get_unused_lock_data_index()] = tail;
}
else {
pthread_t tid = pthread_self();
int vindex = find_vertex(tid);
if (-1 == vindex)
{
return;
}
data_t head = al.varr[vindex].d;
data_t tail = al.ldarr[lindex];
if (is_arc_exist(head, tail)) {
remove_arc(head, tail);
}
--al.ldarr[lindex].use_count;
// 忘了
al.ldarr[lindex].tid = tid;
}
}
/**
* 解锁后如果mtx没有线程在等待,则从锁数组移除
*/
void do_after_unlock(pthread_mutex_t *mtx) {
int lindex = get_lock_data_index(mtx);
if (-1 == lindex) {
return;
}
if (0 == al.ldarr[lindex].use_count) {
al.ldarr[lindex].lock_addr = NULL;
al.ldarr[lindex].tid = 0;
}
}
启动线程定时检测环
/**
 * Thread entry point of the detector.
 * Every 3 seconds, runs checkGraphCycle() on each vertex index in turn,
 * stopping the pass as soon as has_deadlock is 1. Never leaves the loop.
 *
 * @param arg unused
 */
void* checkDeadlockRoutine(void *arg) {
    for (;;) {
        sleep(3);
        int v = 0;
        while (v < al.vnum && 1 != has_deadlock) {
            checkGraphCycle(v);
            ++v;
        }
    }
    return NULL;
}
/**
 * Look for a cycle that starts at vertex idx.
 * Resets the path to contain only idx, marks idx visited, then runs DFS()
 * from each node in idx's arc list, clearing the visited flag of every
 * other vertex before each run.
 *
 * @param idx index into al.varr of the start vertex
 */
void checkGraphCycle(int idx) {
    int i;
    if (idx < 0 || idx >= al.vnum) {
        return;
    }
    k = 0;
    path[k++] = idx;
    visited[idx] = 1;
    // Fixed: the original indexed al.varr with the uninitialized loop
    // variable i instead of the start vertex idx.
    vertex_t *p = al.varr[idx].next;
    // Walk the adjacent nodes of idx.
    while (p) {
        for (i = 0; i < al.vnum; ++i) {
            if (i != idx) {
                visited[i] = 0;
            }
        }
        DFS(p);
        --k; // drop the entry pushed by DFS(p)
        p = p->next;
    }
}
/**
 * Depth-first step of the cycle search.
 * Maps p's thread id to its vertex index, appends that index to path, and
 * reports a deadlock (sets has_deadlock, prints the path) when the vertex
 * was already visited. Otherwise marks it visited and recurses into each
 * node of its arc list, popping one path entry after every recursive call.
 *
 * @param p arc-list node whose d.tid names the vertex to visit
 */
void DFS(vertex_t *p) {
    const int v = find_vertex(p->d.tid);
    if (v == -1) {
        return;
    }

    path[k] = v;
    ++k;

    if (visited[v] == 1) {
        has_deadlock = 1;
        printf("存在死锁\n");
        printCycle();
        return;
    }
    visited[v] = 1;

    // The original author notes that a thread can block on only one
    // pthread_mutex_lock at a time, so a vertex is expected to have at most
    // one outgoing arc.
    for (vertex_t *succ = al.varr[v].next; succ != NULL; succ = succ->next) {
        DFS(succ);
        --k;
    }
}
完整代码
/**
* 死锁检测 -> 有向图成环
* 死锁的模拟
* HOOK, 三个原语操作:锁前,锁后,解锁后
* 有向邻接表的基本操作:添加/删除/查找顶点,添加/删除弧,判断弧是否存在
* 启动线程定时检测是否有环,DFS
*/
// long long是8个字节
// 32位 linux,指针和long是4字节
// 64位 linux,指针和long是8字节
#define _GNU_SOURCE // 启用RTLD_NEXT
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <pthread.h>
#include <dlfcn.h>
#define MAX_VERTEX 100
// Short names for the three structs defined below.
typedef struct vertex_s vertex_t;
typedef struct data_s data_t;
typedef struct adjlist_s adjlist_t;
// Function-pointer type with the signature of pthread_mutex_lock
// (also used for the pthread_mutex_unlock pointer, which has the same signature).
typedef int (*pthread_mutex_lock_symbol)(pthread_mutex_t *mtx);
// Payload shared by vertex entries, arc-list nodes and lock-table entries.
struct data_s {
pthread_t tid; // thread id (for a lock-table entry: the recorded owner)
void *lock_addr; // mutex address; set by do_after_lock for lock-table entries, NULL marks a free slot
int use_count; // lock-table entries: incremented in do_before_lock, decremented in do_after_lock
};
/**
 * Vertex entries and arc-list nodes share this struct.
 * Entries of the vertex array represent threads; their next pointer heads
 * the thread's arc list.
 * Arc-list nodes are allocated by create_vertex(); their d.tid is the
 * thread the arc points to.
 */
struct vertex_s {
data_t d;
vertex_t *next;
};
// Directed adjacency list plus the table of tracked locks.
struct adjlist_s {
vertex_t varr[MAX_VERTEX]; // vertex array (threads); each entry's next heads its arc list
int vnum; // number of vertices in use
data_t ldarr[MAX_VERTEX]; // lock table: one entry per tracked mutex
int ldindex; // exclusive upper bound of the slots scanned by get_lock_data_index()
// TODO: both arrays need to be protected by a lock
#if 0
pthread_mutex_t mtx;
#endif
};
// The single global graph and lock table, zero-initialized.
adjlist_t al = {0};
// visited[i] is 1 once vertex i has been reached during the current search
int visited[MAX_VERTEX] = {0};
// Set to 1 by DFS() when a cycle is found
int has_deadlock = 0;
// Vertex indices along the current search path
int path[MAX_VERTEX] = {0};
// Number of entries currently in path (largest used index + 1)
int k = 0;
// Pointers assigned by init_hook() via dlsym(RTLD_NEXT, ...)
pthread_mutex_lock_symbol pthread_mutex_lock_s;
pthread_mutex_lock_symbol pthread_mutex_unlock_s;
/**
 * Search the vertex array for a thread id.
 *
 * @param tid thread id to look for
 * @return index of the first matching vertex in al.varr, or -1 if none
 */
int find_vertex(pthread_t tid) {
    int pos = 0;
    while (pos < al.vnum) {
        if (tid == al.varr[pos].d.tid) {
            return pos;
        }
        ++pos;
    }
    return -1;
}
/**
 * Append a vertex for d.tid to the vertex array unless one already exists.
 * When the array already holds MAX_VERTEX entries the vertex is dropped and
 * a message is written to stderr (the original wrote past the end of
 * al.varr in that case).
 *
 * @param d payload copied into the new vertex; d.tid identifies the thread
 */
void add_vertex(data_t d) {
    if (-1 != find_vertex(d.tid)) {
        return; // already present
    }
    if (al.vnum >= MAX_VERTEX) {
        fprintf(stderr, "vertex table full, thread not tracked\n");
        return;
    }
    al.varr[al.vnum].d = d;
    al.varr[al.vnum].next = NULL;
    ++al.vnum;
}
/**
 * Allocate a node for an arc list.
 *
 * @param d payload copied into the node
 * @return heap-allocated node with next == NULL, or NULL (after printing
 *         "calloc failed") when allocation fails; remove_arc() frees nodes
 */
vertex_t* create_vertex(data_t d) {
    vertex_t *node = calloc(1, sizeof *node);
    if (NULL != node) {
        node->d = d;
        node->next = NULL;
        return node;
    }
    printf("calloc failed\n");
    return NULL;
}
/**
 * Add the arc head -> tail.
 * Ensures both threads are vertices, then appends a new node carrying
 * tail to the end of head's arc list. Does nothing further when head
 * cannot be found in the vertex array.
 *
 * @param head payload whose tid is the source thread
 * @param tail payload stored in the new arc node (tid is the target thread)
 */
void add_arc(data_t head, data_t tail) {
    add_vertex(head);
    add_vertex(tail);

    int src = find_vertex(head.tid);
    if (src == -1) {
        return;
    }

    // Advance to the NULL link that terminates the list and attach there.
    vertex_t **link = &al.varr[src].next;
    while (*link != NULL) {
        link = &(*link)->next;
    }
    *link = create_vertex(tail);
}
/**
 * Remove the arc head -> tail.
 * Unlinks and frees the first node in head's arc list whose tid equals
 * tail.tid. Prints a diagnostic and returns when either thread is not a
 * vertex; returns silently when no such arc exists.
 *
 * @param head payload whose tid is the source thread
 * @param tail payload whose tid is the target thread
 */
void remove_arc(data_t head, data_t tail) {
    int src = find_vertex(head.tid);
    int dst = find_vertex(tail.tid);
    if (src == -1 || dst == -1) {
        printf("-1 == hindex || -1 == tindex\n");
        return;
    }

    for (vertex_t **link = &al.varr[src].next; *link != NULL; link = &(*link)->next) {
        if ((*link)->d.tid == tail.tid) {
            vertex_t *victim = *link;
            *link = victim->next;
            free(victim);
            return;
        }
    }
}
/**
 * Test whether the arc head -> tail exists.
 *
 * @param head payload whose tid is the source thread
 * @param tail payload whose tid is the target thread
 * @return 1 if head's arc list contains a node with tail.tid, otherwise 0
 *         (also 0 when head is not a vertex)
 */
int is_arc_exist(data_t head, data_t tail) {
    int src = find_vertex(head.tid);
    if (src == -1) {
        return 0;
    }
    for (vertex_t *node = al.varr[src].next; node != NULL; node = node->next) {
        if (node->d.tid == tail.tid) {
            return 1;
        }
    }
    return 0;
}
/**
 * Look up the next definitions of pthread_mutex_lock and
 * pthread_mutex_unlock with dlsym(RTLD_NEXT, ...) and store them in
 * pthread_mutex_lock_s / pthread_mutex_unlock_s.
 * A failed lookup is only reported on stdout; the pointer stays NULL.
 */
void init_hook() {
    const char *lock_name = "pthread_mutex_lock";
    const char *unlock_name = "pthread_mutex_unlock";

    pthread_mutex_lock_s = dlsym(RTLD_NEXT, lock_name);
    if (NULL == pthread_mutex_lock_s) {
        printf("dlsym failed: %s\n", dlerror());
    }

    pthread_mutex_unlock_s = dlsym(RTLD_NEXT, unlock_name);
    if (NULL == pthread_mutex_unlock_s) {
        printf("dlsym 2 failed: %s\n", dlerror());
    }
}
/**
 * Atomically add `add` to *value.
 * Uses the compiler's atomic fetch-and-add builtin instead of hand-written
 * x86 `lock xadd` assembly, so it builds on non-x86 targets and does not
 * rely on an asm constraint that listed the written memory operand as an
 * input only.
 *
 * @param value integer to update
 * @param add   amount to add (may be negative)
 * @return the value *value held before the addition
 */
int inc(int *value, int add) {
    return __atomic_fetch_add(value, add, __ATOMIC_SEQ_CST);
}
int get_lock_data_index(pthread_mutex_t *mtx) {
int i;
for (i = 0; i < al.ldindex; ++i) {
if (al.ldarr[i].lock_addr == mtx) {
return i;
}
}
return -1;
}
/**
 * Find the first free slot (lock_addr == NULL) in the lock table and make
 * sure al.ldindex covers it.
 *
 * al.ldindex is the exclusive scan bound used by get_lock_data_index(), so
 * it is kept as a high-water mark: raised to i + 1 only when slot i lies
 * at or beyond it. The original incremented it on every call, including
 * when a low slot was reused, so it grew without bound; the earlier
 * attempt left in the comments (`if (i > ldindex) ldindex = i`) was off by
 * one.
 *
 * NOTE(review): the table is updated without synchronization, as flagged
 * by the TODO in struct adjlist_s.
 *
 * @return free slot index in [0, MAX_VERTEX), or MAX_VERTEX when the table
 *         is full; callers must check before indexing al.ldarr
 */
int get_unused_lock_data_index() {
    int i;
    for (i = 0; i < MAX_VERTEX; ++i) {
        if (NULL == al.ldarr[i].lock_addr) {
            if (i >= al.ldindex) {
                al.ldindex = i + 1;
            }
            return i;
        }
    }
    return MAX_VERTEX;
}
/**
 * Pre-lock hook.
 * If mtx is already present in the lock table (another thread is recorded
 * as holding it), make sure both threads are vertices, add one arc
 * "calling thread -> recorded owner" unless it already exists, and
 * increment the lock's use_count.
 *
 * @param mtx mutex the calling thread is about to lock
 */
void do_before_lock(pthread_mutex_t *mtx) {
    int lindex = get_lock_data_index(mtx);
    if (-1 == lindex) {
        // Lock is not recorded as held: nothing to track.
        return;
    }

    // Zero-initialize so lock_addr/use_count are not indeterminate when the
    // whole struct is copied into the vertex table by add_vertex().
    data_t head = {0};
    head.tid = pthread_self();
    add_vertex(head);

    data_t tail = {0};
    tail.tid = al.ldarr[lindex].tid;
    add_vertex(tail);

    if (!is_arc_exist(head, tail)) {
        add_arc(head, tail);
        // TODO: a cycle check could be run right here and a deadlock reported
    }
    ++al.ldarr[lindex].use_count;
}
/**
* 加锁后如果锁数组没有mtx,则添加
* 如果有mtx,说明之前一直阻塞等待锁,其它线程刚刚释放锁,本线程获取锁,删除弧,修改锁线程ID
*/
void do_after_lock(pthread_mutex_t *mtx) {
int lindex = get_lock_data_index(mtx);
if (-1 == lindex) {
data_t tail;
tail.tid = pthread_self();
tail.lock_addr = mtx;
tail.use_count = 0;
al.ldarr[get_unused_lock_data_index()] = tail;
}
else {
pthread_t tid = pthread_self();
int vindex = find_vertex(tid);
if (-1 == vindex)
{
return;
}
data_t head = al.varr[vindex].d;
data_t tail = al.ldarr[lindex];
if (is_arc_exist(head, tail)) {
remove_arc(head, tail);
}
--al.ldarr[lindex].use_count;
// 忘了
al.ldarr[lindex].tid = tid;
}
}
/**
* 解锁后如果mtx没有线程在等待,则从锁数组移除
*/
void do_after_unlock(pthread_mutex_t *mtx) {
int lindex = get_lock_data_index(mtx);
if (-1 == lindex) {
return;
}
if (0 == al.ldarr[lindex].use_count) {
al.ldarr[lindex].lock_addr = NULL;
al.ldarr[lindex].tid = 0;
}
}
/**
 * Interposed pthread_mutex_lock.
 * Logs the call, runs the pre-lock bookkeeping, forwards to the function
 * stored in pthread_mutex_lock_s, then runs the post-lock bookkeeping.
 *
 * @param mtx mutex being locked
 * @return the value returned by the forwarded pthread_mutex_lock call
 *         (the original fell off the end without returning anything)
 */
int pthread_mutex_lock(pthread_mutex_t *mtx) {
    // %p expects void *, %lu expects unsigned long: cast explicitly.
    printf("lock %p, thread %lu\n", (void *)mtx, (unsigned long)pthread_self());
    do_before_lock(mtx);
    int rc = pthread_mutex_lock_s(mtx);
    do_after_lock(mtx);
    return rc;
}
/**
 * Interposed pthread_mutex_unlock.
 * Logs the call, forwards to the function stored in pthread_mutex_unlock_s,
 * then runs the post-unlock bookkeeping.
 *
 * @param mtx mutex being unlocked
 * @return the value returned by the forwarded pthread_mutex_unlock call
 *         (the original fell off the end without returning anything)
 */
int pthread_mutex_unlock(pthread_mutex_t *mtx) {
    // %p expects void *, %lu expects unsigned long: cast explicitly.
    printf("unlock %p, thread %lu\n", (void *)mtx, (unsigned long)pthread_self());
    int rc = pthread_mutex_unlock_s(mtx);
    do_after_unlock(mtx);
    return rc;
}
/**
 * Print the thread ids of the vertices stored in path[0..k-1] as
 * "thread A ->thread B ->...thread Z" followed by a newline.
 * Prints nothing when the path is empty.
 */
void printCycle() {
    int i;
#if 0
    printf("path[]: ");
    for (i = 0; i < k; ++i) {
        printf("%d ", path[i]);
    }
#else
    if (k < 1) {
        return;
    }
    const int last = k - 1;
    // Every entry but the last is followed by an arrow.
    for (i = 0; i < last; ++i) {
        printf("thread %lu ->", al.varr[path[i]].d.tid);
    }
    printf("thread %lu\n", al.varr[path[last]].d.tid);
#endif
}
/**
 * Depth-first step of the cycle search.
 * Maps p's thread id to its vertex index, appends that index to path, and
 * reports a deadlock (sets has_deadlock, prints the path) when the vertex
 * was already visited. Otherwise marks it visited and recurses into each
 * node of its arc list, popping one path entry after every recursive call.
 *
 * @param p arc-list node whose d.tid names the vertex to visit
 */
void DFS(vertex_t *p) {
    const int v = find_vertex(p->d.tid);
    if (v == -1) {
        return;
    }

    path[k] = v;
    ++k;

    if (visited[v] == 1) {
        has_deadlock = 1;
        printf("存在死锁\n");
        printCycle();
        return;
    }
    visited[v] = 1;

    // The original author notes that a thread can block on only one
    // pthread_mutex_lock at a time, so a vertex is expected to have at most
    // one outgoing arc.
    for (vertex_t *succ = al.varr[v].next; succ != NULL; succ = succ->next) {
        DFS(succ);
        --k;
    }
}
/**
 * Look for a cycle that starts at vertex idx.
 * Resets the path to contain only idx, marks idx visited, then runs DFS()
 * from each node in idx's arc list, clearing the visited flag of every
 * other vertex before each run.
 *
 * @param idx index into al.varr of the start vertex
 */
void checkGraphCycle(int idx) {
    int i;
    if (idx < 0 || idx >= al.vnum) {
        return;
    }
    k = 0;
    path[k++] = idx;
    visited[idx] = 1;
    // Fixed: the original indexed al.varr with the uninitialized loop
    // variable i instead of the start vertex idx.
    vertex_t *p = al.varr[idx].next;
    // Walk the adjacent nodes of idx.
    while (p) {
        for (i = 0; i < al.vnum; ++i) {
            if (i != idx) {
                visited[i] = 0;
            }
        }
        DFS(p);
        --k; // drop the entry pushed by DFS(p)
        p = p->next;
    }
}
/**
 * Thread entry point of the detector.
 * Every 3 seconds, runs checkGraphCycle() on each vertex index in turn,
 * stopping the pass as soon as has_deadlock is 1. Never leaves the loop.
 *
 * @param arg unused
 */
void* checkDeadlockRoutine(void *arg) {
    for (;;) {
        sleep(3);
        int v = 0;
        while (v < al.vnum && 1 != has_deadlock) {
            checkGraphCycle(v);
            ++v;
        }
    }
    return NULL;
}
/**
 * Start the background detector thread running checkDeadlockRoutine().
 * The thread is never joined, so it is detached to let its resources be
 * reclaimed; a pthread_create failure is reported on stderr instead of
 * being ignored.
 */
void checkDeadlock() {
    pthread_t tid;
    int rc = pthread_create(&tid, NULL, checkDeadlockRoutine, NULL);
    if (0 != rc) {
        fprintf(stderr, "pthread_create failed: %d\n", rc);
        return;
    }
    pthread_detach(tid);
}
// Four statically initialized mutexes used by worker1..worker4: each worker
// locks one of them, sleeps, then locks the next one (worker4 wraps to mtx1).
static pthread_mutex_t mtx1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mtx2 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mtx3 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mtx4 = PTHREAD_MUTEX_INITIALIZER;
/**
 * Demo thread 1: locks mtx1, sleeps one second, then locks mtx2.
 *
 * @param arg unused
 * @return NULL (the original reached the end of a void * function without
 *         returning a value)
 */
void* worker1(void *arg) {
    (void)arg;
    printf("thread1 %lu\n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mtx1);
    // Give all four threads time to take their first lock.
    sleep(1);
    pthread_mutex_lock(&mtx2);
    pthread_mutex_unlock(&mtx2);
    pthread_mutex_unlock(&mtx1);
    return NULL;
}
/**
 * Demo thread 2: locks mtx2, sleeps one second, then locks mtx3.
 *
 * @param arg unused
 * @return NULL (the original reached the end of a void * function without
 *         returning a value)
 */
void* worker2(void *arg) {
    (void)arg;
    printf("thread2 %lu\n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mtx2);
    // Give all four threads time to take their first lock.
    sleep(1);
    pthread_mutex_lock(&mtx3);
    pthread_mutex_unlock(&mtx3);
    pthread_mutex_unlock(&mtx2);
    return NULL;
}
/**
 * Demo thread 3: locks mtx3, sleeps one second, then locks mtx4.
 *
 * @param arg unused
 * @return NULL (the original reached the end of a void * function without
 *         returning a value)
 */
void* worker3(void *arg) {
    (void)arg;
    printf("thread3 %lu\n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mtx3);
    // Give all four threads time to take their first lock.
    sleep(1);
    pthread_mutex_lock(&mtx4);
    pthread_mutex_unlock(&mtx4);
    pthread_mutex_unlock(&mtx3);
    return NULL;
}
/**
 * Demo thread 4: locks mtx4, sleeps one second, then locks mtx1, which
 * closes the wait cycle with worker1..worker3.
 *
 * @param arg unused
 * @return NULL (the original reached the end of a void * function without
 *         returning a value)
 */
void* worker4(void *arg) {
    (void)arg;
    printf("thread4 %lu\n", (unsigned long)pthread_self());
    pthread_mutex_lock(&mtx4);
    // Give all four threads time to take their first lock.
    sleep(1);
    pthread_mutex_lock(&mtx1);
    pthread_mutex_unlock(&mtx1);
    pthread_mutex_unlock(&mtx4);
    return NULL;
}
/**
 * Demo driver: installs the hooks, starts the detector thread, then
 * creates worker1..worker4 in order and joins them in the same order.
 */
int main() {
    init_hook();
    checkDeadlock();

    void *(*const entry[])(void *) = { worker1, worker2, worker3, worker4 };
    pthread_t tids[sizeof entry / sizeof entry[0]];
    const int count = (int)(sizeof entry / sizeof entry[0]);

    for (int n = 0; n < count; ++n) {
        pthread_create(&tids[n], NULL, entry[n], NULL);
    }
    for (int n = 0; n < count; ++n) {
        pthread_join(tids[n], NULL);
    }
    return 0;
}
运行结果
可以看到线程4想要对mtx1加锁,线程1想要对mtx2加锁,线程2想要对mtx3加锁,线程3想要对mtx4加锁,形成一个环。
由此证明有死锁。