在程序设计中，watchdog 主要用于监控程序或系统的状态，或者每隔一定时间触发某种操作。Bacula 中的 watchdog 在 dird 中用于监控用户是否应该下线
（即用户长时间没有任何操作时，应被强制下线，避免占用过多的服务器资源）
以下是watchdog.h
（在 Bacula 原有代码的基础上改动了部分函数，设计思路未变，便于独立分析）
#ifndef _WATCHDOG_H
#define _WATCHDOG_H
/*
 * One watchdog entry: a callback that is run when its next_fire time
 * has been reached, either once or repeatedly every interval.
 */
typedef struct s_watchdog_t { // watchdog descriptor
bool one_shot; // whether the watchdog fires only once
time_t interval;// interval between firings (added to a time() value when scheduling)
time_t next_fire;// time of the next firing
void (*callback) (struct s_watchdog_t *wd);// function called each time the watchdog fires
void (*destructor)(struct s_watchdog_t *wd); // function called when the watchdog is torn down
void *data; // extra data kept for the callbacks
Link link; // list linkage, so that all watchdog entries can be chained into one list
}watchdog_t;
bool watchdog_start(void); // initialize the watchdog subsystem (global function)
void watchdog_ping(); // signal the watchdog thread; to be called whenever the watchdog list changes so that due entries are re-checked and run
bool watchdog_stop(void) ; // shut the watchdog subsystem down
watchdog_t *watchdog_init(bool oneshot,time_t internal_,void (*callback)(watchdog_t *),void (*destructor)(watchdog_t *),void *data) ; // set up one watchdog entry
bool watchdog_register(watchdog_t *wd) ;// register a watchdog entry in the watchdog list
bool watchdog_unregister(watchdog_t *wd); // remove a watchdog entry from the lists
void *watchdog_thread(void *arg); // body of the watchdog thread (runs as a new thread)
#endif
watchdog.c 文件,主要的实现都在此文件中
#include "head.h"
/* Time bookkeeping (file-local: both variables are static, not exported) */
static time_t watchdog_time = 0; /* time() sampled at the start of each queue walk */
static time_t watchdog_sleep_time = 60; /* longest sleep between walks: examine things at least every 60 seconds */
/* Condition variable, and the mutex it is used with, for waking the watchdog thread early */
static pthread_mutex_t timer_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t timer = PTHREAD_COND_INITIALIZER;
/* Static globals */
static bool quit = FALSE; /* set by watchdog_stop() to make the watchdog thread leave its loop */
static bool wd_is_init = FALSE; /* TRUE between a successful watchdog_start() and watchdog_stop() */
static pthread_mutex_t wd_lock = PTHREAD_MUTEX_INITIALIZER;/* watchdog lock: taken around list accesses in init, register, unregister and the thread */
static pthread_t wd_tid; /* id of the watchdog thread, joined by watchdog_stop() */
static List *wd_queue; /* registered (active) watchdogs */
static List *wd_inactive; /* one-shot watchdogs that have fired; watchdog_init() reuses entries from here */
/*
* watchdog init
*
* Returns: TRUE on success
* FALSE on failure
*/
bool watchdog_start(void) {
watchdog_t *dummy = NULL;
if (wd_is_init) {
return TRUE;
}
watchdog_time = time(NULL);
wd_queue = list_init(dummy,&dummy->link);
wd_inactive =list_init(dummy,&dummy->link);;
if (pthread_create(&wd_tid, NULL, watchdog_thread, NULL) != 0) {
return FALSE;
}
wd_is_init = TRUE;
return TRUE;
}
/*
 * Wake the watchdog timer thread so that it walks the
 * queue and adjusts its wait time (or exits).
 *
 * The condition variable is signalled between P() and V() on
 * timer_mutex, the same mutex the watchdog thread passes to
 * pthread_cond_timedwait().
 */
void watchdog_ping() {
P(&timer_mutex);
pthread_cond_signal(&timer);
V(&timer_mutex);
}
/*
* Terminate the watchdog thread
*
* Returns: TURE on success
* FALSE on failure
*/
bool watchdog_stop(void)
{
watchdog_t *p;
if (!wd_is_init) {
return TRUE;
}
quit = TRUE; /* notify watchdog thread to stop */
wd_is_init = FALSE;
watchdog_ping();
pthread_join(wd_tid, NULL);
while (!list_empty(wd_queue)) {
void *item = list_first(wd_queue);
list_remove(wd_queue,item);
p = (watchdog_t *) item;
if (p->destructor != NULL) {
p->destructor(p); /* make operation at end */
}
free(p);
}
list_destroy(wd_queue);
wd_queue = NULL;
while (!list_empty(wd_inactive)) {
void *item = list_first(wd_inactive);
list_remove(wd_inactive,item);
p = (watchdog_t *) item;
if (p->destructor != NULL) {
p->destructor(p);
}
free(p);
}
list_destroy(wd_inactive);
wd_inactive = NULL;
return TRUE;
}
/*
 * Prepare a watchdog entry.  The entry is not scheduled here; that
 * happens in watchdog_register().
 *
 * The watchdog subsystem is started first if that has not happened yet.
 * An entry taken from the inactive list is reused when one is
 * available; otherwise a new one is allocated with malloc().
 *
 * oneshot     TRUE to fire once only, FALSE to fire every interval
 * internal_   interval between firings (added to time() when scheduling)
 * callback    function invoked each time the watchdog fires
 * destructor  function invoked on the entry by watchdog_stop(); may be NULL
 * data        pointer stored in the entry for use by the callbacks
 *
 * Returns: the initialised entry, or NULL if the subsystem could not be
 *          started or memory could not be allocated
 */
watchdog_t *watchdog_init(bool oneshot,time_t internal_,void (*callback)(watchdog_t *wd),void (*destructor)(watchdog_t *wd),void *data)
{
    watchdog_t *wd = NULL;

    /* When the start fails the lists do not exist, so the lookup in
     * wd_inactive below must not be reached. */
    if (!wd_is_init && !watchdog_start()) {
        return NULL;
    }

    /* Try to reuse an entry from the inactive list. */
    P(&wd_lock);
    wd = (watchdog_t *)list_first(wd_inactive);
    if (wd != NULL) {
        list_remove(wd_inactive, wd);
    }
    V(&wd_lock);

    /* Nothing to reuse: allocate a fresh entry. */
    if (wd == NULL) {
        wd = (watchdog_t *) malloc(sizeof *wd);
        if (wd == NULL) {
            return NULL;
        }
    }

    wd->one_shot = oneshot;
    wd->interval = internal_;
    wd->next_fire = 0;          /* set for real by watchdog_register() */
    wd->callback = callback;
    wd->destructor = destructor;
    wd->data = data;
    return wd;
}
/*
 * Put a watchdog on the active queue, schedule its first firing one
 * interval from now, and wake the watchdog thread.
 *
 * Returns: TRUE once the watchdog is queued
 *          FALSE if the subsystem is not started, wd is NULL, the
 *          callback is NULL, or the interval is zero
 */
bool watchdog_register(watchdog_t *wd) {
    if (!wd_is_init) {
        printf("BUG! register_watchdog called before start_watchdog\n");
        return FALSE;
    }
    if (wd == NULL) {
        /* e.g. the result of a failed watchdog_init() passed straight on */
        printf("BUG! Watchdog is NULL\n");
        return FALSE;
    }
    if (wd->callback == NULL) {
        printf ("BUG! Watchdog has NULL callback\n");
        return FALSE;
    }
    if (wd->interval == 0) {
        printf("BUG! Watchdog has zero interval\n");
        /* A recurring watchdog with a zero interval is always due, so the
         * watchdog thread would never sleep; reject it like the other
         * misuse cases instead of queueing it. */
        return FALSE;
    }

    P(&wd_lock);
    wd->next_fire = time(NULL) + wd->interval;
    list_append(wd_queue, wd);
    V(&wd_lock);

    /* Let the watchdog thread recompute its wait time. */
    watchdog_ping();
    return TRUE;
}
/*
 * Unlink a watchdog from whichever list it is on (the active queue is
 * searched first, then the inactive list) and wake the watchdog thread.
 *
 * The entry is only unlinked: it is not freed and its destructor is not
 * run here.
 *
 * Returns: TRUE if wd was found on one of the lists and removed
 *          FALSE if it was on neither list or the subsystem is not started
 */
bool watchdog_unregister(watchdog_t *wd) {
    watchdog_t *p;
    bool ok = FALSE;

    if (!wd_is_init) {
        /* The message used to name register_watchdog (copy-paste slip). */
        printf("BUG! unregister_watchdog called before start_watchdog\n");
        return FALSE;
    }

    P(&wd_lock);
    foreach_list(p, wd_queue)
    {
        if (wd == p) {
            list_remove(wd_queue, wd);
            ok = TRUE;
            goto get_out;
        }
    }
    foreach_list(p, wd_inactive) {
        if (wd == p) {
            list_remove(wd_inactive, wd);
            ok = TRUE;
            goto get_out;
        }
    }
get_out:
    V(&wd_lock);

    /* Pinged whether or not the entry was found. */
    watchdog_ping();
    return ok;
}
/*
 * This is the thread that walks the watchdog queue
 * and when a queue item fires, the callback is
 * invoked. If it is a one shot, the queue item
 * is moved to the inactive queue.
 *
 * Between walks the thread waits on the timer condition variable until
 * the earliest next_fire found during the walk (at most
 * watchdog_sleep_time seconds ahead) or until watchdog_ping() signals
 * it.  The loop ends once quit is set.
 *
 * arg is unused.  Always returns NULL.
 */
void *watchdog_thread(void *arg)
{
    struct timespec timeout;
    struct timeval tv;
    time_t next_time;
    watchdog_t *p;

    (void) arg;

    while (!quit)
    {
        /*
         * Historical note kept from the code this was derived from: a
         * "jcr chain" lock used to be taken here before the watchdog
         * lock, so that both were always taken in the same order.
         * Nothing in this file takes such a lock.
         *
         * Callbacks run with wd_lock held.
         * NOTE(review): a callback that calls watchdog_init(),
         * watchdog_register() or watchdog_unregister() would take
         * wd_lock a second time on the same thread -- presumably a
         * self-deadlock if P() is a plain mutex lock; confirm.
         */
        P(&wd_lock);
walk_list:
        watchdog_time = time(NULL);
        next_time = watchdog_time + watchdog_sleep_time;
        foreach_list(p, wd_queue)
        {
            if (p->next_fire <= watchdog_time)
            {
                /* Run the callback */
                p->callback(p);
                /* Reschedule (or move to inactive list if it's a one-shot timer) */
                if (p->one_shot) {
                    list_remove(wd_queue, p);
                    list_append(wd_inactive, p);
                    /* The queue changed under the iteration: start the walk over. */
                    goto walk_list;
                }
                else {
                    p->next_fire = watchdog_time + p->interval;
                }
            }
            /* Track the earliest upcoming firing. */
            if (p->next_fire <= next_time) {
                next_time = p->next_fire;
            }
        }
        V(&wd_lock);

        /*
         * Wait sleep time or until someone wakes us.  The timeout is an
         * absolute time: now plus the seconds left until next_time.
         * POSIX leaves a non-NULL timezone argument unspecified, so
         * NULL is passed.
         */
        gettimeofday(&tv, NULL);
        timeout.tv_nsec = tv.tv_usec * 1000;
        timeout.tv_sec = tv.tv_sec + next_time - time(NULL);

        /* Note, the wait unlocks the mutex during the sleep */
        P(&timer_mutex);
        /*
         * watchdog_stop() sets quit and then pings, and the ping takes
         * timer_mutex.  Re-checking quit with timer_mutex held therefore
         * means either quit is seen here, or this thread is already
         * waiting when the signal arrives.  Without the check, a ping
         * landing between the loop test and the wait is lost and
         * watchdog_stop() blocks for the whole timeout.
         * NOTE(review): pings from register/unregister can still be
         * missed in that same window; the new entry is then only seen
         * at the next wake-up.
         */
        if (!quit) {
            pthread_cond_timedwait(&timer, &timer_mutex, &timeout);
        }
        V(&timer_mutex);
    }
    return NULL;
}
测试程序
#define WATCHDOG_TEST
#ifdef WATCHDOG_TEST
/*
 * Test callback: prints the int that wd->data points to, together with
 * the watchdog's interval and next firing time.
 */
void start (watchdog_t *wd)
{
    int i = *(int *)wd->data;

    /* time_t is not guaranteed to be long; cast so the arguments match %ld. */
    printf("thread %d start,internal=%ld,next=%ld\n", i, (long)wd->interval, (long)wd->next_fire);
}
/*
 * Test destructor: reports the int that wd->data points to.
 */
void end (watchdog_t *wd)
{
    const int *id = (const int *)wd->data;

    printf("thread %d end\n", *id);
}
/*
 * Test worker thread: prints the int that arg points to, then creates
 * a recurring watchdog with an interval of 2 and registers it.
 *
 * arg  pointer to an int; it is also stored as the watchdog's data, so
 *      it must stay valid for as long as the watchdog exists
 *
 * Always returns NULL.
 */
void *thread_excute(void * arg)
{
    watchdog_t *wd;

    printf("%d\n", *(int *)arg);

    wd = watchdog_init(FALSE, 2, start, end, arg);
    if (wd != NULL) {
        /* watchdog_init() returns NULL on failure; nothing to register then. */
        watchdog_register(wd);
    }

    /* A thread start routine returns void *, so return a value explicitly. */
    return NULL;
}
/*
 * Test driver: starts the watchdog subsystem, launches five worker
 * threads that each register a watchdog, lets the watchdogs run for 60
 * seconds and then stops the subsystem.
 *
 * Returns 0 on success, 1 if the watchdog subsystem could not be started.
 */
int main()
{
    pthread_t tid[5];
    int a[5]={1,2,3,4,5};   /* must outlive the watchdogs: each holds a pointer into it */
    int created = 0;
    int i;

    if (!watchdog_start()) {
        printf("watchdog_start failed\n");
        return 1;
    }

    for (i = 0; i < 5; i++) {
        if (pthread_create(&tid[i], NULL, thread_excute, (void *)&a[i]) != 0) {
            printf("pthread_create failed for thread %d\n", a[i]);
            break;
        }
        created++;
    }

    /* The workers only register a watchdog and return; join them so
     * that their thread resources are released. */
    for (i = 0; i < created; i++) {
        pthread_join(tid[i], NULL);
    }

    sleep(60);
    watchdog_stop();
    return 0;
}
#endif