在监护进程中创建一个 socket,负责监听来自客户端的消息:
/*
 * Create the monitor's listening UNIX-domain stream socket, bound to the
 * path UNIX_DOMAIN.
 *
 * Returns the listening descriptor on success. Returns -1 on failure; the
 * failing step is reported through perror() and any descriptor opened here
 * is closed before returning.
 *
 * NOTE(review): bind() fails if a socket file already exists at UNIX_DOMAIN
 * (e.g. left over from a previous run). Whether the path should be unlinked
 * first depends on how single-instance protection is handled elsewhere --
 * confirm before adding it.
 */
int init_socket(void )
{
    struct sockaddr_un server_sockaddr;
    int ser_sockfd;
    int backlog = 5;

    ser_sockfd = socket( AF_UNIX, SOCK_STREAM, 0 );
    if( -1 == ser_sockfd )
    {
        perror("Socket");
        return -1;
    }

    /*
     * Zero the whole address first: strncpy() is limited to size-1 bytes, so
     * the final byte of sun_path is only guaranteed to be a NUL terminator
     * if the structure was cleared beforehand.
     */
    memset( &server_sockaddr, 0, sizeof(server_sockaddr) );
    server_sockaddr.sun_family = AF_UNIX;
    strncpy( server_sockaddr.sun_path, UNIX_DOMAIN, sizeof(server_sockaddr.sun_path) - 1 );

    if( -1 == bind( ser_sockfd, ( struct sockaddr * )&server_sockaddr, sizeof(server_sockaddr) ) )
    {
        perror("Bind");
        close( ser_sockfd );    /* do not leak the descriptor on failure */
        return -1;
    }

    if( -1 == listen( ser_sockfd, backlog ) )
    {
        perror("Listen");
        close( ser_sockfd );    /* do not leak the descriptor on failure */
        return -1;
    }

    printf("SERVER::Server is waitting on socket=%d \n",ser_sockfd);
    return ser_sockfd;
}
在被监护的进程中创建一个 client 线程,负责向监护进程的 socket 发送消息:
/*
 * Connect the monitored process to the monitor's UNIX-domain socket
 * (UNIX_DOMAIN) and announce itself.
 *
 * Resets the global `client` record, opens a stream socket, connects, writes
 * one "Send to Server:MAC:..." line, waits one second and then calls
 * write_msg(TELL_SERVER_START_VOIP).
 *
 * Returns 0 once connected (the announcement write is best-effort), or -1 if
 * the socket could not be created or connected; in that case
 * client.connect_status is set to -1.
 */
int client_main(void)
{
    struct sockaddr_un srv_addr;
    char send_buf[132], mac_addr[32];
    int ret;

    memset((char *)&client, 0, sizeof(client_info_t));

    /* create client socket */
    client.sock_fd = socket(PF_UNIX, SOCK_STREAM, 0);
    client.cmd = TELL_SERVER_TICK_VOIP;
    if(client.sock_fd < 0)
    {
        client.connect_status = -1;
        perror("client create socket failed");
        return -1;
    }

    /*
     * Set the server address. The structure is zeroed and the copy bounded
     * so an over-long UNIX_DOMAIN cannot overflow sun_path and the path is
     * always NUL-terminated.
     */
    memset(&srv_addr, 0, sizeof(srv_addr));
    srv_addr.sun_family = AF_UNIX;
    strncpy(srv_addr.sun_path, UNIX_DOMAIN, sizeof(srv_addr.sun_path) - 1);

    /* connect to server */
    ret = connect(client.sock_fd, (struct sockaddr*)&srv_addr, sizeof(srv_addr));
    if(ret == -1)
    {
        client.connect_status = -1;
        perror("connect to server failed!");
        close(client.sock_fd);
        /* Invalidate the stored descriptor so it cannot be reused after close. */
        client.sock_fd = -1;
        /*
         * NOTE(review): this removes the server's socket path from the client
         * side. Kept as in the original; confirm it is intentional, since
         * later connection attempts cannot find the path until the server
         * binds again.
         */
        unlink(UNIX_DOMAIN);
        return -1;
    }
    client.connect_status = 1;

    memset(send_buf, 0, sizeof(send_buf));
    memset(mac_addr, 0, sizeof(mac_addr));

    /* send message to server */
    GetNetIfEthaddr(mac_addr);
    /* The helper's contract is not visible here; force termination before using %s. */
    mac_addr[sizeof(mac_addr) - 1] = '\0';
    snprintf(send_buf, sizeof(send_buf), "Send to Server:MAC:%s\r\n", mac_addr);
    if(write(client.sock_fd, send_buf, strlen(send_buf)) < 0)
    {
        /* Best-effort announcement: report the failure but carry on as before. */
        perror("client write to server failed");
    }

    /* NOTE(review): presumably gives the server time to read the line above -- confirm. */
    sleep(1);
    write_msg(TELL_SERVER_START_VOIP);
    return 0;
}
监护进程通过 select 函数循环处理客户端发来的消息,每次最长等待 6 秒;如果 6 秒内仍没有消息,则记录一次超时,然后进入下一次循环。当超时累计达到一定次数时,则认为被监护进程已经死掉,或者进入了死循环(client 线程得不到运行机会)。此时根据 /var/run 中的 pid 文件,到 /proc/ 中检查该进程是否仍然存在,以判断进程是死掉了还是进入了死循环,进而重启进程或者重启系统。
监护进程除了防止被监护进程死掉或陷入死循环之外,还负责喂狗。每次循环在等待并处理客户端消息之前,都会喂一次狗。这是为了应对监护进程自身死掉、内核崩溃或内核进程阻塞等情况:一旦发生上述任一情况,监护进程就不再喂狗,系统最终会被看门狗重启。
综上,系统整体的监护机制是:监护进程监护系统主进程,看门狗则监护“监护进程”和系统内核,以此保障系统的稳定。
监护进程消息处理部分的源码如下:
while (1)
{
tv.tv_sec = 6;
tv.tv_usec = 0;
catch_fd_set=watchset;
watchdog_set(watchdog_fd);
rcd = select( maxfd+1, &catch_fd_set, NULL, NULL, (struct timeval *)&tv ) ;
switch(rcd)
{
case -1:
{
printf("SERVER::Server 5 \n");
exit(1);
}
case 0:
{
//process dead
if(0 > get_pid_status(voip_pidfile))
{
time_out_count++;
if(time_out_count > 3) //3*6
{
time_out_count = 0;
printf("No voip Timeout\n");
restart_app_detect((client_info_t*)&client,1);
}
}
//process dead loop
else
{
timeout_no_ack++;
if(timeout_no_ack > 20) //20*6
{
printf("voip Infinite loop \n");
timeout_no_ack = 0;
restart_app_detect((client_info_t*)&client,1);
}
}
continue;
}
default:
{
break;
}
}
}