在项目中规划使用到了zookeeper当做分布式锁和服务器间session共享,在预研的过程中有个问题,当使用zookeeper提供客户端API连接上zookeeper服务器之后,将zookeeper服务器直接关闭有可能会导致客户端CPU占用率达到满负荷,所以就单步调试跟了进去,发现问题出处在mt_adaptor.c的425行上。
函数代码清单:
#ifdef WIN32
unsigned __stdcall do_io( void * v)
#else
void *do_io(void *v)
#endif
{
zhandle_t *zh = (zhandle_t*)v;
#ifndef WIN32
struct pollfd fds[2];
struct adaptor_threads *adaptor_threads = zh->adaptor_priv;
api_prolog(zh);
notify_thread_ready(zh);
LOG_DEBUG(("started IO thread"));
fds[0].fd=adaptor_threads->self_pipe[0];
fds[0].events=POLLIN;
while(!zh->close_requested) {
struct timeval tv;
int fd;
int interest;
int timeout;
int maxfd=1;
int rc;
zookeeper_interest(zh, &fd, &interest, &tv);
if (fd != -1) {
fds[1].fd=fd;
fds[1].events=(interest&ZOOKEEPER_READ)?POLLIN:0;
fds[1].events|=(interest&ZOOKEEPER_WRITE)?POLLOUT:0;
maxfd=2;
}
timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
poll(fds,maxfd,timeout);
if (fd != -1) {
interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
interest|=((fds[1].revents&POLLOUT)||(fds[1].revents&POLLHUP))?ZOOKEEPER_WRITE:0;
}
if(fds[0].revents&POLLIN){
// flush the pipe
char b[128];
while(read(adaptor_threads->self_pipe[0],b,sizeof(b))==sizeof(b)){}
}
#else
fd_set rfds, wfds, efds;
struct adaptor_threads *adaptor_threads = zh->adaptor_priv;
api_prolog(zh);
notify_thread_ready(zh);
LOG_DEBUG(("started IO thread"));
FD_ZERO(&rfds); FD_ZERO(&wfds); FD_ZERO(&efds);
while(!zh->close_requested) {
struct timeval tv;
SOCKET fd;
SOCKET maxfd=adaptor_threads->self_pipe[0];
int interest;
int rc;
zookeeper_interest(zh, &fd, &interest, &tv);
if (fd != -1) {
if (interest&ZOOKEEPER_READ) {
FD_SET(fd, &rfds);
} else {
FD_CLR(fd, &rfds);
}
if (interest&ZOOKEEPER_WRITE) {
FD_SET(fd, &wfds);
} else {
FD_CLR(fd, &wfds);
}
}
FD_SET( adaptor_threads->self_pipe[0] ,&rfds );
rc = select((int)maxfd, &rfds, &wfds, &efds, &tv);
if (fd != -1)
{
interest = (FD_ISSET(fd, &rfds))? ZOOKEEPER_READ:0;
interest|= (FD_ISSET(fd, &wfds))? ZOOKEEPER_WRITE:0;
}
if (FD_ISSET(adaptor_threads->self_pipe[0], &rfds)){
// flush the pipe/socket
char b[128];
int read_count = 0;
while((read_count = recv(adaptor_threads->self_pipe[0],b,sizeof(b), 0))==sizeof(b)){}
}
#endif
// dispatch zookeeper events
rc = zookeeper_process(zh, interest);
// check the current state of the zhandle and terminate
// if it is_unrecoverable()
if(is_unrecoverable(zh))
break;
}
api_epilog(zh, 0);
LOG_DEBUG(("IO thread terminated"));
return 0;
}
问题在这条代码上
rc = select((int)maxfd, &rfds, &wfds, &efds, &tv);
由于rfds、wfds和efds都已经失效,因此select不会被阻塞,就相当进入了一个死循环,才导致CPU过高的。因此要在socket失效后,将rfds、wfds和efds置0,可防止此类问题出现。
增加代码:
if (read_count == -1)
{
LOG_ERROR(("由于通讯失效,重置检查描述字集."));
FD_ZERO(&rfds);
FD_ZERO(&wfds);
FD_ZERO(&efds);
}
修改后的代码清单:
#ifdef WIN32
unsigned __stdcall do_io( void * v)
#else
void *do_io(void *v)
#endif
{
zhandle_t *zh = (zhandle_t*)v;
#ifndef WIN32
struct pollfd fds[2];
struct adaptor_threads *adaptor_threads = zh->adaptor_priv;
api_prolog(zh);
notify_thread_ready(zh);
LOG_DEBUG(("started IO thread"));
fds[0].fd=adaptor_threads->self_pipe[0];
fds[0].events=POLLIN;
while(!zh->close_requested) {
struct timeval tv;
int fd;
int interest;
int timeout;
int maxfd=1;
int rc;
zookeeper_interest(zh, &fd, &interest, &tv);
if (fd != -1) {
fds[1].fd=fd;
fds[1].events=(interest&ZOOKEEPER_READ)?POLLIN:0;
fds[1].events|=(interest&ZOOKEEPER_WRITE)?POLLOUT:0;
maxfd=2;
}
timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
poll(fds,maxfd,timeout);
if (fd != -1) {
interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
interest|=((fds[1].revents&POLLOUT)||(fds[1].revents&POLLHUP))?ZOOKEEPER_WRITE:0;
}
if(fds[0].revents&POLLIN){
// flush the pipe
char b[128];
while(read(adaptor_threads->self_pipe[0],b,sizeof(b))==sizeof(b)){}
}
#else
fd_set rfds, wfds, efds;
struct adaptor_threads *adaptor_threads = zh->adaptor_priv;
api_prolog(zh);
notify_thread_ready(zh);
LOG_DEBUG(("started IO thread"));
FD_ZERO(&rfds); FD_ZERO(&wfds); FD_ZERO(&efds);
while(!zh->close_requested) {
struct timeval tv;
SOCKET fd;
SOCKET maxfd=adaptor_threads->self_pipe[0];
int interest;
int rc;
zookeeper_interest(zh, &fd, &interest, &tv);
if (fd != -1) {
if (interest&ZOOKEEPER_READ) {
FD_SET(fd, &rfds);
} else {
FD_CLR(fd, &rfds);
}
if (interest&ZOOKEEPER_WRITE) {
FD_SET(fd, &wfds);
} else {
FD_CLR(fd, &wfds);
}
}
FD_SET( adaptor_threads->self_pipe[0] ,&rfds );
rc = select((int)maxfd, &rfds, &wfds, &efds, &tv);
if (fd != -1)
{
interest = (FD_ISSET(fd, &rfds))? ZOOKEEPER_READ:0;
interest|= (FD_ISSET(fd, &wfds))? ZOOKEEPER_WRITE:0;
}
if (FD_ISSET(adaptor_threads->self_pipe[0], &rfds)){
// flush the pipe/socket
char b[128];
int read_count = 0;
while((read_count = recv(adaptor_threads->self_pipe[0],b,sizeof(b), 0))==sizeof(b)){}
if (read_count == -1)
{
LOG_ERROR(("由于通讯失效,重置检查描述字集."));
FD_ZERO(&rfds);
FD_ZERO(&wfds);
FD_ZERO(&efds);
}
}
#endif
// dispatch zookeeper events
rc = zookeeper_process(zh, interest);
// check the current state of the zhandle and terminate
// if it is_unrecoverable()
if(is_unrecoverable(zh))
break;
}
api_epilog(zh, 0);
LOG_DEBUG(("IO thread terminated"));
return 0;
}