最近接手的一个C/S架构的小程序,据说一直有个小问题:运行一段时间后三个socket中的一个就无法send数据,PRE环境正常,PRD环境才会出现该问题。
前两天又被我遇到了,看了一下代码send的时候对链接状态都做了判断,看了socket状态也是ESTABLISHED。分析原因可能是该socket长时间没数据,然后中间设备超时导致的。
增加keepalive验证了几天,就没再出现问题。
keepalive代码直接用的redis中的源码:
/* Set TCP keep alive option to detect dead peers. The interval option
* is only used for Linux as we are using Linux-specific APIs to set
* the probe send time, interval, and count. */
int anetKeepAlive(char *err, int fd, int interval)
{
int val = 1;
if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &val, sizeof(val)) == -1)
{
anetSetError(err, "setsockopt SO_KEEPALIVE: %s", strerror(errno));
return ANET_ERR;
}
#ifdef __linux__
/* Default settings are more or less garbage, with the keepalive time
* set to 7200 by default on Linux. Modify settings to make the feature
* actually useful. */
/* Send first probe after interval. */
val = interval;
if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &val, sizeof(val)) < 0) {
anetSetError(err, "setsockopt TCP_KEEPIDLE: %s\n", strerror(errno));
return ANET_ERR;
}
/* Send next probes after the specified interval. Note that we set the
* delay as interval / 3, as we send three probes before detecting
* an error (see the next setsockopt call). */
val = interval/3;
if (val == 0) val = 1;
if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &val, sizeof(val)) < 0) {
anetSetError(err, "setsockopt TCP_KEEPINTVL: %s\n", strerror(errno));
return ANET_ERR;
}
/* Consider the socket in error state after three we send three ACK
* probes without getting a reply. */
val = 3;
if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &val, sizeof(val)) < 0) {
anetSetError(err, "setsockopt TCP_KEEPCNT: %s\n", strerror(errno));
return ANET_ERR;
}
#else
((void) interval); /* Avoid unused var warning for non Linux systems. */
#endif
return ANET_OK;
}