最近有个Linux客户使用我们开发的Database功能, 数据会从一台机器自动备份到另外一台机器,启动没问题但是一同步就报RPC问题, 第一感觉是防火墙问题,不过客户死活不认同。
遇到网络问题一般怎么定位呢?
1. 通过tcpdump抓包。
发现无缘无故收到一个RST包,连接就被中断了。
2. 检查防火墙,查看iptables/firewall rules.
客户主动发了rules过来,没有异常。
怎么办? 为了证明这个问题与我们的产品没关系,写了一个非常简单的socket通信程序模拟replication机器之间的正常通信, 这样的通信如果被中断了就和业务无关了吧? 最后确实是客户网络的问题:
“We had a webex with the customer and the issue was in the vmware hypervisor firewall, the subscribing machine was not part of the same firewall within the hypervisor as the publisher. Once they were within the same group and the rules, the test program and then subsequently replication worked.”
简单的通信程序如下:
/*************************************************************************
client
************************************************************************/
#include<stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
/*
 * Report a fatal error and terminate the client.
 *
 * Prints msg followed by the description of the current errno (via perror)
 * and exits with a failure status, so that shells and scripts driving this
 * test program can tell that the run did not succeed.
 */
void error(const char *msg)
{
    perror(msg);
    exit(EXIT_FAILURE);
}
/*
 * NOTE(review): this table is not referenced by any client code in this
 * listing; its purpose is not evident from here. Confirm before removing.
 */
int bytes[]={41,36,68,48,220,48,56,80};
/*
 * Write exactly len bytes from buf to fd.
 *
 * Retries after short writes and after EINTR. Returns 0 once everything has
 * been written, or -1 on failure with errno left as set by write().
 */
static int write_full(int fd, const char *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = write(fd, buf + done, len - done);

        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        done += (size_t)n;
    }
    return 0;
}

/*
 * Read from fd until len bytes have arrived, the peer closes, or an error
 * occurs.
 *
 * TCP is a byte stream, so a single read() may return only part of a
 * message; looping avoids reporting a healthy connection as broken.
 * Returns the number of bytes stored in buf (fewer than len means the peer
 * closed the connection), or -1 on error with errno set by read().
 */
static ssize_t read_full(int fd, char *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = read(fd, buf + done, len - done);

        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        if (n == 0) {
            break; /* orderly shutdown by the peer */
        }
        done += (size_t)n;
    }
    return (ssize_t)done;
}

/*
 * Test client: connects to "hostname port" and then, every few seconds,
 * sends a 36-byte message of 'Y' characters and waits for a 36-byte reply.
 *
 * The loop runs until the connection fails; any failure is reported with
 * the errno that caused it and the process exits with a failure status.
 *
 * Usage: client hostname port
 */
int main(int argc, char *argv[])
{
    enum { MSG_LEN = 36, BUF_LEN = 256, PAUSE_SECONDS = 4 };
    int sockfd;
    long portno;
    char *end = NULL;
    struct sockaddr_in serv_addr;
    struct hostent *server;
    char send_buffer[BUF_LEN];
    char read_buffer[BUF_LEN];

    if (argc < 3) {
        fprintf(stderr,"usage %s hostname port\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* strtol instead of atoi so that garbage and out-of-range ports are rejected. */
    errno = 0;
    portno = strtol(argv[2], &end, 10);
    if (errno != 0 || end == argv[2] || *end != '\0' ||
        portno < 1 || portno > 65535) {
        fprintf(stderr, "ERROR, invalid port '%s'\n", argv[2]);
        exit(EXIT_FAILURE);
    }

    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        error("ERROR opening socket");
    }

    server = gethostbyname(argv[1]);
    if (server == NULL) {
        fprintf(stderr,"ERROR, no such host\n");
        exit(EXIT_FAILURE);
    }
    /* Never copy more address bytes than sin_addr can hold. */
    if (server->h_addrtype != AF_INET ||
        server->h_length < 0 ||
        (size_t)server->h_length > sizeof serv_addr.sin_addr ||
        server->h_addr_list[0] == NULL) {
        fprintf(stderr, "ERROR, host has no usable IPv4 address\n");
        exit(EXIT_FAILURE);
    }

    memset(&serv_addr, 0, sizeof serv_addr);
    serv_addr.sin_family = AF_INET;
    /* h_addr_list[0] is the first address; h_addr is only a legacy alias for it. */
    memcpy(&serv_addr.sin_addr, server->h_addr_list[0], (size_t)server->h_length);
    serv_addr.sin_port = htons((unsigned short)portno);

    if (connect(sockfd, (struct sockaddr *)&serv_addr, sizeof serv_addr) < 0) {
        error("ERROR connecting");
    }

    /* Runs until the connection breaks; every exit path is inside the loop. */
    for (;;) {
        ssize_t got;

        /* Zero the whole buffer first so it can be printed as a C string. */
        memset(send_buffer, 0, sizeof send_buffer);
        memset(send_buffer, 'Y', MSG_LEN);
        printf("try to send message: %s\n", send_buffer);
        if (write_full(sockfd, send_buffer, MSG_LEN) < 0) {
            int saved = errno; /* printf may overwrite errno */

            printf("write error! errno=%d (%s)\n", saved, strerror(saved));
            close(sockfd);
            exit(EXIT_FAILURE);
        }

        memset(read_buffer, 0, sizeof read_buffer);
        got = read_full(sockfd, read_buffer, MSG_LEN);
        if (got < 0) {
            int saved = errno;

            printf("read error! errno=%d (%s)\n", saved, strerror(saved));
            close(sockfd);
            exit(EXIT_FAILURE);
        }
        printf("got message=%s\n", read_buffer);
        if (got != MSG_LEN) {
            /* Short count here means EOF, not an errno-style failure. */
            printf("read error! expect %d bytes, but got %zd: connection closed by peer\n",
                   (int)MSG_LEN, got);
            close(sockfd);
            exit(EXIT_FAILURE);
        }

        sleep(PAUSE_SECONDS);
    }
}
/*************************************************************************
server
************************************************************************/
#include<stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
/*
 * Fatal-error helper for the server: prints msg together with the text for
 * the current errno (through perror) and ends the process with exit code 1.
 */
void error(const char *msg)
{
    perror(msg);
    exit(1);
}
/*
 * NOTE(review): this table is not referenced by any server code in this
 * listing; its purpose is not evident from here. Confirm before removing.
 */
int bytes[]={41,36,68,48,220,48,56,80};
/*
 * Write exactly len bytes from buf to fd.
 *
 * Retries after short writes and after EINTR. Returns 0 once everything has
 * been written, or -1 on failure with errno left as set by write().
 */
static int write_full(int fd, const char *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = write(fd, buf + done, len - done);

        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        done += (size_t)n;
    }
    return 0;
}

/*
 * Read from fd until len bytes have arrived, the peer closes, or an error
 * occurs.
 *
 * TCP is a byte stream, so a single read() may return only part of a
 * message; looping avoids reporting a healthy connection as broken.
 * Returns the number of bytes stored in buf (fewer than len means the peer
 * closed the connection), or -1 on error with errno set by read().
 */
static ssize_t read_full(int fd, char *buf, size_t len)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = read(fd, buf + done, len - done);

        if (n < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        if (n == 0) {
            break; /* orderly shutdown by the peer */
        }
        done += (size_t)n;
    }
    return (ssize_t)done;
}

/*
 * Test server: listens on the given TCP port, accepts one connection and
 * then, in a loop, waits for a 36-byte message and answers with a 36-byte
 * message of 'Z' characters, pausing a few seconds between rounds.
 *
 * The loop runs until the connection fails; any failure is reported with
 * the errno that caused it and the process exits with a failure status.
 *
 * Usage: server port
 */
int main(int argc, char *argv[])
{
    enum { MSG_LEN = 36, BUF_LEN = 256, PAUSE_SECONDS = 4 };
    int sockfd, newsockfd;
    long portno;
    char *end = NULL;
    socklen_t clilen;
    char send_buffer[BUF_LEN];
    char read_buffer[BUF_LEN];
    struct sockaddr_in serv_addr, cli_addr;

    if (argc < 2) {
        fprintf(stderr,"ERROR, no port provided\n");
        exit(1);
    }

    /* strtol instead of atoi so that garbage and out-of-range ports are rejected. */
    errno = 0;
    portno = strtol(argv[1], &end, 10);
    if (errno != 0 || end == argv[1] || *end != '\0' ||
        portno < 1 || portno > 65535) {
        fprintf(stderr, "ERROR, invalid port '%s'\n", argv[1]);
        exit(1);
    }

    printf("Try to create socket\n");
    sockfd = socket(AF_INET, SOCK_STREAM, 0);
    if (sockfd < 0) {
        error("ERROR opening socket");
    }

    memset(&serv_addr, 0, sizeof serv_addr);
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
    serv_addr.sin_port = htons((unsigned short)portno);
    if (bind(sockfd, (struct sockaddr *)&serv_addr, sizeof serv_addr) < 0) {
        error("ERROR on binding");
    }
    printf("bind successfully\n");

    if (listen(sockfd, 1) < 0) {
        error("ERROR on listen");
    }
    printf("listening\n");

    clilen = sizeof cli_addr;
    newsockfd = accept(sockfd, (struct sockaddr *)&cli_addr, &clilen);
    if (newsockfd < 0) {
        error("ERROR on accept");
    }
    printf("accept new connection\n");

    /* Runs until the connection breaks; every exit path is inside the loop. */
    for (;;) {
        ssize_t got;

        /* Zero the whole buffer first so it can be printed as a C string. */
        memset(read_buffer, 0, sizeof read_buffer);
        got = read_full(newsockfd, read_buffer, MSG_LEN);
        if (got < 0) {
            int saved = errno; /* printf may overwrite errno */

            printf("read error! errno=%d (%s)\n", saved, strerror(saved));
            close(newsockfd);
            close(sockfd);
            exit(1);
        }
        printf("got message=%s\n", read_buffer);
        if (got != MSG_LEN) {
            /* Short count here means EOF, not an errno-style failure. */
            printf("read error! expect %d bytes, but got %zd: connection closed by peer\n",
                   (int)MSG_LEN, got);
            close(newsockfd);
            close(sockfd);
            exit(1);
        }

        memset(send_buffer, 0, sizeof send_buffer);
        memset(send_buffer, 'Z', MSG_LEN);
        printf("try to send message: %s\n",send_buffer);
        if (write_full(newsockfd, send_buffer, MSG_LEN) < 0) {
            int saved = errno;

            printf("write error! errno=%d (%s)\n", saved, strerror(saved));
            close(newsockfd);
            close(sockfd);
            exit(1);
        }

        sleep(PAUSE_SECONDS);
    }
}

本文介绍了一种解决Linux环境下数据库复制过程中遇到的RPC问题的方法。通过编写简单的socket通信程序来排除业务代码故障,最终定位问题在于VMware虚拟机防火墙配置。此案例展示了网络问题排查的有效步骤。

被折叠的 条评论
为什么被折叠?



