一.报错信息
2019-05-21 09:47:23.702 [destination = 127_0_0_1-3306 , address = /127.0.0.1:3306 , EventParser] WARN c.a.o.canal.parse.inbound.mysql.dbsync.DirectLogFetcher - Received EOF packet from server, apparent master disconnected. It's may be duplicate slaveId , check instance config
2019-05-21 09:47:36.162 [destination = 127_0_0_1-3306 , address = /127.0.0.1:3306 , EventParser] ERROR c.a.o.c.p.inbound.mysql.rds.RdsBinlogEventParserProxy - dump address localhost/127.0.0.1:3306 has an error, retrying. caused by
com.alibaba.otter.canal.parse.exception.CanalParseException: java.io.IOException: connect localhost/127.0.0.1:3306 failure
Caused by: java.io.IOException: connect localhost/127.0.0.1:3306 failure
at com.alibaba.otter.canal.parse.driver.mysql.MysqlConnector.connect(MysqlConnector.java:77) ~[canal.parse.driver-1.1.2.jar:na]
at com.alibaba.otter.canal.parse.inbound.mysql.MysqlConnection.connect(MysqlConnection.java:88) ~[canal.parse-1.1.2.jar:na]
at com.alibaba.otter.canal.parse.inbound.mysql.MysqlEventParser.preDump(MysqlEventParser.java:86) ~[canal.parse-1.1.2.jar:na]
at com.alibaba.otter.canal.parse.inbound.AbstractEventParser$3.run(AbstractEventParser.java:175) ~[canal.parse-1.1.2.jar:na]
at java.lang.Thread.run(Thread.java:748) [na:1.8.0_181]
Caused by: java.net.ConnectException: Connection refused (Connection refused)
at java.net.PlainSocketImpl.socketConnect(Native Method) ~[na:1.8.0_181]
at java.net.AbstractPlainSocketImpl.doConnect(AbstractPlainSocketImpl.java:350) ~[na:1.8.0_181]
at java.net.AbstractPlainSocketImpl.connectToAddress(AbstractPlainSocketImpl.java:206) ~[na:1.8.0_181]
at java.net.AbstractPlainSocketImpl.connect(AbstractPlainSocketImpl.java:188) ~[na:1.8.0_181]
at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:392) ~[na:1.8.0_181]
at java.net.Socket.connect(Socket.java:589) ~[na:1.8.0_181]
at com.alibaba.otter.canal.parse.driver.mysql.socket.BioSocketChannelPool.open(BioSocketChannelPool.java:18) ~[canal.parse.driver-1.1.2.jar:na]
at com.alibaba.otter.canal.parse.driver.mysql.socket.SocketChannelPool.open(SocketChannelPool.java:18) ~[canal.parse.driver-1.1.2.jar:na]
at com.alibaba.otter.canal.parse.driver.mysql.MysqlConnector.connect(MysqlConnector.java:72) ~[canal.parse.driver-1.1.2.jar:na]
... 4 common frames omitted
二.出错流程
我们先来看一下正常的流程:
核心代码为AbstractEventParser.java中的start()方法:
public void start() {
super.start();
MDC.put("destination", destination);
// 配置transaction buffer
// 初始化缓冲队列
transactionBuffer.setBufferSize(transactionSize);// 设置buffer大小
transactionBuffer.start();
// 构造bin log parser
binlogParser = buildParser();// 初始化一下BinLogParser
binlogParser.start();
// 启动工作线程
parseThread = new Thread(new Runnable() {
public void run() {
MDC.put("destination", String.valueOf(destination));
ErosaConnection erosaConnection = null;
while (running) {
try {
// 开始执行replication
// 1. 构造Erosa连接
erosaConnection = buildErosaConnection();
// 2. 启动一个心跳线程
startHeartBeat(erosaConnection);
// 3. 执行dump前的准备工作
preDump(erosaConnection);
erosaConnection.connect();// 链接
long queryServerId = erosaConnection.queryServerId();
if (queryServerId != 0) {
serverId = queryServerId;
}
// 4. 获取最后的位置信息
long start = System.currentTimeMillis();
logger.warn("---> begin to find start position, it will be long time for reset or first position");
EntryPosition position = findStartPosition(erosaConnection);
final EntryPosition startPosition = position;
if (startPosition == null) {
throw new PositionNotFoundException("can't find start position for " + destination);
}
if (!processTableMeta(startPosition)) {
throw new CanalParseException("can't find init table meta for " + destination
+ " with position : " + startPosition);
}
long end = System.currentTimeMillis();
logger.warn("---> find start position successfully, {}", startPosition.toString() + " cost : "
+ (end - start)
+ "ms , the next step is binlog dump");
// 重新链接,因为在找position过程中可能有状态,需要断开后重建
erosaConnection.reconnect();
final SinkFunction sinkHandler = new SinkFunction<EVENT>() {
private LogPosition lastPosition;
public boolean sink(EVENT event) {
try {
CanalEntry.Entry entry = parseAndProfilingIfNecessary(event, false);
if (!running) {
return false;
}
if (entry != null) {
exception = null; // 有正常数据流过,清空exception
transactionBuffer.add(entry);
// 记录一下对应的positions
this.lastPosition = buildLastPosition(entry);
// 记录一下最后一次有数据的时间
lastEntryTime = System.currentTimeMillis();
}
return running;
} catch (TableIdNotFoundException e) {
throw e;
} catch (Throwable e) {
if (e.getCause() instanceof TableIdNotFoundException) {
throw (TableIdNotFoundException) e.getCause();
}
// 记录一下,出错的位点信息
processSinkError(e,
this.lastPosition,
startPosition.getJournalName(),
startPosition.getPosition());
throw new CanalParseException(e); // 继续抛出异常,让上层统一感知
}
}
};
// 4. 开始dump数据
if (parallel) {
// build stage processor
multiStageCoprocessor = buildMultiStageCoprocessor();
if (isGTIDMode()) {
// 判断所属instance是否启用GTID模式,是的话调用ErosaConnection中GTID对应方法dump数据
GTIDSet gtidSet = MysqlGTIDSet.parse(startPosition.getGtid());
((MysqlMultiStageCoprocessor) multiStageCoprocessor).setGtidSet(gtidSet);
multiStageCoprocessor.start();
erosaConnection.dump(gtidSet, multiStageCoprocessor);
} else {
multiStageCoprocessor.start();
if (StringUtils.isEmpty(startPosition.getJournalName())
&& startPosition.getTimestamp() != null) {
erosaConnection.dump(startPosition.getTimestamp(), multiStageCoprocessor);
} else {
erosaConnection.dump(startPosition.getJournalName(),
startPosition.getPosition(),
multiStageCoprocessor);
}
}
} else {
if (isGTIDMode()) {
// 判断所属instance是否启用GTID模式,是的话调用ErosaConnection中GTID对应方法dump数据
erosaConnection.dump(MysqlGTIDSet.parse(startPosition.getGtid()), sinkHandler);
} else {
if (StringUtils.isEmpty(startPosition.getJournalName())
&& startPosition.getTimestamp() != null) {
erosaConnection.dump(startPosition.getTimestamp(), sinkHandler);
} else {
erosaConnection.dump(startPosition.getJournalName(),
startPosition.getPosition(),
sinkHandler);
}
}
}
} catch (TableIdNotFoundException e) {
exception = e;
// 特殊处理TableIdNotFound异常,出现这样的异常,一种可能就是起始的position是一个事务当中,导致tablemap
// Event时间没解析过
needTransactionPosition.compareAndSet(false, true);
logger.error(String.format("dump address %s has an error, retrying. caused by ",
runningInfo.getAddress().toString()), e);
} catch (Throwable e) {
processDumpError(e);
exception = e;
if (!running) {
if (!(e instanceof java.nio.channels.ClosedByInterruptException || e.getCause() instanceof java.nio.channels.ClosedByInterruptException)) {
throw new CanalParseException(String.format("dump address %s has an error, retrying. ",
runningInfo.getAddress().toString()), e);
}
} else {
logger.error(String.format("dump address %s has an error, retrying. caused by ",
runningInfo.getAddress().toString()), e);
sendAlarm(destination, ExceptionUtils.getFullStackTrace(e));
}
if (parserExceptionHandler != null) {
parserExceptionHandler.handle(e);
}
} finally {
// 重新置为中断状态
Thread.interrupted();
// 关闭一下链接
afterDump(erosaConnection);
try {
if (erosaConnection != null) {
erosaConnection.disconnect();
}
} catch (IOException e1) {
if (!running) {
throw new CanalParseException(String.format("disconnect address %s has an error, retrying. ",
runningInfo.getAddress().toString()),
e1);
} else {
logger.error("disconnect address {} has an error, retrying., caused by ",
runningInfo.getAddress().toString(),
e1);
}
}
}
// 出异常了,退出sink消费,释放一下状态
eventSink.interrupt();
transactionBuffer.reset();// 重置一下缓冲队列,重新记录数据
binlogParser.reset();// 重新置位
if (multiStageCoprocessor != null && multiStageCoprocessor.isStart()) {
// 处理 RejectedExecutionException
try {
multiStageCoprocessor.stop();
} catch (Throwable t) {
logger.debug("multi processor rejected:", t);
}
}
if (running) {
// sleep一段时间再进行重试
try {
Thread.sleep(10000 + RandomUtils.nextInt(10000));
} catch (InterruptedException e) {
}
}
}
MDC.remove("destination");
}
});
parseThread.setUncaughtExceptionHandler(handler);
parseThread.setName(String.format("destination = %s , address = %s , EventParser",
destination,
runningInfo == null ? null : runningInfo.getAddress()));
parseThread.start();
}
主要看一下工作线程的步骤:
- 构造Erosa连接
- 启动一个心跳线程
- 执行dump前的准备工作
- 获取最后的位置信息
- 开始dump数据
正常情况会进入第5步dump数据,然后会不停的拉取数据:
public void dump(String binlogfilename, Long binlogPosition, SinkFunction func) throws IOException {
updateSettings();
loadBinlogChecksum();
sendRegisterSlave();
sendBinlogDump(binlogfilename, binlogPosition);
DirectLogFetcher fetcher = new DirectLogFetcher(connector.getReceiveBufferSize());
fetcher.start(connector.getChannel());
LogDecoder decoder = new LogDecoder(LogEvent.UNKNOWN_EVENT, LogEvent.ENUM_END_EVENT);
LogContext context = new LogContext();
context.setFormatDescription(new FormatDescriptionLogEvent(4, binlogChecksum));
while (fetcher.fetch()) {
accumulateReceivedBytes(fetcher.limit());
LogEvent event = null;
event = decoder.decode(fetcher, context);
if (event == null) {
throw new CanalParseException("parse failed");
}
if (!func.sink(event)) {
break;
}
if (event.getSemival() == 1) {
sendSemiAck(context.getLogPosition().getFileName(), context.getLogPosition().getPosition());
}
}
}
我们看到fetcher会不断的fetch数据,当连接中断时,会怎么样?
我们进入代码内部看一下:
public boolean fetch() throws IOException {
try {
// Fetching packet header from input.
if (!fetch0(0, NET_HEADER_SIZE)) {
logger.warn("Reached end of input stream while fetching header");
return false;
}
// Fetching the first packet(may a multi-packet).
// 3字节包长度
int netlen = getUint24(PACKET_LEN_OFFSET);
// 1字节包序号
int netnum = getUint8(PACKET_SEQ_OFFSET);
if (!fetch0(NET_HEADER_SIZE, netlen)) {
logger.warn("Reached end of input stream: packet #" + netnum + ", len = " + netlen);
return false;
}
// Detecting error code.
// 状态码 0:OK,255:ERROR,254:EOF
final int mark = getUint8(NET_HEADER_SIZE);
if (mark != 0) {
if (mark == 255) // error from master
{
// Indicates an error, for example trying to fetch from
// wrong
// binlog position.
position = NET_HEADER_SIZE + 1;
final int errno = getInt16();
String sqlstate = forward(1).getFixString(SQLSTATE_LENGTH);
String errmsg = getFixString(limit - position);
throw new IOException("Received error packet:" + " errno = " + errno + ", sqlstate = " + sqlstate
+ " errmsg = " + errmsg);
} else if (mark == 254) {
// Indicates end of stream. It's not clear when this would
// be sent.
logger.warn("Received EOF packet from server, apparent"
+ " master disconnected. It's may be duplicate slaveId , check instance config");
return false;
} else {
// Should not happen.
throw new IOException("Unexpected response " + mark + " while fetching binlog: packet #" + netnum
+ ", len = " + netlen);
}
}
......
}
从最开始的错误日志可以分析出:返回的mark码为254,fetch方法直接返回了,然后dump方法返回,因为处在while循环,从第一步开始重新构建连接,但是到了第三步dump前的准备工作时会抛异常,我们看一下代码:
protected void preDump(ErosaConnection connection) {
if (!(connection instanceof MysqlConnection)) {
throw new CanalParseException("Unsupported connection type : " + connection.getClass().getSimpleName());
}
if (binlogParser != null && binlogParser instanceof LogEventConvert) {
metaConnection = (MysqlConnection) connection.fork();
try {
metaConnection.connect();
} catch (IOException e) {
throw new CanalParseException(e);
}
......
}
在这个方法中,会重新fork出一个connection,在connection连接的时候,由于mysql连接不上,抛出解析异常。
在工作线程当中捕获异常,并打印出开始的错误日志。