背景
线上几亿的数据在回刷的时候容器服务会出现OOM而重启,导致任务中断
内存泄露分析
jmap -histo pid
直方图显示堆中有几十亿个java.lang.StackTraceElement对象,但仅凭直方图看不出它们被谁引用
jmap -dump:format=b,file=heapdump.hprof pid
上面的命令把整个堆内存dump到文件heapdump.hprof中
把dump文件下载到本机(Mac)上,用MAT(Eclipse Memory Analyzer)分析并生成内存泄漏报告,可以看到这些内存全部被com.dianping.cat.message.internal.DefaultMessageManager$Context引用,由此找到了罪魁祸首
原因分析
com.dianping.cat.log4j.Log4j2Appender在打印WARN及以上级别、且带有异常(Exception)的日志时,会调用Cat.logError方法
public class Log4j2Appender extends AbstractAppender {
....
/**
 * Forwards a log event to CAT when its level passes the WARN threshold.
 *
 * <p>Only events whose level satisfies {@code isMoreSpecificThan(Level.WARN)} are handed to
 * {@code logError(LogEvent)}. If anything in that path throws, the exception is dropped when
 * {@code ignoreExceptions()} returns true; otherwise it is rethrown wrapped in an
 * {@code AppenderLoggingException}.
 *
 * @param event the log event delivered to this appender
 */
public void append(LogEvent event) {
    try {
        boolean reportable = event.getLevel().isMoreSpecificThan(Level.WARN);
        if (reportable) {
            this.logError(event);
        }
    } catch (Exception failure) {
        if (this.ignoreExceptions()) {
            // Configured to ignore appender failures: nothing more to do.
            return;
        }
        throw new AppenderLoggingException(failure);
    }
}
....
/**
 * Reports the throwable attached to a log event to CAT.
 *
 * <p>Events without a throwable are ignored. When the event carries a message, its formatted
 * text is passed along with the throwable; otherwise only the throwable is reported.
 *
 * @param event the log event to inspect
 */
private void logError(LogEvent event) {
    Throwable thrown = event.getThrown();
    if (thrown == null) {
        return;
    }

    Message logMessage = event.getMessage();
    if (logMessage == null) {
        Cat.logError(thrown);
        return;
    }
    Cat.logError(logMessage.getFormattedMessage(), thrown);
}
}
Cat类的logError方法最终会调用到DefaultMessageManager.shouldLog方法
public class Cat {
...
/**
 * Logs an error with an accompanying message through the current message producer.
 *
 * <p>Any {@link Exception} thrown while doing so is handed to {@code errorHandler} instead of
 * propagating to the caller.
 *
 * @param message text describing the error
 * @param cause   the throwable to report
 */
public static void logError(String message, Throwable cause) {
    try {
        getProducer().logError(message, cause);
    } catch (Exception failure) {
        errorHandler(failure);
    }
}
...
/**
 * Logs an error through the current message producer.
 *
 * <p>Any {@link Exception} thrown while doing so is handed to {@code errorHandler} instead of
 * propagating to the caller.
 *
 * @param cause the throwable to report
 */
public static void logError(Throwable cause) {
    try {
        getProducer().logError(cause);
    } catch (Exception failure) {
        errorHandler(failure);
    }
}
}
public class DefaultMessageProducer implements MessageProducer {
/**
 * Logs an error and its cause through CAT when CAT is enabled.
 *
 * <p>When CAT is disabled, the cause's stack trace is printed instead.
 *
 * @param message text describing the error
 * @param cause   the throwable to report
 */
public void logError(String message, Throwable cause) {
if (Cat.getManager().isCatEnabled()) {
// shouldLog(cause) gates the logging; the logging body itself is elided in this excerpt.
if (this.shouldLog(cause)) {
....
}
} else {
// CAT is disabled: fall back to printing the stack trace.
cause.printStackTrace();
}
}
/**
 * Asks the message manager whether the given throwable should be logged.
 *
 * <p>Only a {@code DefaultMessageManager} is consulted; with any other manager the answer is
 * always {@code true}.
 *
 * @param e the throwable about to be logged
 * @return the manager's decision, or {@code true} when the manager is not a
 *         {@code DefaultMessageManager}
 */
private boolean shouldLog(Throwable e) {
    if (!(this.m_manager instanceof DefaultMessageManager)) {
        return true;
    }
    DefaultMessageManager defaultManager = (DefaultMessageManager) this.m_manager;
    return defaultManager.shouldLog(e);
}
DefaultMessageManager类通过m_context(ThreadLocal)持有Context,在shouldLog的时候会把异常对象(连同它的堆栈)保存到Context的m_knownExceptions集合里;如果Cat事务一直不关闭,随着异常越来越多,最终就导致了内存溢出
public class DefaultMessageManager {
// Holds one Context per thread; the diamond operator replaces the raw ThreadLocal so the
// initializer no longer relies on an unchecked conversion.
private ThreadLocal<DefaultMessageManager.Context> m_context = new ThreadLocal<>();
/**
 * Decides whether the given throwable should be logged on the current thread.
 *
 * <p>Delegates to the calling thread's {@code Context} when one exists; without a context the
 * answer is always {@code true}.
 *
 * @param e the throwable about to be logged
 * @return the context's decision, or {@code true} when the thread has no context
 */
boolean shouldLog(Throwable e) {
    DefaultMessageManager.Context threadContext = this.m_context.get();
    if (threadContext == null) {
        return true;
    }
    return threadContext.shouldLog(e);
}
class Context {
// The out-of-memory condition is caused by this set: the recorded throwables (and their
// stack traces) are kept here with no size limit.
private Set<Throwable> m_knownExceptions;
/**
 * Creates a context and initializes the set of already-seen throwables to an empty set.
 *
 * <p>The remainder of the constructor body is elided in this excerpt.
 *
 * @param domain    not used in the visible part of the body
 * @param hostName  not used in the visible part of the body
 * @param ipAddress not used in the visible part of the body
 */
public Context(String domain, String hostName, String ipAddress) {
....
this.m_knownExceptions = new HashSet();
}
/**
 * Reports whether the given throwable has not been offered to this context before, and
 * remembers it as a side effect.
 *
 * @param e the throwable about to be logged
 * @return {@code true} the first time {@code e} is offered, {@code false} on repeats
 */
public boolean shouldLog(Throwable e) {
    if (this.m_knownExceptions == null) {
        this.m_knownExceptions = new HashSet<>();
    }
    // Set.add returns true only when the element was not already present, which is exactly
    // the "first time seen" answer. There is no size limit here: every new throwable is added
    // to the set and nothing in this method removes entries; this should be optimized.
    return this.m_knownExceptions.add(e);
}
}
}
解决方案
手动调用Cat.getManager().reset()方法,清空当前线程Context中保存的异常堆栈信息
/**
 * Resets the calling thread's context, releasing the throwables it has recorded.
 *
 * <p>Does nothing when the thread has no context. When the context's
 * {@code m_totalDurationInMicros} is zero, its stack and known-exception set are cleared and
 * the context is removed from the thread-local; otherwise only the known-exception set is
 * cleared.
 */
public void reset() {
    DefaultMessageManager.Context threadContext = this.m_context.get();
    if (threadContext == null) {
        return;
    }

    if (threadContext.m_totalDurationInMicros == 0L) {
        threadContext.m_stack.clear();
        threadContext.m_knownExceptions.clear();
        this.m_context.remove();
        return;
    }

    // This releases the recorded error stack-trace information.
    threadContext.m_knownExceptions.clear();
}