java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.spark.unsafe.types.UTF8String.fromAddress(UTF8String.java:102)
at org.apache.spark.sql.catalyst.expressions.UnsafeRow.getUTF8String(UnsafeRow.java:419)
at org.apache.spark.sql.catalyst.expressions.JoinedRow.getUTF8String(JoinedRow.scala:102)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown Source)
at org.apache.spark.sql.execution.joins.SortMergeJoinExec$$anonfun$doExecute$1$$anonfun$1$$anonfun$apply$1.apply(SortMergeJoinExec.scala:114)
at org.apache.spark.sql.execution.joins.SortMergeJoinExec$$anonfun$doExecute$1$$anonfun$1$$anonfun$apply$1.apply(SortMergeJoinExec.scala:114)
at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceBufferUntilBoundConditionSatisfied(SortMergeJoinExec.scala:874)
at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceStream(SortMergeJoinExec.scala:855)
at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceNext(SortMergeJoinExec.scala:881)
at org.apache.spark.sql.execution.RowIteratorToScala.hasNext(RowIterator.scala:68)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.sql.hive.SparkHiveWriterContainer.writeToFile(hiveWriterContainers.scala:184)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
18/11/26 17:24:37 INFO executor.Executor: Not reporting error to driver during JVM shutdown.
18/11/26 17:24:37 ERROR util.SparkUncaughtExceptionHandler: [Container in shutdown] Uncaught exception in thread Thread[Executor task launch worker for task 451,5,main]
java.lang.OutOfMemoryError: GC overhead limit exceeded
at org.apache.spark.unsafe.types.UTF8String.fromAddress(UTF8String.java:102)
at org.apache.spark.sql.catalyst.expressions.UnsafeRow.getUTF8String(UnsafeRow.java:419)
at org.apache.spark.sql.catalyst.expressions.JoinedRow.getUTF8String(JoinedRow.scala:102)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown Source)
at org.apache.spark.sql.execution.joins.SortMergeJoinExec$$anonfun$doExecute$1$$anonfun$1$$anonfun$apply$1.apply(SortMergeJoinExec.scala:114)
at org.apache.spark.sql.execution.joins.SortMergeJoinExec$$anonfun$doExecute$1$$anonfun$1$$anonfun$apply$1.apply(SortMergeJoinExec.scala:114)
at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceBufferUntilBoundConditionSatisfied(SortMergeJoinExec.scala:874)
at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceStream(SortMergeJoinExec.scala:855)
at org.apache.spark.sql.execution.joins.OneSideOuterIterator.advanceNext(SortMergeJoinExec.scala:881)
at org.apache.spark.sql.execution.RowIteratorToScala.hasNext(RowIterator.scala:68)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$class.foreach(Iterator.scala:893)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
at org.apache.spark.sql.hive.SparkHiveWriterContainer.writeToFile(hiveWriterContainers.scala:184)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:99)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
集群节点数:20台
可用内存大小:1024G
每台内核:48核
使用SparkSQL读取数据写入Hive表
spark2-submit \
--class com.lhx.dac.test \
--master yarn \
dac-test.jar
以默认参数执行时,集群使用内存800G,任务报OOM错误(GC overhead limit exceeded)。
查看CM的Hive配置:
--conf spark.yarn.driver.memoryOverhead=2048m \
--conf spark.yarn.executor.memoryOverhead=2048m \
--conf spark.dynamicAllocation.enabled=false \
查看内存和核数比:3:1
调大堆内存以降低GC开销:
调大 spark.executor.memory 参数以扩大Executor堆内存(默认512M,调整为2G)。
修改后Spark执行命令:
spark2-submit \
--class com.lhx.dac.test \
--master yarn \
--deploy-mode cluster \
--driver-memory 6g \
--executor-memory 6g \
--executor-cores 2 \
--conf spark.yarn.driver.memoryOverhead=2048m \
--conf spark.yarn.executor.memoryOverhead=2048m \
dac-test.jar
集群使用内存900多G,成功执行程序。
总结:集群的内存资源是动态分配的,而进程的堆栈内存是静态分配的,需要通过 --driver-memory、--executor-memory 等参数显式指定。