Flink消费kafka写入hive,报错GC

报错信息

使用Flink消费Kafka数据并写入Hive时,任务报GC相关的内存溢出错误(OutOfMemoryError)。将TaskManager内存加到16GB后,仍然不起作用。

java.lang.OutOfMemoryError: GC overhead limit exceeded
	at java.nio.HeapByteBuffer.<init>(HeapByteBuffer.java:57) ~[?:1.8.0_181]
	at java.nio.ByteBuffer.allocate(ByteBuffer.java:335) ~[?:1.8.0_181]
	at org.apache.hive.orc.impl.OutStream.getNewInputBuffer(OutStream.java:109) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.OutStream.write(OutStream.java:142) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at com.google.protobuf.CodedOutputStream.refreshBuffer(CodedOutputStream.java:833) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at com.google.protobuf.CodedOutputStream.flush(CodedOutputStream.java:843) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at com.google.protobuf.AbstractMessageLite.writeTo(AbstractMessageLite.java:80) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.PhysicalFsWriter.writeIndexStream(PhysicalFsWriter.java:512) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$StreamFactory.writeIndex(WriterImpl.java:221) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$TreeWriter.writeStripe(WriterImpl.java:531) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$StringBaseTreeWriter.writeStripe(WriterImpl.java:1007) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$StructTreeWriter.writeStripe(WriterImpl.java:1786) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.flushStripe(WriterImpl.java:2171) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.close(WriterImpl.java:2335) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hadoop.hive.ql.io.orc.WriterImpl.close(WriterImpl.java:330) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat$OrcRecordWriter.close(OrcOutputFormat.java:120) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.connectors.hive.write.HiveBulkWriterFactory$1.finish(HiveBulkWriterFactory.java:79) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.formats.hadoop.bulk.HadoopPathBasedPartFileWriter.closeForCommit(HadoopPathBasedPartFileWriter.java:71) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Bucket.closePartFile(Bucket.java:262) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Bucket.prepareBucketForCheckpointing(Bucket.java:304) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Bucket.onReceptionOfCheckpoint(Bucket.java:276) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Buckets.snapshotActiveBuckets(Buckets.java:270) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Buckets.snapshotState(Buckets.java:261) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSinkHelper.snapshotState(StreamingFileSinkHelper.java:87) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.table.filesystem.stream.AbstractStreamingWriter.snapshotState(AbstractStreamingWriter.java:129) ~[flink-table_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.table.filesystem.stream.StreamingFileWriter.snapshotState(StreamingFileWriter.java:101) ~[flink-table_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.operators.StreamOperatorStateHandler.snapshotState(StreamOperatorStateHandler.java:219) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.operators.StreamOperatorStateHandler.snapshotState(StreamOperatorStateHandler.java:170) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.operators.AbstractStreamOperator.snapshotState(AbstractStreamOperator.java:348) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.RegularOperatorChain.checkpointStreamOperator(RegularOperatorChain.java:233) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.RegularOperatorChain.buildOperatorSnapshotFutures(RegularOperatorChain.java:206) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.RegularOperatorChain.snapshotState(RegularOperatorChain.java:186) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
java.lang.OutOfMemoryError: Java heap space
	at org.apache.hive.orc.impl.RunLengthIntegerWriterV2.<init>(RunLengthIntegerWriterV2.java:140) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$TreeWriter.createIntegerWriter(WriterImpl.java:398) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$IntegerTreeWriter.<init>(WriterImpl.java:745) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.createTreeWriter(WriterImpl.java:2087) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.access$1200(WriterImpl.java:88) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl$StructTreeWriter.<init>(WriterImpl.java:1720) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.createTreeWriter(WriterImpl.java:2117) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.<init>(WriterImpl.java:161) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hive.orc.impl.WriterImpl.<init>(WriterImpl.java:126) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hadoop.hive.ql.io.orc.WriterImpl.<init>(WriterImpl.java:94) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hadoop.hive.ql.io.orc.OrcFile.createWriter(OrcFile.java:314) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat$OrcRecordWriter.write(OrcOutputFormat.java:101) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.connectors.hive.write.HiveBulkWriterFactory$1.addElement(HiveBulkWriterFactory.java:71) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.connectors.hive.write.HiveBulkWriterFactory$1.addElement(HiveBulkWriterFactory.java:51) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.formats.hadoop.bulk.HadoopPathBasedPartFileWriter.write(HadoopPathBasedPartFileWriter.java:59) ~[flink-sql-connector-hive-2.2.0_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Bucket.write(Bucket.java:222) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.Buckets.onElement(Buckets.java:305) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSinkHelper.onElement(StreamingFileSinkHelper.java:103) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.table.filesystem.stream.AbstractStreamingWriter.processElement(AbstractStreamingWriter.java:140) ~[flink-table_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:82) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:57) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:29) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:56) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:29) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at StreamExecCalc$85.processElement_split6(Unknown Source) ~[?:?]
	at StreamExecCalc$85.processElement(Unknown Source) ~[?:?]
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:82) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:57) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:29) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.runtime.tasks.SourceOperatorStreamTask$AsyncDataOutputToOutput.emitRecord(SourceOperatorStreamTask.java:196) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.streaming.api.operators.source.SourceOutputWithWatermarks.collect(SourceOutputWithWatermarks.java:110) ~[flink-dist_2.11-1.14.2.jar:1.14.2]
	at org.apache.flink.connector.kafka.source.reader.KafkaRecordEmitter.emitRecord(KafkaRecordEmitter.java:36) ~[flink-sql-connector-kafka_2.11-1.14.2.jar:1.14.2]

解决方法

TaskManager内存加到16GB后,仍然报GC错误。根据任务情况判断,肯定不是内存设置不足的问题。
参考文章:

https://community.cloudera.com/t5/Support-Questions/I-am-getting-outofmemory-while-inserting-the-data-into-table/m-p/119682

原因是:写入的Hive表分区数过多(分钟级分区),并且Hive表的存储格式是ORC。
ORC writer会为每个正在写入的输出文件保留一块打开的内存缓冲区。因此,如果同时向分区表的大量分区写入数据,这些writer会一直占用大量内存。所以即使把内存加到16GB也无济于事。
解决办法:将Hive表的存储格式由ORC格式改成text格式后,任务运行正常。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
以下是使用Java编写Flink消费Kafka写入Hive的示例代码: 1. 导入依赖 ```java import org.apache.flink.api.common.functions.MapFunction; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer; import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper; import org.apache.flink.streaming.util.serialization.SimpleStringSchema; ``` 2. 配置Kafka连接 ```java String kafkaBootstrapServers = "localhost:9092"; String kafkaTopic = "test"; Properties kafkaProps = new Properties(); kafkaProps.setProperty("bootstrap.servers", kafkaBootstrapServers); kafkaProps.setProperty("group.id", "flink-group"); ``` 3. 创建 Flink 环境和 Kafka 消费者 ```java StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); DataStream<String> kafkaStream = env.addSource(new FlinkKafkaConsumer<>(kafkaTopic, new SimpleStringSchema(), kafkaProps)); ``` 4. 对收到的消息进行处理 ```java DataStream<String> processedStream = kafkaStream.map(new MapFunction<String, String>() { @Override public String map(String value) throws Exception { // 在这里对数据进行处理,返回处理后的数据 return value; } }); ``` 5. 
将处理后的数据写入 Hive ```java String hiveTableName = "test"; String hiveMetastoreUri = "thrift://localhost:9083"; String hiveDbName = "default"; String hivePartitionColumn = "dt"; String hivePartitionValue = "20220101"; String hiveOutputPath = "/user/hive/warehouse/" + hiveDbName + ".db/" + hiveTableName + "/" + hivePartitionColumn + "=" + hivePartitionValue; DataStream<String> hiveDataStream = processedStream.map(new MapFunction<String, String>() { @Override public String map(String value) throws Exception { // 在这里将数据转换为 Hive 表的格式,返回转换后的数据 return value; } }); // 将数据写入 Hive hiveDataStream.addSink(new FlinkHiveOutputFormat<>(new Path(hiveOutputPath), new org.apache.hadoop.hive.ql.io.orc.OrcSerde(), new Object[]{})); ``` 6. 将处理后的数据写入Kafka ```java String kafkaOutputTopic = "output"; FlinkKafkaProducer<String> kafkaProducer = new FlinkKafkaProducer<>(kafkaBootstrapServers, kafkaOutputTopic, new KeyedSerializationSchemaWrapper<>(new SimpleStringSchema()), kafkaProps); // 将数据Kafka processedStream.addSink(kafkaProducer); ``` 完整示例代码: ```java import org.apache.flink.api.common.functions.MapFunction; import org.apache.flink.api.common.serialization.SimpleStringSchema; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer; import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer; import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper; import org.apache.flink.streaming.util.serialization.SimpleStringSchema; import java.util.Properties; public class FlinkKafkaToHiveDemo { public static void main(String[] args) throws Exception { String kafkaBootstrapServers = "localhost:9092"; String kafkaTopic = "test"; Properties kafkaProps = new Properties(); kafkaProps.setProperty("bootstrap.servers", kafkaBootstrapServers); kafkaProps.setProperty("group.id", "flink-group"); String hiveTableName = "test"; 
String hiveMetastoreUri = "thrift://localhost:9083"; String hiveDbName = "default"; String hivePartitionColumn = "dt"; String hivePartitionValue = "20220101"; String hiveOutputPath = "/user/hive/warehouse/" + hiveDbName + ".db/" + hiveTableName + "/" + hivePartitionColumn + "=" + hivePartitionValue; StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(); DataStream<String> kafkaStream = env.addSource(new FlinkKafkaConsumer<>(kafkaTopic, new SimpleStringSchema(), kafkaProps)); DataStream<String> processedStream = kafkaStream.map(new MapFunction<String, String>() { @Override public String map(String value) throws Exception { // 在这里对数据进行处理,返回处理后的数据 return value; } }); DataStream<String> hiveDataStream = processedStream.map(new MapFunction<String, String>() { @Override public String map(String value) throws Exception { // 在这里将数据转换为 Hive 表的格式,返回转换后的数据 return value; } }); DataStream<String> kafkaOutputStream = processedStream.map(new MapFunction<String, String>() { @Override public String map(String value) throws Exception { // 在这里对数据进行处理,返回处理后的数据 return value; } }); FlinkKafkaProducer<String> kafkaProducer = new FlinkKafkaProducer<>(kafkaBootstrapServers, kafkaOutputTopic, new KeyedSerializationSchemaWrapper<>(new SimpleStringSchema()), kafkaProps); processedStream.addSink(kafkaProducer); hiveDataStream.addSink(new FlinkHiveOutputFormat<>(new Path(hiveOutputPath), new org.apache.hadoop.hive.ql.io.orc.OrcSerde(), new Object[]{})); env.execute("FlinkKafkaToHiveDemo"); } } ```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值