The official documentation actually covers all of this quite clearly; this post just pulls the pieces together as a record, for anyone who needs it.
Straight to the code:
import com.google.common.collect.Maps;
import lombok.extern.slf4j.Slf4j;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.CanCommitOffsets;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.OffsetRange;

import java.util.Map;

@Slf4j
public class Sink2HiveTask {

    public static void main(String[] args) throws Exception {
        String topic = "test5";
        SparkConf sparkConf = new SparkConf().setMaster("local[2]").setAppName("test");
        sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        //sparkConf.set("spark.kryo.registrator", "com.ykc.task.MyRegistrator"); // register ConsumerRecord with Kryo
        //sparkConf.set("spark.rdd.compress", "true"); // RDD compression
        //sparkConf.set("spark.kryo.registrator", MyRegistrator.class.getName());
        SparkSession ss = SparkSession.builder()
                .config(sparkConf)
                .enableHiveSupport()
                .getOrCreate();
        JavaStreamingContext jsc = new JavaStreamingContext(
                new JavaSparkContext(ss.sparkContext()), Durations.seconds(20));

        // Consume from Kafka -- this is the key part!
        JavaInputDStream<ConsumerRecord<String, String>> stream =
                KafkaParamsConfig.buildKafkaSourceDStream(topic, jsc);

        ss.sql("set hive.exec.dynamic.partition = true");
        ss.sql("set hive.exec.dynamic.partition.mode = nonstrict");
        String hiveDatabase = PropConfig.getProperty("ykc.hive.database");
        ss.sql("use " + hiveDatabase);

        stream.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(JavaRDD<ConsumerRecord<String, String>> javaRDD) throws Exception {
                OffsetRange[] offsetRanges = ((HasOffsetRanges) javaRDD.rdd()).offsetRanges();
                // Process the data here.
                // Omitted; depends on your specific business needs.
                // .....
                // After the data is written successfully, save each partition's offset to Redis.
                // Note: fromOffset() is the first offset of this batch, so on restart the
                // last batch is replayed (at-least-once); store untilOffset() instead if
                // the Hive write is idempotent and the batch should be skipped.
                Map<String, String> redisMap = Maps.newHashMap();
                for (OffsetRange offsetRange : offsetRanges) {
                    redisMap.put(String.valueOf(offsetRange.partition()),
                            String.valueOf(offsetRange.fromOffset()));
                }
                RedisUtil.hSetKV(topic, redisMap);
                // Commit the offsets back to Kafka.
                ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
            }
        });

        jsc.start();
        jsc.awaitTermination();
    }
}
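The post flags KafkaParamsConfig.buildKafkaSourceDStream as the key part but never shows it. Below is a minimal sketch of what such a helper could look like, not the original code: it assumes the offsets written to Redis above are read back through a hypothetical RedisUtil.hGetAll, and that the broker address and group id come from the same PropConfig properties file (both property names are assumptions).

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

public class KafkaParamsConfig {

    public static JavaInputDStream<ConsumerRecord<String, String>> buildKafkaSourceDStream(
            String topic, JavaStreamingContext jsc) {
        Map<String, Object> kafkaParams = new HashMap<>();
        // Property names "kafka.bootstrap.servers" / "kafka.group.id" are assumptions.
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, PropConfig.getProperty("kafka.bootstrap.servers"));
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, PropConfig.getProperty("kafka.group.id"));
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        // Offsets are managed manually (Redis + commitAsync), so disable auto-commit.
        kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
        // Fallback when no offset is stored for a partition yet.
        kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        // Restore the per-partition offsets saved in Redis by Sink2HiveTask; an empty
        // map means "start from the committed / auto.offset.reset position".
        Map<TopicPartition, Long> fromOffsets = new HashMap<>();
        Map<String, String> saved = RedisUtil.hGetAll(topic); // hypothetical read-side of hSetKV
        if (saved != null) {
            for (Map.Entry<String, String> e : saved.entrySet()) {
                fromOffsets.put(new TopicPartition(topic, Integer.parseInt(e.getKey())),
                        Long.parseLong(e.getValue()));
            }
        }

        return KafkaUtils.createDirectStream(
                jsc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(
                        Collections.singletonList(topic), kafkaParams, fromOffsets));
    }
}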
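RedisUtil is also custom. A minimal sketch with Jedis 3.x, assuming a local Redis (in the original the address most likely comes from PropConfig) and adding the hGetAll read used in the sketch above:

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;

import java.util.Map;

public class RedisUtil {

    // Assumed connection settings.
    private static final JedisPool POOL = new JedisPool("localhost", 6379);

    // Write all partition -> offset fields of one topic into a single Redis hash.
    public static void hSetKV(String key, Map<String, String> fields) {
        try (Jedis jedis = POOL.getResource()) {
            jedis.hset(key, fields);
        }
    }

    // Read the hash back when rebuilding the stream's starting offsets.
    public static Map<String, String> hGetAll(String key) {
        try (Jedis jedis = POOL.getResource()) {
            return jedis.hgetAll(key);
        }
    }
}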
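Finally, about the commented-out spark.kryo.registrator lines: if you cache or shuffle the raw ConsumerRecord objects, Kryo needs the class registered. The original com.ykc.task.MyRegistrator is not shown in the post; a registrator of that kind could look like this:

import com.esotericsoftware.kryo.Kryo;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.spark.serializer.KryoRegistrator;

public class MyRegistrator implements KryoRegistrator {
    @Override
    public void registerClasses(Kryo kryo) {
        // Register ConsumerRecord so Kryo can serialize cached/shuffled Kafka records.
        kryo.register(ConsumerRecord.class);
    }
}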