Spark Streaming Consuming Kafka Data and Bulk-Inserting into Elasticsearch in Real Time (Java Version)

The code below covers every step: ingesting data from Kafka, processing it with Spark operators, recording Kafka offsets, enabling backpressure, and bulk-inserting the results into Elasticsearch.

package com.app;

import com.util.RedisUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import scala.Tuple2;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.*;

public class RealTime_Data {

    //Exception flag: on failure the batch's starting offsets are recorded, on success its ending offsets
    public static Boolean ExceptionFlag = true;
    //Kafka settings (brokers, consumer group, topic)
    private static String topics = "topics";         //topic name
    private static String groupId = "groupId_cs4";   //consumer group id
    private static String offset = "offset";         //Redis key under which offsets are stored
    private static String brokers = "IP:9092,IP:9092,IP:9092";//Kafka broker addresses

    public static void main(String[] args) throws InterruptedException {

        SparkConf conf = new SparkConf().setAppName("RealTime_Data");
        conf.set("spark.dynamicAllocation.enabled", "false");
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
        conf.set("spark.streaming.backpressure.enabled", "true");     //启用反压
        conf.set("spark.streaming.backpressure.pid.minRate","100");   //最小条数
        //当前调度最大消费条数=分区数*秒数*最大消费限制条数
		conf.set("spark.streaming.kafka.maxRatePerPartition","2500"); //最大条数
        conf.set("spark.speculation", "true");//开启资源动态调用
        //设置三线程
		conf.setMaster("local[3]");
		
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(60));
        Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        //when no saved offsets exist, start consuming from the latest offsets
        kafkaParams.put("auto.offset.reset", "latest");
        //disable automatic offset commits; offsets are managed manually in Redis
        kafkaParams.put("enable.auto.commit", "false");

        HashMap<TopicPartition, Long> mapTopic = new HashMap<>();
        JavaInputDStream<ConsumerRecord<String, String>> messages =null;

        Boolean flag = RedisUtil.FlagExits(offset, 1);
        if(flag){
            Map<String, String> offsets = RedisUtil.getAll(offset, 1);
            for (Map.Entry<String, String> entry : offsets.entrySet()) {
                String partition = entry.getKey();
                String offset = entry.getValue();
                //strip the appended time/status suffix so only the numeric offset is passed to Kafka
                String[] s = offset.split("_", -1);
                String offset_last = s[0];
                TopicPartition topicPartition = new TopicPartition(topics, Integer.valueOf(partition));
                mapTopic.put(topicPartition, Long.valueOf(offset_last));
            }
            messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams, mapTopic));
        }else{
            messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams));
        }

        JavaDStream<ConsumerRecord<String, String>> v1 = messages.filter(new Function<ConsumerRecord<String, String>, Boolean>() {
            @Override
            public Boolean call(ConsumerRecord<String, String> v1) throws Exception {
                //keep only records whose key starts with "data" and whose field at index 9 (the WiFi MAC) is not blank
                if (v1.key().startsWith("data") && StringUtils.isNotBlank(v1.value().split(",", -1)[9])) {
                    return true;
                }
                return false;
            }
        });

        //Normalize the records and extract the fields needed downstream
        
        JavaPairDStream<String, WifiEntity> v2 = v1.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, WifiEntity>() {
            @Override
            public Tuple2<String, WifiEntity> call(ConsumerRecord<String, String> v) throws Exception {
                WifiEntity wifiEntity = new WifiEntity();
                
                String[] splits = v.value().split(",", -1);
                //composite key: user MAC ^ WiFi MAC
                wifiEntity.setUserXXX_XXX(splits[0] + "^" + splits[9]);
                //user (phone) MAC
                wifiEntity.setUserXXX(splits[0]);
                //collection time
                wifiEntity.setTime(splits[1]);
                //the remaining fields (device number, channel, etc.) are set the same way
                return new Tuple2<>(wifiEntity.getUsermac_wifimac(), wifiEntity);
            }
        });
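        //For each micro-batch, open a transport client and bulk-index the cleaned records into Elasticsearch.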

        v2.foreachRDD(new VoidFunction<JavaPairRDD<String, WifiEntity>>() {
            @Override
            public void call(JavaPairRDD<String, WifiEntity> v3) throws Exception {
                //repartition to 2 so no single partition submits an oversized bulk request
                v3.repartition(2).foreachPartition(new VoidFunction<Iterator<Tuple2<String, WifiEntity>>>() {
                    @Override
                    public void call(Iterator<Tuple2<String, WifiEntity>> v4) {
                        TransportClient client = null;
                        System.setProperty("es.set.netty.runtime.available.processors", "false");
                        try {
                            client = new PreBuiltTransportClient(Settings.builder()
                                    .put("cluster.name", "集群名称")
                                    .put("client.transport.sniff", true)
                                    .put("thread_pool.search.size", 6)
                                    .build()).addTransportAddress(new TransportAddress(InetAddress.getByName("50.105.1.30"), 9300));
                        } catch (UnknownHostException e) {
                            e.printStackTrace();
                            ExceptionFlag = false;
                        }
                        //guard against a failed client creation to avoid a NullPointerException
                        if (client == null) {
                            return;
                        }

                        BulkRequestBuilder builder = client.prepareBulk();
                        IndexRequestBuilder request = null;
                        HashMap<String, String> map = new HashMap<>();
                        WifiEntity wifiEntity = null;
                        while (v4.hasNext()) {
                            Tuple2<String, WifiEntity> next = v4.next();
                            wifiEntity = next._2();
                            map.put("XXXXXX", wifiEntity.getUsermac_wifimac());
                            map.put("XXXXXX", wifiEntity.getUsermac());
                            map.put("XXXXXX", wifiEntity.getDevice_number());
                            map.put("XXXXXX", wifiEntity.getWifimac());
                            map.put("XXXXXX", wifiEntity.getTime());
                            map.put("XXXXXX", wifiEntity.getXindao());
                            map.put("XXXXXX", wifiEntity.getXindaoqd());
                            request = client.prepareIndex("user_xxxx_v1", "doc", wifiEntity.getUsermac_XXXXXX()).setSource(map);
                            builder.add(request);
                        }
                        //skip the bulk call when there is nothing to index
                        if (0 != builder.numberOfActions()) {
                            builder.get();
                        }
                        client.close();
                    }
                });
            }
        });
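        //After the Elasticsearch write, record the consumed offsets for this batch in Redis.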

        messages.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {
            @Override
            public void call(JavaRDD<ConsumerRecord<String, String>> v3) throws Exception {
                //Read the offset ranges on the driver from the original Kafka RDD, before repartitioning,
                //because only the KafkaRDD returned by the direct stream implements HasOffsetRanges.
                OffsetRange[] offsetRanges = ((HasOffsetRanges) v3.rdd()).offsetRanges();
                v3.repartition(1).foreachPartition(new VoidFunction<Iterator<ConsumerRecord<String, String>>>() {
                    @Override
                    public void call(Iterator<ConsumerRecord<String, String>> st) throws Exception {
                        String time = com.util.DateUtil.formatDateString(new Date(), com.util.DateUtil.DATE_FORMAT_12W);
                        HashMap<String, String> redisMapOk = new HashMap<>();
                        HashMap<String, String> redisMapErro = new HashMap<>();
                        for (OffsetRange offsetRange : offsetRanges) {
                            //on success store the end offset; on failure store the start offset so the batch is replayed
                            redisMapOk.put(String.valueOf(offsetRange.partition()), offsetRange.untilOffset() + "_" + time + "_OK");
                            redisMapErro.put(String.valueOf(offsetRange.partition()), offsetRange.fromOffset() + "_" + time + "_ERROR");
                        }
                        if(st.hasNext()){
                            if (ExceptionFlag) {
                                RedisUtil.PutAll(offset, redisMapOk, 1);
                            } else {
                                RedisUtil.PutAll(offset, redisMapErro, 1);
                            }
                        }
                    }
                });
            }
        });
        ssc.start();
        ssc.awaitTermination();
    }
}
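
The listing above depends on helper classes that are not shown in the post: `WifiEntity`, a plain POJO holding the parsed fields, `com.util.DateUtil`, a date-formatting helper, and `com.util.RedisUtil`, which keeps the partition-to-offset map in Redis (the POM below pulls in Jedis). As a rough sketch only, here is what a Jedis-based `RedisUtil` with the three methods used above (`FlagExits`, `getAll`, `PutAll`) might look like; the host, port, and the choice of a Redis hash are assumptions, not the original implementation.

```java
package com.util;

import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

import java.util.Map;

//Sketch of the Redis helper used by the job. Host, port and pool settings are placeholders.
public class RedisUtil {

    private static final JedisPool POOL =
            new JedisPool(new JedisPoolConfig(), "127.0.0.1", 6379);

    //Returns true if the hash that stores the offsets already exists in the given Redis DB.
    public static Boolean FlagExits(String key, int db) {
        try (Jedis jedis = POOL.getResource()) {
            jedis.select(db);
            return jedis.exists(key);
        }
    }

    //Reads the whole offset hash: partition -> "offset_time_status".
    public static Map<String, String> getAll(String key, int db) {
        try (Jedis jedis = POOL.getResource()) {
            jedis.select(db);
            return jedis.hgetAll(key);
        }
    }

    //Writes the offset map back as a Redis hash.
    public static void PutAll(String key, Map<String, String> map, int db) {
        try (Jedis jedis = POOL.getResource()) {
            jedis.select(db);
            jedis.hmset(key, map);
        }
    }
}
```

The `db` argument mirrors the `1` passed throughout the job and simply selects the Redis database before each operation.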

The Maven POM dependencies are as follows:

 <!-- spark start -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>2.11.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.3.0</version>
        </dependency>
        <!-- spark end -->

        <!--hadoop hdfs start-->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.6.0</version>
        </dependency>
        <!--hadoop hdfs end-->

        <!-- Kafka client start -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.10.0.0</version>
        </dependency>
        <!-- Kafka client end -->

        <!-- ZooKeeper client start -->
        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>3.4.5</version>
        </dependency>
        <!-- ZooKeeper client end -->

        <!-- Hadoop common start -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0</version>
        </dependency>
        <!-- Hadoop common end -->

        <!-- HBase client start -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.1.3</version>
        </dependency>
        <!-- HBase client end -->
        <!-- Hadoop client start -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0</version>
        </dependency>
        <!-- Hadoop client end -->

        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>3.0.1</version>
        </dependency>

        <!--Mysql start-->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>
        <!--Mysql end-->

        <!--FastJson start-->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.23</version>
        </dependency>
        <!--FastJson end-->


        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>transport</artifactId>
            <version>6.5.1</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.21</version>
        </dependency>

        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.21</version>
        </dependency>

        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.6</version>
        </dependency>

Below is a simpler example of reading Kafka data with Spark Streaming in Java:

```java
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

public class KafkaSparkStreamingExample {

    public static void main(String[] args) throws InterruptedException {
        // Kafka configuration
        String brokers = "localhost:9092";
        String groupId = "test-group";
        String topics = "test-topic";

        // Spark configuration
        SparkConf conf = new SparkConf().setAppName("KafkaSparkStreamingExample").setMaster("local[*]");
        JavaStreamingContext streamingContext = new JavaStreamingContext(conf, Durations.seconds(5));

        // Kafka consumer parameters
        Map<String, Object> kafkaParams = new HashMap<>();
        kafkaParams.put("bootstrap.servers", brokers);
        kafkaParams.put("key.deserializer", StringDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", groupId);
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        // Subscribe to the Kafka topic(s)
        Collection<String> topicsSet = Arrays.asList(topics.split(","));
        JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(
                streamingContext,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams)
        );

        // Process the messages
        JavaPairDStream<String, String> pairs = messages.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
        pairs.foreachRDD(rdd -> rdd.foreach(record -> System.out.println(record._2)));

        // Start the streaming job
        streamingContext.start();
        streamingContext.awaitTermination();
    }
}
```

In this example, `KafkaUtils.createDirectStream()` subscribes to a Kafka topic, `mapToPair()` turns each record into a key/value pair, and `foreachRDD()` iterates over each RDD and prints every record's value.
