The code below walks through every step of the pipeline: ingesting data from Kafka, processing it with Spark operators, recording Kafka offsets (in Redis), enabling backpressure, and bulk-inserting the results into Elasticsearch.
package com.app;
import com.util.RedisUtil;
import org.apache.commons.lang.StringUtils;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import scala.Tuple2;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.*;
public class RealTime_Data {
//Exception flag: if an exception occurs, record the batch's starting offset; otherwise record its ending offset
public static Boolean ExceptionFlag = true;
//Kafka parameters (brokers, consumer group, topic)
private static String topics = "topics"; //topic(s) to consume
private static String groupId = "groupId_cs4"; //consumer group id
private static String offset = "offset"; //Redis key under which offsets are stored
private static String brokers = "IP:9092,IP:9092,IP:9092";//Kafka broker addresses
public static void main(String[] args) throws InterruptedException {
SparkConf conf = new SparkConf().setAppName("RealTime_Data");
conf.set("spark.dynamicAllocation.enabled", "false");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.streaming.backpressure.enabled", "true"); //启用反压
conf.set("spark.streaming.backpressure.pid.minRate","100"); //最小条数
//当前调度最大消费条数=分区数*秒数*最大消费限制条数
conf.set("spark.streaming.kafka.maxRatePerPartition","2500"); //最大条数
conf.set("spark.speculation", "true");//开启资源动态调用
//设置三线程
conf.setMaster("local[3]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(60));
Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
//with no stored offset available, start consuming from the latest offset
kafkaParams.put("auto.offset.reset", "latest");
//disable automatic offset commits; offsets are managed manually in Redis
kafkaParams.put("enable.auto.commit", "false");
HashMap<TopicPartition, Long> mapTopic = new HashMap<>();
JavaInputDStream<ConsumerRecord<String, String>> messages =null;
Boolean flag = RedisUtil.FlagExits(offset, 1);
if(flag){
Map<String, String> offsets = RedisUtil.getAll(offset, 1);
for (Map.Entry<String, String> entry : offsets.entrySet()) {
String partition = entry.getKey();
String offsetValue = entry.getValue();
//strip the timestamp/status suffix so only the numeric offset is passed on
String[] s = offsetValue.split("_", -1);
String offset_last = s[0];
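//e.g. a stored value such as "12345_20200101120000_OK" (format illustrative) yields offset_last = "12345"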
TopicPartition topicPartition = new TopicPartition(topics, Integer.valueOf(partition));
mapTopic.put(topicPartition, Long.valueOf(offset_last));
}
messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams, mapTopic));
}else{
messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams));
}
JavaDStream<ConsumerRecord<String, String>> v1 = messages.filter(new Function<ConsumerRecord<String, String>, Boolean>() {
@Override
public Boolean call(ConsumerRecord<String, String> v1) throws Exception {
//keep only records whose key starts with "data" and whose field at index 9 is non-blank
if (v1.key().startsWith("data") && StringUtils.isNotBlank(v1.value().split(",", -1)[9])) {
return true;
}
return false;
}
});
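//Illustrative (hypothetical) record: key = "data_001",
//value = "AA:BB:CC:11:22:33,20200101120000,dev001,,,,,,,D4:6A:6A:11:22:33"
//-> the key starts with "data" and field index 9 is non-blank, so the record passes the filter.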
//normalize the data and extract the fields each downstream sink needs
JavaPairDStream<String, WifiEntity> v2 = v1.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, WifiEntity>() {
@Override
public Tuple2<String, WifiEntity> call(ConsumerRecord<String, String> v) throws Exception {
WifiEntity wifiEntity = new WifiEntity();
String[] splits = v.value().split(",", -1);
//composite key: field 0 and field 9 joined by "^"
wifiEntity.setUserXXX_XXX(splits[0] + "^" + splits[9]);
//user (phone) MAC address
wifiEntity.setUserXXX(splits[0]);
//collection timestamp
wifiEntity.setTime(splits[1]);
//device number (the corresponding setter call is not shown in this excerpt)
return new Tuple2<>(wifiEntity.getUsermac_wifimac(), wifiEntity);
}
});
v2.foreachRDD(new VoidFunction<JavaPairRDD<String, WifiEntity>>() {
@Override
public void call(JavaPairRDD<String, WifiEntity> v3) throws Exception {
//repartition into two partitions so no single partition submits an oversized bulk request
v3.repartition(2).foreachPartition(new VoidFunction<Iterator<Tuple2<String, WifiEntity>>>() {
@Override
public void call(Iterator<Tuple2<String, WifiEntity>> v4) {
TransportClient client = null;
System.setProperty("es.set.netty.runtime.available.processors", "false");
try {
client = new PreBuiltTransportClient(Settings.builder()
.put("cluster.name", "集群名称")
.put("client.transport.sniff", true)
.put("thread_pool.search.size", 6)
.build()).addTransportAddress(new TransportAddress(InetAddress.getByName("50.105.1.30"), 9300));
} catch (UnknownHostException e) {
e.printStackTrace();
ExceptionFlag = false;
}
BulkRequestBuilder builder = client.prepareBulk();
IndexRequestBuilder request = null;
HashMap<String, String> map = new HashMap<>();
WifiEntity wifiEntity = null;
while (v4.hasNext()) {
Tuple2<String, WifiEntity> next = v4.next();
wifiEntity = next._2();
map.put("XXXXXX", wifiEntity.getUsermac_wifimac());
map.put("XXXXXX", wifiEntity.getUsermac());
map.put("XXXXXX", wifiEntity.getDevice_number());
map.put("XXXXXX", wifiEntity.getWifimac());
map.put("XXXXXX", wifiEntity.getTime());
map.put("XXXXXX", wifiEntity.getXindao());
map.put("XXXXXX", wifiEntity.getXindaoqd());
request = client.prepareIndex("user_xxxx_v1", "doc", wifiEntity.getUsermac_XXXXXX()).setSource(map);
builder.add(request);
}
//skip the bulk call when no index actions were added
if (0 != builder.numberOfActions()) {
builder.get();
}
client.close();
}
});
}
});
messages.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {
@Override
public void call(JavaRDD<ConsumerRecord<String, String>> v3) throws Exception {
//offsetRanges must be read on the driver from the Kafka RDD, before any repartitioning
OffsetRange[] offsetRanges = ((HasOffsetRanges) v3.rdd()).offsetRanges();
v3.repartition(1).foreachPartition(new VoidFunction<Iterator<ConsumerRecord<String, String>>>() {
@Override
public void call(Iterator<ConsumerRecord<String, String>> st) throws Exception {
String time = com.util.DateUtil.formatDateString(new Date(), com.util.DateUtil.DATE_FORMAT_12W);
HashMap<String, String> redisMapOk = new HashMap<>();
HashMap<String, String> redisMapError = new HashMap<>();
//on success store the ending offset; on failure store the starting offset so the batch is re-read
for (OffsetRange offsetRange : offsetRanges) {
redisMapOk.put(String.valueOf(offsetRange.partition()), offsetRange.untilOffset() + "_" + time + "_OK");
redisMapError.put(String.valueOf(offsetRange.partition()), offsetRange.fromOffset() + "_" + time + "_ERROR");
}
//only update Redis when the batch actually contained records
if (st.hasNext()) {
if (ExceptionFlag) {
RedisUtil.PutAll(offset, redisMapOk, 1);
} else {
RedisUtil.PutAll(offset, redisMapError, 1);
}
}
}
});
}
});
ssc.start();
ssc.awaitTermination();
}
}
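RedisUtil (like WifiEntity and DateUtil) is a project-specific helper that is not included here. A minimal sketch of the three methods the job calls (FlagExits, getAll, PutAll), assuming a plain Jedis connection and a Redis hash keyed by partition number, might look like the following; the host, port, and connection handling are placeholders rather than the original implementation:

package com.util;

import redis.clients.jedis.Jedis;

import java.util.Map;

public class RedisUtil {

    //Placeholder connection settings; the original class is not shown in the article.
    private static final String REDIS_HOST = "127.0.0.1";
    private static final int REDIS_PORT = 6379;

    private static Jedis getJedis(int db) {
        Jedis jedis = new Jedis(REDIS_HOST, REDIS_PORT);
        jedis.select(db); //the job stores its offsets in database index 1
        return jedis;
    }

    //true if the offset hash already exists, i.e. the job has stored offsets before
    public static Boolean FlagExits(String key, int db) {
        try (Jedis jedis = getJedis(db)) {
            return jedis.exists(key);
        }
    }

    //read all "partition -> offset_time_status" entries
    public static Map<String, String> getAll(String key, int db) {
        try (Jedis jedis = getJedis(db)) {
            return jedis.hgetAll(key);
        }
    }

    //overwrite the stored offsets for every partition in one call
    public static void PutAll(String key, Map<String, String> values, int db) {
        try (Jedis jedis = getJedis(db)) {
            jedis.hmset(key, values);
        }
    }
}

With this shape, the driver reads the hash once at startup to seed the Kafka consumer, and each batch overwrites it with either the ending offsets (success) or the starting offsets (failure).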
The Maven POM coordinates are as follows:
<!-- spark start -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- spark end -->
<!--hadoop hdfs start-->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0</version>
</dependency>
<!--hadoop hdfs end-->
<!--kafka 客户端 start-->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.0.0</version>
</dependency>
<!--kafka 客户端 end-->
<!--zk 客户端 start-->
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.5</version>
</dependency>
<!--zk 客户端 end-->
<!--Hadoop公共组件包 start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0</version>
</dependency>
<!--Hadoop公共组件包 end -->
<!--Hbase包 start -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.1.3</version>
</dependency>
<!--Hbase包 END -->
<!--Hadoop客户端 start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<!--Hadoop客户端 end-->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>3.0.1</version>
</dependency>
<!--Mysql start-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
<!--Mysql end-->
<!--FastJson start-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.23</version>
</dependency>
<!--FastJson end-->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
<version>6.5.1</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.21</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.21</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.6</version>
</dependency>