The following code walks through every step involved: ingesting data from Kafka, processing it with Spark operators, recording Kafka offsets, backpressure, and batch-inserting the data into HBase.
package com.data;
import com.alibaba.fastjson.JSON;
import com.entity.ImsiDataDTO;
import com.entity.MacImsiDataDTO;
import org.apache.commons.collections.IteratorUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import scala.Tuple2;
import com.util.DateUtil;
import com.util.RedisUtil;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.*;
/***
* Real-time data ingestion
* Real-time tracks from three data sources
* liuwunan
*/
public class RealTime_Track {
//Exception flag: if an exception occurs, record the starting offset of the batch; otherwise record the ending offset
public static Boolean ExceptionFlag = true;
//Kafka parameters (brokers, consumer group, topic)
private static String topics = "topic"; //topic name
private static String groupId = "consumer_001";//consumer group id
private static String offset = "offset";
private static String brokers = "IP:9092,IP:9092,IP:9092";//Kafka broker addresses
public static Properties prop = null;
static {
InputStream in = null;
try {
prop= new Properties();
//absolute path
//in= new BufferedInputStream(new FileInputStream("/home/work_space/ElasticSearch/resource/resource.properties"));
//relative path (classpath)
in = RealTime_Track.class.getResourceAsStream("/DianWei.properties");
prop.load(new InputStreamReader(in, "utf-8"));
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (in != null) {
in.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
public static void main(String[] args) throws InterruptedException {
//Set the HDFS replication factor to 1 (Hadoop's default is 3) to reduce data-sync latency
//Configuration hdfs = new Configuration();
//hdfs.set("dfs.replication", "1");
//Initialize Spark
SparkConf conf = new SparkConf().setAppName("RealTime_Track")
.set("spark.dynamicAllocation.enabled", "false");
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.set("spark.streaming.backpressure.enabled", "true");//启用反压
conf.set("spark.streaming.backpressure.pid.minRate", "1");//最小条数
conf.set("spark.streaming.kafka.maxRatePerPartition", "1000");//最大条数
//conf.setMaster("local[6]");
conf.set("spark.speculation", "true");//开启资源动态调用
JavaSparkContext sc = new JavaSparkContext(conf);
//Initialize the SparkStreaming context (60-second batch interval)
JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(60));
Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
//On startup, default to the latest consumer offsets
kafkaParams.put("auto.offset.reset", "latest");
//Disable automatic offset commits by the consumer
kafkaParams.put("enable.auto.commit", "false");
HashMap<TopicPartition, Long> mapTopic = new HashMap<>();
JavaInputDStream<ConsumerRecord<String, String>> messages = null;
Boolean flag = RedisUtil.FlagExits(offset, 1);
if (flag) {
Map<String, String> offsets = RedisUtil.getAll(offset, 1);
for (Map.Entry<String, String> entry : offsets.entrySet()) {
String partition = entry.getKey();
String offsetValue = entry.getValue();
//strip off the appended timestamp and pass only the offset, to avoid errors
String[] s = offsetValue.split("_", -1);
String offset_last = s[0];
TopicPartition topicPartition = new TopicPartition(topics, Integer.valueOf(partition));
mapTopic.put(topicPartition, Long.valueOf(offset_last));
}
//Connect to Kafka with the stored offsets and obtain the DStream
messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams, mapTopic));
} else {
System.out.println("重头消费 最新消费");
messages = KafkaUtils.createDirectStream(ssc, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams));
}
//Filter records by key: mt_data, DATA_MAC, BASIC_REALIDTRACK, BASIC_NETIDTRACK
JavaDStream<ConsumerRecord<String, String>> filter = messages.filter(new Function<ConsumerRecord<String, String>, Boolean>() {
@Override
public Boolean call(ConsumerRecord<String, String> v1) throws Exception {
//keep only the four data sources whose fields are assembled below
return v1.key().startsWith("mt_data")
|| v1.key().startsWith("DATA_MAC")
|| v1.key().startsWith("BASIC_REALIDTRACK")
|| v1.key().startsWith("BASIC_NETIDTRACK");
}
});
//Normalize the data and extract the fields required from each source
JavaPairDStream<String, MacImsiDataDTO> dataAll = filter.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, MacImsiDataDTO>() {
@Override
public Tuple2<String, MacImsiDataDTO> call(ConsumerRecord<String, String> v) throws Exception {
String[] splits = null;
MacImsiDataDTO personInfo = new MacImsiDataDTO();
if (v.key().startsWith("mt_data")) {
splits = v.value().split(",", -1);
//String Redis_HH_Device = RedisUtil.get("HH_Device_" + splits[2], 3);
String Redis_HH_Device = prop.getProperty("HH_Device_" + splits[2]);
//capture time 1
personInfo.setCaptureTime(splits[1]);
//phone MAC 2
personInfo.setPersonMac(splits[0]);
//device number 3
personInfo.setDeviceNum(splits[2]);
//device address 4
StringBuffer addr = new StringBuffer("");
if (null != Redis_HH_Device) {
addr.append(Redis_HH_Device.split(",", -1)[3]);
}
personInfo.setAddress(addr.toString());
//longitude 5
personInfo.setLng(splits[8]);
//latitude 6
personInfo.setLat(splits[7]);
//data source 7
personInfo.setDataSource("HH");
} else if (v.key().startsWith("DATA_MAC")) {
splits = v.value().split("\t", -1);
//full lookup record from Redis
/*String Redis_MMD_Unit = RedisUtil.get("MMD_Unit_" + splits[2], 3);
String Redis_MMD_Device_Mac = RedisUtil.get("MMD_Device_M" + splits[1], 3);*/
String Redis_MMD_Unit = prop.getProperty("MMD_Unit_" + splits[2]);
String Redis_MMD_Device_Mac = prop.getProperty("MMD_Device_M" + splits[1]);
//capture time 1
personInfo.setCaptureTime(splits[3] + "000");
//MAC 2
personInfo.setPersonMac(splits[0]);
//device number 3
StringBuffer DeviceNumber = new StringBuffer("");
if (null != Redis_MMD_Device_Mac) {
DeviceNumber.append(Redis_MMD_Device_Mac.split("\t", -1)[3]);
}
personInfo.setDeviceNum(DeviceNumber.toString());
//longitude 4
StringBuffer Lng = new StringBuffer("");
//latitude 5
StringBuffer Lat = new StringBuffer("");
if (Redis_MMD_Unit != null) {
Lng.append(Redis_MMD_Unit.split("\t", -1)[18]);
Lat.append(Redis_MMD_Unit.split("\t", -1)[20]);
}
personInfo.setLng(Lng.toString());
personInfo.setLat(Lat.toString());
//data source 6
personInfo.setDataSource("MMD");
//address 7
StringBuffer addr = new StringBuffer("");
if (null != Redis_MMD_Unit) {
addr.append(Redis_MMD_Unit.split("\t", -1)[6]);
addr.append("(");
addr.append(Redis_MMD_Unit.split("\t", -1)[13]);
addr.append(")");
}
personInfo.setAddress(addr.toString());
} else if (v.key().startsWith("BASIC_R")) {
splits = v.value().split("\t", -1);
//service venue code
if (StringUtils.isBlank(splits[16])) {
return null;
}
//String Redis_MMD_Unit = RedisUtil.get("MMD_Unit_" + splits[16], 3);
String Redis_MMD_Unit = prop.getProperty("MMD_Unit_" + splits[16]);
//address
StringBuffer addr = new StringBuffer("");
if (null != Redis_MMD_Unit) {
addr.append(Redis_MMD_Unit.split("\t", -1)[6]);
addr.append("(");
addr.append(Redis_MMD_Unit.split("\t", -1)[13]);
addr.append(")");
addr.append("采集");
}
//capture time 1
personInfo.setCaptureTime(splits[21] + "000");
//person identifier 2
personInfo.setPersonM(splits[14]);
//device number 3
personInfo.setDeviceNum(splits[22]);
//device address 4
personInfo.setAddress(addr.toString());
//data source 5
personInfo.setDataSource("MMD");
//longitude and latitude 6 7
String Lng = "";
String Lat = "";
if (Redis_MMD_Unit != null) {
Lng = Redis_MMD_Unit.split("\t", -1)[18];
Lat = Redis_MMD_Unit.split("\t", -1)[20];
}
personInfo.setLng(Lng);
personInfo.setLat(Lat);
} else if (v.key().startsWith("BASIC_NETIDTRACK")) {
splits = v.value().split("\t", -1);
if (StringUtils.isBlank(splits[6])) {
return null;
}
//full lookup record from Redis
//String Redis_MMD_Unit = RedisUtil.get("MMD_Unit_" + splits[6], 3);
String Redis_MMD_Unit = prop.getProperty("MMD_Unit_" + splits[6]);
//capture time 1
personInfo.setCaptureTime(splits[17] + "000");
//person identifier 2
personInfo.setPersonM(splits[14]);
//device number 3
personInfo.setDeviceNum(splits[18]);
//data source 4
personInfo.setDataSource("MMD");
//address 5
StringBuilder addr = new StringBuilder("");
if (null != Redis_MMD_Unit) {
addr.append(Redis_MMD_Unit.split("\t", -1)[6]);
addr.append("(");
addr.append(Redis_MMD_Unit.split("\t", -1)[13]);
addr.append(")");
addr.append("LWCJ");
}
personInfo.setAddress(addr.toString());
//longitude and latitude 6 7
StringBuilder Lng = new StringBuilder("");
//latitude
StringBuilder Lat = new StringBuilder("");
if (Redis_MMD_Unit != null) {
Lng.append(Redis_MMD_Unit.split("\t", -1)[18]);
Lat.append(Redis_MMD_Unit.split("\t", -1)[20]);
}
personInfo.setLng(Lng.toString());
personInfo.setLat(Lat.toString());
}
return new Tuple2<>(personInfo.getPersonM(), personInfo);
}
}).filter(x -> {
if (null == x || null == x._2) {
return false;
}
if (StringUtils.isBlank(x._2.getCaptureTime()) || StringUtils.isBlank(x._2.getPersonM())) {
return false;
}
return true;
});
JavaPairDStream<String, Iterable<MacImsiDataDTO>> MRst = dataAll.groupByKey();
//Simple batch-write approach using BufferedMutator
MRst.foreachRDD(new VoidFunction<JavaPairRDD<String, Iterable<MacImsiDataDTO>>>() {
@Override
public void call(JavaPairRDD<String, Iterable<MacImsiDataDTO>> st) {
st.repartition(1).foreachPartition(new VoidFunction<Iterator<Tuple2<String, Iterable<MacImsiDataDTO>>>>() {
//obtain an HBase connection
@Override
public void call(Iterator<Tuple2<String, Iterable<MacImsiDataDTO>>> data) throws Exception {
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "IP,IP,IP");
conf.set("hbase.zookeeper.property.clientPort", "2181");
conf.setInt("hbase.rbc.timeout", 300000);
conf.setInt("hbase.client.scanner.timeout.period", 300000);
conf.setInt("hbase.client.operation.timeout", 300000);
conf.setInt("hbase.client.scanner.caching", 30000);
Connection conn = null;
BufferedMutator table2 = null;
ArrayList<Mutation> mutations = null;
try {
conn = ConnectionFactory.createConnection(conf);
table2 = conn.getBufferedMutator(TableName.valueOf("hbase_track"));
mutations = new ArrayList<>();
while (data.hasNext()) {
Tuple2<String, Iterable<MacImsiDataDTO>> next = data.next();
List<MacImsiDataDTO> list = IteratorUtils.toList(next._2.iterator());
byte[] bytes = JSON.toJSONBytes(list);
StringBuilder rowKey = new StringBuilder();
Date time = new Date();
//build the rowkey
rowKey.append(next._1()).append("_").append(DateUtil.formatDateString(time, "yyyyMMddHHmm"));
//create the Put with the rowkey
Put put = new Put(Bytes.toBytes(rowKey.toString()));
//add the column family, qualifier and value
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("val"), bytes);
mutations.add(put);
if (mutations.size() >= 10000) {
table2.mutate(mutations);
table2.flush();
mutations.clear();
}
}
table2.mutate(mutations);
table2.flush();
} catch (Exception e) {
//if an exception occurs, flip the flag so the starting offset is recorded and the batch is re-consumed, avoiding duplicated or lost data
ExceptionFlag = false;
e.printStackTrace();
throw new RuntimeException(e);
} finally {
if (table2 != null) {
table2.close();
}
if (conn != null) {
conn.close();
}
}
}
});
}
});
//Both approaches (above and below) work; each batches writes to HBase
/*MRst.foreachRDD(new VoidFunction<JavaPairRDD<String, Iterable<MacImsiDataDTO>>>() {
@Override
public void call(JavaPairRDD<String, Iterable<MacImsiDataDTO>> st) {
System.err.println("starting batch write");
st.repartition(1).foreachPartition(new VoidFunction<Iterator<Tuple2<String, Iterable<MacImsiDataDTO>>>>() {
//obtain an HBase connection
@Override
public void call(Iterator<Tuple2<String, Iterable<MacImsiDataDTO>>> data) throws Exception {
System.err.println("starting partition write");
long s = System.currentTimeMillis();
Configuration conf = HBaseConfiguration.create();
conf.set("hbase.zookeeper.quorum", "IP,IP,IP");
conf.set("hbase.zookeeper.property.clientPort", "2181");
conf.setInt("hbase.rbc.timeout", 300000);
conf.setInt("hbase.client.scanner.timeout.period", 300000);
conf.setInt("hbase.client.operation.timeout", 300000);
conf.setInt("hbase.client.scanner.caching", 30000);
Connection conn = null;
Table table = null;
ArrayList<Put> puts = null;
ArrayList<String> size = new ArrayList<>();
try {
conn = ConnectionFactory.createConnection(conf);
table = conn.getTable(TableName.valueOf("hbase_track_m"));
puts = new ArrayList<Put>();
while (data.hasNext()) {
Tuple2<String, Iterable<MacImsiDataDTO>> next = data.next();
List<MacImsiDataDTO> list = IteratorUtils.toList(next._2.iterator());
byte[] bytes = JSON.toJSONBytes(list);
StringBuilder rowKey = new StringBuilder();
Date time = new Date();
//build the rowkey
rowKey.append(next._1()).append("_").append(DateUtil.formatDateString(time, "yyyyMMddHHmm"));
//create the Put with the rowkey
Put put = new Put(Bytes.toBytes(rowKey.toString()));
//add the column family, qualifier and value
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("val"), bytes);
puts.add(put);
}
table.put(puts);
} catch (Exception e) {
//if an exception occurs, flip the flag so the starting offset is recorded and the batch is re-consumed
ExceptionFlag = false;
e.printStackTrace();
throw new RuntimeException(e);
} finally {
table.close();
conn.close();
}
}
});
}
});*/
messages.foreachRDD(new VoidFunction<JavaRDD<ConsumerRecord<String, String>>>() {
@Override
public void call(JavaRDD<ConsumerRecord<String, String>> v3) throws Exception {
//the offset ranges must be read on the driver, before any transformation such as repartition()
OffsetRange[] offsetRanges = ((HasOffsetRanges) v3.rdd()).offsetRanges();
v3.repartition(1).foreachPartition(new VoidFunction<Iterator<ConsumerRecord<String, String>>>() {
@Override
public void call(Iterator<ConsumerRecord<String, String>> st) throws Exception {
String time = com.util.DateUtil.formatDateString(new Date(), DateUtil.DATE_FORMAT_12W);
HashMap<String, String> redisMapOk = new HashMap<>();
HashMap<String, String> redisMapErro = new HashMap<>();
for (OffsetRange offsetRange : offsetRanges) {
//append the timestamp so that Redis also records when this offset was committed
//record the "OK" offset: no error occurred, so store the end of the range, since that data has already been written; the next run resumes from there
redisMapOk.put(String.valueOf(offsetRange.partition()), offsetRange.untilOffset() + "_" + time + "_OK");
//record the "ERROR" offset: the insert failed, so store the start of the range so this batch is consumed again
redisMapErro.put(String.valueOf(offsetRange.partition()), offsetRange.fromOffset() + "_" + time + "_ERROR");
}
//skip empty batches so nothing is written to Redis, reducing the load on Redis
if (st.hasNext()) {
if (ExceptionFlag) {
RedisUtil.PutAll(offset, redisMapOk, 1);
} else {
RedisUtil.PutAll(offset, redisMapErro, 1);
}
}
}
});
}
});
ssc.start();
ssc.awaitTermination();
}
}
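The code above also depends on a few in-house helpers that are not shown here: MacImsiDataDTO / ImsiDataDTO (plain data-transfer objects), DateUtil (date formatting), and RedisUtil (offset storage in Redis). As a rough sketch only, assuming a single Redis node at a placeholder address, a Jedis-based RedisUtil matching the calls used above (FlagExits, getAll, PutAll) could look like the following; a production version would normally go through a JedisPool:
package com.util;
import redis.clients.jedis.Jedis;
import java.util.Map;
public class RedisUtil {
    //placeholder address, same convention as the Kafka/HBase addresses above
    private static final String HOST = "IP";
    private static final int PORT = 6379;
    //check whether the offset hash exists in the given Redis database
    public static Boolean FlagExits(String key, int dbIndex) {
        try (Jedis jedis = new Jedis(HOST, PORT)) {
            jedis.select(dbIndex);
            return jedis.exists(key);
        }
    }
    //read every partition -> offset entry of the hash
    public static Map<String, String> getAll(String key, int dbIndex) {
        try (Jedis jedis = new Jedis(HOST, PORT)) {
            jedis.select(dbIndex);
            return jedis.hgetAll(key);
        }
    }
    //write every partition -> offset entry of the hash in one round trip
    public static void PutAll(String key, Map<String, String> values, int dbIndex) {
        try (Jedis jedis = new Jedis(HOST, PORT)) {
            jedis.select(dbIndex);
            jedis.hmset(key, values);
        }
    }
}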
The Maven coordinates (POM) for the code above are as follows:
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<java.version>1.8</java.version>
</properties>
<dependencies>
<!-- spark start -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- spark end -->
<!--hadoop hdfs start-->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>2.6.0</version>
</dependency>
<!--hadoop hdfs end-->
<!--kafka 客户端 start-->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>0.10.0.0</version>
</dependency>
<!--kafka 客户端 end-->
<!--zk 客户端 start-->
<dependency>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
<version>3.4.5</version>
</dependency>
<!--zk 客户端 end-->
<!--Hadoop公共组件包 start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.6.0</version>
</dependency>
<!--Hadoop公共组件包 end -->
<!--Hbase包 start -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.1.3</version>
</dependency>
<!--Hbase包 END -->
<!--Hadoop客户端 start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<!--Hadoop客户端 end-->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>3.0.1</version>
</dependency>
<!--Mysql start-->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
<!--Mysql end-->
<!--FastJson start-->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.23</version>
</dependency>
<!--FastJson end-->
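After packaging the project into a runnable jar (for example with the maven-shade-plugin), the job is submitted to the cluster with spark-submit. The command below is only an illustrative sketch: the jar name, master and resource sizes are placeholders that need to be adapted to the actual environment.
spark-submit \
  --class com.data.RealTime_Track \
  --master yarn \
  --deploy-mode cluster \
  --num-executors 3 \
  --executor-cores 2 \
  --executor-memory 4g \
  RealTime_Track.jar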