Spark Streaming Real-Time Online ETL
Description: written in Scala and Java.
1. Integrate with Kafka: data is produced into Kafka and fed into Spark Streaming for online ETL;
2. The ETL output is then written back into Kafka and consumed from there by downstream consumers;
3. Consumer offsets are managed in Redis so that each batch resumes from the correct position and no data is lost.
A small test-data producer is sketched right after this list.
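To exercise the pipeline end to end, the following minimal Scala sketch pushes one hand-made raw record into the source topic. The broker list is taken from the configuration used later in this document, while the topic name and the record contents are purely illustrative assumptions.
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Illustrative test producer: pushes one made-up raw record into a (hypothetical) source topic
object TestDataProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "hdp01:9092,hdp02:9092,hdp03:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // 15 comma-separated fields; the meaning of each position is given by the constants in section 4
    val raw = "0,u_0001,20190520103000,192.168.1.10,10.0.0.5,52344,8080,a,b,c,d,e,f,http://example.com/index.html,x"
    producer.send(new ProducerRecord[String, String]("source-topic", raw))
    producer.close()
  }
}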
1. Parent project dependencies (pom.xml)
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.bigdata</groupId>
    <artifactId>spark-parent</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>pom</packaging>

    <properties>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.2.2</spark.version>
        <junit.version>4.12</junit.version>
    </properties>

    <modules>
        <module>common</module>
        <module>streaming</module>
    </modules>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>junit</groupId>
                <artifactId>junit</artifactId>
                <version>${junit.version}</version>
            </dependency>
            <dependency>
                <groupId>org.scala-lang</groupId>
                <artifactId>scala-library</artifactId>
                <version>${scala.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-sql_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-streaming_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
            <dependency>
                <groupId>redis.clients</groupId>
                <artifactId>jedis</artifactId>
                <version>2.9.0</version>
            </dependency>
        </dependencies>
    </dependencyManagement>
</project>
2. Child module dependencies (the streaming module pom.xml). This module also depends on a sibling common module, which contains the RedisUnitils and DataUtils utility classes shown in section 3.
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>spark-parent</artifactId>
        <groupId>com.bigdata</groupId>
        <version>1.0-SNAPSHOT</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>
    <artifactId>streaming</artifactId>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>com.bigdata</groupId>
            <artifactId>common</artifactId>
            <version>1.0-SNAPSHOT</version>
        </dependency>
    </dependencies>
</project>
3. Utility classes and configuration files
1. Obtaining a Redis connection -------
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

import java.io.IOException;
import java.util.Properties;

public class RedisUnitils {
    // Shared connection pool, initialized once from jedis.properties on the classpath
    static JedisPool pool;

    static {
        try {
            Properties properties = new Properties();
            properties.load(RedisUnitils.class.getClassLoader().getResourceAsStream("jedis.properties"));
            String host = properties.getProperty("jedis.host");
            Integer port = Integer.valueOf(properties.getProperty("jedis.port"));
            JedisPoolConfig jedisConf = new JedisPoolConfig();
            pool = new JedisPool(jedisConf, host, port);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    // Borrow a connection from the pool
    public static Jedis getJedis() {
        return pool.getResource();
    }

    // Return the connection to the pool
    public static void releaseJedis(Jedis jedis) {
        jedis.close();
    }
}
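A minimal smoke test in Scala, assuming Redis is reachable at the host and port configured in jedis.properties below:
// Borrow a pooled connection, ping the server, and hand the connection back
val jedis = RedisUnitils.getJedis()
println(jedis.ping())   // prints "PONG" when Redis is reachable
RedisUnitils.releaseJedis(jedis)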
2. Redis configuration file (jedis.properties) ------------------------------------
# Redis host
jedis.host=****
# Redis port
jedis.port=6379
3. Date/time formatting ----------------------------------------
import org.apache.commons.lang3.time.FastDateFormat;

import java.text.ParseException;
import java.util.Date;

public class DataUtils {
    // Source format as it appears in the raw records, e.g. 20190520103000
    private static FastDateFormat df1 = FastDateFormat.getInstance("yyyyMMddHHmmss");
    // Target format used in the cleaned output
    private static FastDateFormat df2 = FastDateFormat.getInstance("yyyy-MM-dd HH:mm:ss");

    // Convert a compact timestamp string into the readable format; returns null on parse failure
    public static String formDataTime(String datatime) {
        try {
            Date srcdate = df1.parse(datatime);
            return df2.format(srcdate);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return null;
    }
}
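A quick worked example in Scala (the timestamp value is made up):
// "20190520103000" matches yyyyMMddHHmmss and is re-rendered as yyyy-MM-dd HH:mm:ss
val formatted = DataUtils.formDataTime("20190520103000")
// formatted == "2019-05-20 10:30:00"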
4. ETL (Scala code)
1. Field-index constants ------------------------------------------------
// Positions of the fields of interest within the comma-separated raw record
// (records with fewer than 15 fields are dropped during ETL)
object FidelsConstants {
  val INDEX_USER_ID = 1
  val INDEX_TIME = 2
  val INDEX_CLIENT_IP = 3
  val INDEX_SERVER_IP = 4
  val INDEX_CLIENT_PORT = 5
  val INDEX_SERVER_PORT = 6
  val INDEX_URL = 13
}
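To make the indices concrete, here is the illustrative raw record used in the test-producer sketch at the top of this document, together with the line the processETL function below would emit for it:
// 15 comma-separated fields; indices 1, 2, 3, 4, 5, 6 and 13 are the ones kept by the ETL
val raw = "0,u_0001,20190520103000,192.168.1.10,10.0.0.5,52344,8080,a,b,c,d,e,f,http://example.com/index.html,x"
// Emitted as: 2019-05-20 10:30:00##u_0001##192.168.1.10##10.0.0.5##52344##8080##http://example.com/index.html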
2. A custom Kafka producer class, marked Serializable so that a single producer created on the driver can be shipped to the executors (used by saveMethod3 below)
import java.util.Properties

import org.apache.kafka.clients.producer.KafkaProducer

// Same behaviour as KafkaProducer, but tagged Serializable so that one instance
// can be created on the driver and broadcast to the executors
class MyKafkaProducer[K, V](properties: Properties) extends KafkaProducer[K, V](properties) with Serializable {
}
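Note that a live KafkaProducer holds non-serializable state (I/O threads, buffers), so broadcasting the instance itself may fail or behave unexpectedly depending on the serializer in use. A commonly used alternative, sketched here as an assumption rather than part of the original code, is to broadcast a small wrapper that carries only the configuration and creates the producer lazily on each executor (the KafkaSink name is ours):
import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

// Broadcast-friendly wrapper: only the factory closure (and its Properties) travels over the wire;
// the actual producer is created lazily, once per executor JVM
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
  lazy val producer = createProducer()
  def send(topic: String, value: V): Unit =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {
  def apply[K, V](config: Properties): KafkaSink[K, V] =
    new KafkaSink[K, V](() => new KafkaProducer[K, V](config))
}
With this wrapper, saveMethod3 could broadcast KafkaSink[String, String](properties) and call send(target, line) on the broadcast value inside foreachPartition, instead of serializing the producer itself.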
3. Managing offsets with Redis
import scala.collection.JavaConversions._
import scala.collection.mutable

import common.RedisUnitils
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

object KafkaRedisManager {
  // Create the input stream: start from the offsets saved in Redis if any exist,
  // otherwise fall back to the "auto.offset.reset" behaviour
  def createMsg(ssc: StreamingContext, kafkaParams: Map[String, String], topics: Set[String]): InputDStream[(String, String)] = {
    var messages: InputDStream[(String, String)] = null
    val fromOffset: Map[TopicAndPartition, Long] = getFromOffset(topics, kafkaParams("group.id"))
    if (fromOffset.isEmpty) {
      messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    } else {
      val messageHandler = (msgHandler: MessageAndMetadata[String, String]) => (msgHandler.key(), msgHandler.message())
      messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffset, messageHandler)
    }
    messages
  }

  // Read the saved offsets for this consumer group.
  // Redis layout: key = topic, field = "<group>|<partition>", value = offset
  def getFromOffset(topics: Set[String], group: String): Map[TopicAndPartition, Long] = {
    val offsets = mutable.Map[TopicAndPartition, Long]()
    val jedis = RedisUnitils.getJedis()
    for (topic <- topics) {
      val map = jedis.hgetAll(topic)
      // Only pick up fields that belong to this consumer group
      for ((field, offset) <- map if field.startsWith(s"${group}|")) {
        val partition = field.substring(field.indexOf("|") + 1).toInt
        offsets.put(TopicAndPartition(topic, partition), offset.toLong)
      }
    }
    RedisUnitils.releaseJedis(jedis)
    offsets.toMap
  }

  // Persist the end offset of every processed partition once the batch has succeeded
  def storeOffset(offsetRanges: Array[OffsetRange], group: String): Unit = {
    val jedis = RedisUnitils.getJedis
    for (offsetRange <- offsetRanges) {
      val topic = offsetRange.topic
      val partition = offsetRange.partition
      val offset = offsetRange.untilOffset
      val field = s"${group}|${partition}"
      jedis.hset(topic, field, offset.toString)
    }
    RedisUnitils.releaseJedis(jedis)
  }
}
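For reference, each topic becomes a Redis hash whose fields are "<group>|<partition>" and whose values are the committed offsets. A small illustrative check (topic and group names are assumptions):
// Illustrative only: inspect the committed offsets for a hypothetical topic
val jedis = RedisUnitils.getJedis()
// Returns fields such as "etl-group|0" -> "1024", "etl-group|1" -> "998"
println(jedis.hgetAll("source-topic"))
RedisUnitils.releaseJedis(jedis)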
4. The ETL driver ---------------------------------------------
import java.util.Properties

import common.DataUtils
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka.HasOffsetRanges
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingKafkaEtl {
  def main(args: Array[String]): Unit = {
    // Keep the console readable: only warnings and above from Hadoop/Spark
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.spark_project").setLevel(Level.WARN)

    // Four positional arguments are required:
    // batch interval (seconds), comma-separated source topics, target topic, consumer group id
    if (args == null || args.length < 4) {
      System.err.println("Usage: StreamingKafkaEtl <batchInterval> <sourceTopics> <targetTopic> <group>")
      System.exit(-1)
    }

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("StreamingWithKafkaRedisOffset")
      .set("spark.streaming.kafka.maxRatePerPartition", "20") // throttle: max records per partition per second
    val Array(batchInterval, source, target, group) = args
    val ssc = new StreamingContext(conf, Seconds(batchInterval.toLong))

    val kafkaParams = Map[String, String](
      "bootstrap.servers" -> "hdp01:9092,hdp02:9092,hdp03:9092",
      "group.id" -> group,
      // Used only when Redis holds no offsets yet: start from the earliest available offset
      "auto.offset.reset" -> "smallest"
    )
    val topics = source.split(",").toSet
    val messages = KafkaRedisManager.createMsg(ssc, kafkaParams, topics)

    messages.foreachRDD((rdd, btime) => {
      if (!rdd.isEmpty()) {
        println(rdd.count())
        println("-----------------------------")
        println(s"batch time: ${btime}")
        // Transform and write out first, then commit the offsets to Redis
        processETL(rdd, target)
        KafkaRedisManager.storeOffset(rdd.asInstanceOf[HasOffsetRanges].offsetRanges, group)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
  // Clean one batch: drop malformed records, normalise the timestamp,
  // and re-emit the fields of interest joined with "##"
  def processETL(rdd: RDD[(String, String)], target: String): Unit = {
    val cleanedRdd: RDD[String] = rdd.map {
      case (key, value) => {
        val fields = value.replaceAll("<<<!>>>", "").split(",")
        if (fields.length < 15) {
          // malformed record: mark it with an empty string and filter it out below
          ""
        } else {
          val userid = fields(FidelsConstants.INDEX_USER_ID)
          val time = fields(FidelsConstants.INDEX_TIME)
          val formatTime = DataUtils.formDataTime(time)
          val clientip = fields(FidelsConstants.INDEX_CLIENT_IP)
          val serverip = fields(FidelsConstants.INDEX_SERVER_IP)
          val clientport = fields(FidelsConstants.INDEX_CLIENT_PORT)
          val serverport = fields(FidelsConstants.INDEX_SERVER_PORT)
          val url = fields(FidelsConstants.INDEX_URL)
          s"${formatTime}##${userid}##${clientip}##${serverip}##${clientport}##${serverport}##${url}"
        }
      }
    }.filter(_ != "")
    saveMethod3(cleanedRdd, target)
  }
  // Variant 1 (for comparison only): creates and closes a producer for every single record,
  // which is far too expensive for real workloads
  def saveMethod1(cleanedRdd: RDD[String], target: String): Unit = {
    cleanedRdd.foreach(line => {
      val properties = new Properties()
      properties.put("bootstrap.servers", "hdp01:9092,hdp02:9092,hdp03:9092")
      properties.put("acks", "all")
      properties.put("retries", "0")
      properties.put("batch.size", "16384")
      properties.put("buffer.memory", "33554432")
      properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      val producer = new KafkaProducer[String, String](properties)
      val record = new ProducerRecord[String, String](target, line)
      producer.send(record)
      producer.close()
    })
  }
  // Variant 2: one producer per partition per batch, created on the executors
  def saveMethod2(cleanedRdd: RDD[String], target: String): Unit = {
    cleanedRdd.foreachPartition(partition => {
      val properties = new Properties()
      properties.put("bootstrap.servers", "hdp01:9092,hdp02:9092,hdp03:9092")
      properties.put("acks", "all")
      properties.put("retries", "0")
      properties.put("batch.size", "16384")
      properties.put("buffer.memory", "33554432")
      properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
      val producer = new KafkaProducer[String, String](properties)
      partition.foreach(line => {
        val record = new ProducerRecord[String, String](target, line)
        producer.send(record)
      })
      producer.close()
    })
  }
  // Variant 3 (the one used above): a single producer is created on the driver,
  // broadcast to the executors, and reused for every record
  // (see the serialization note under MyKafkaProducer in section 4.2)
  def saveMethod3(cleanedRdd: RDD[String], target: String): Unit = {
    val properties = new Properties()
    properties.put("bootstrap.servers", "hdp01:9092,hdp02:9092,hdp03:9092")
    properties.put("acks", "all")
    properties.put("retries", "1")
    properties.put("batch.size", "16384")
    properties.put("buffer.memory", "33554432")
    properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new MyKafkaProducer[String, String](properties)
    val producerPbc = cleanedRdd.sparkContext.broadcast(producer)
    cleanedRdd.foreachPartition(partition => {
      partition.foreach(line => {
        val record = new ProducerRecord[String, String](target, line)
        producerPbc.value.send(record)
      })
    })
  }
}
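A hedged example of running the job; the jar path and topic names are placeholders, and the four positional arguments are the batch interval in seconds, the comma-separated source topics, the target topic and the consumer group id:
spark-submit \
  --class StreamingKafkaEtl \
  streaming-1.0-SNAPSHOT.jar \
  2 source-topic target-topic etl-group
Records produced into the source topic (for example by the test producer at the top of this document) should then appear on the target topic in the cleaned "##"-separated format, and the per-partition offsets should show up in Redis under the topic key.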