Kafka is a distributed publish-subscribe messaging system, which is to say a message queue, and its advantage is that data is persisted to disk (Kafka itself is not the focus of this article, so I will not go into it further). Kafka has quite a few use cases, for example as a buffer queue between asynchronous systems. In many scenarios we also see the following design: write some data (logs, for example) to Kafka for durable storage, have another service consume the data from Kafka, run business-level analysis on it, and then write the results to HBase or HDFS. Because this design is so common, big-data stream-processing frameworks such as Storm already support seamless integration with Kafka, and Spark, as a latecomer, likewise provides native Kafka support.
This article walks through using Spark Streaming with Kafka in practice. First, the project's Maven pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>sprakStream</groupId>
<artifactId>sprakStream</artifactId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
<!-- dependency verified -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<!-- dependency verified -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<!-- dependency verified -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<!-- dependency verified -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.0.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-server</artifactId>
<version>1.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.8.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
<version>9.4-1202-jdbc4</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>net.sf.json-lib</groupId>
<artifactId>json-lib</artifactId>
<version>2.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-pool2</artifactId>
<version>2.2</version>
</dependency>
</dependencies>
<build>
<sourceDirectory>${basedir}/src/main/scala</sourceDirectory>
<testSourceDirectory>${basedir}/src/test/scala</testSourceDirectory>
<resources>
<resource>
<directory>${basedir}/src/main/resources</directory>
</resource>
</resources>
<testResources>
<testResource>
<directory>${basedir}/src/test/resources</directory>
</testResource>
</testResources>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.2</version>
<configuration>
<createDependencyReducedPom>true</createDependencyReducedPom>
</configuration>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<artifactSet>
<includes>
<include>*:*</include>
</includes>
</artifactSet>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<transformer
implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer
implementation="org.apache.maven.plugins.shade.resource.DontIncludeResourceTransformer">
<resource>log4j.properties</resource>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
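A few notes on this pom.xml: the dependencies marked provided are expected to be on the classpath at runtime (spark-submit supplies the Spark jars; the HBase, Jedis and PostgreSQL jars would need to be added via --jars or the executor classpath), while the shade plugin bundles the remaining dependencies into a single jar, stripping jar signature files, dropping log4j.properties and concatenating reference.conf files. Assuming the usual Maven workflow, the job would be built with mvn clean package and submitted with spark-submit --class com.sprakStream.demo.KafkaExampleOffset pointing at the shaded jar; adjust this to your own deployment.

Next, the Scala code: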
package com.sprakStream.demo
import java.util.Properties
import java.util.regex.Matcher
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.ConsumerStrategies
import org.apache.spark.streaming.kafka010.LocationStrategies
import org.apache.spark.streaming.kafka010.HasOffsetRanges
import org.apache.spark.streaming.kafka010.OffsetRange
import org.apache.spark.TaskContext
import com.sprakStream.util.AppConstant
import com.sprakStream.bean.IpMapper
import com.sprakStream.util.CommUtil
import kafka.common.TopicAndPartition
import com.logger.util.LoggerUtil
object KafkaExampleOffset {
  def main(args: Array[String]): Unit = {
    //val conf = new SparkConf()
    //val sc = new SparkContext()
    // Home environment:
    // System.setProperty("spark.sql.warehouse.dir", "D:\\tools\\spark-2.0.0-bin-hadoop2.6");
    // System.setProperty("hadoop.home.dir", "D:\\tools\\hadoop-2.6.0");
    // Office environment:
    System.setProperty("spark.sql.warehouse.dir", "D:\\DevelopTool\\spark-2.0.0-bin-hadoop2.6")
    println("success to Init...")

    // JDBC connection info (not used further in this example)
    val url = "jdbc:postgresql://172.16.12.190:5432/dataex_tmp"
    val prop = new Properties()
    prop.put("user", "postgres")
    prop.put("password", "issing")

    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val ssc = new StreamingContext(conf, Seconds(2))
    val sparkSession = SparkSession.builder().config(conf).getOrCreate()

    val util = Utilities
    util.setupLogging()
    // Construct a regular expression (regex) to extract fields from raw Apache log lines
    val pattern = util.apacheLogPattern()

    // hostname:port of the Kafka brokers, not ZooKeeper
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> AppConstant.KAFKA_HOST,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "example",
      "enable.auto.commit" -> (false: java.lang.Boolean)
      // "auto.offset.reset" -> "latest"   // reset to the latest offset when no previous offset exists (default)
      // "auto.offset.reset" -> "earliest" // reset to the earliest offset
      // "auto.offset.reset" -> "none"     // throw an exception if no previous offset is found for the consumer group
    )
    // List of topics you want to listen for from Kafka
    val topics = List(AppConstant.KAFKA_TOPIC).toSet

    /**
     * Kafka offsets
     *
     * Start reading Kafka data from explicitly specified positions.
     * Note: because of the direct stream's exactly-once mechanism, each record is consumed only once.
     * Once starting offsets are specified, reading resumes from where the previous run of the
     * streaming application stopped.
     */
    // Observed in testing: a TopicPartition is consumed only if it has been put into the offsets map,
    // otherwise it is skipped; the consumer works through the data partition by partition.
    // e.g. 2L: the L suffix marks a Long literal; 2 is the offset of the first message to consume.
    val offsets = Map[TopicPartition, Long](
      new TopicPartition(AppConstant.KAFKA_TOPIC, 0) -> 5000L,
      new TopicPartition(AppConstant.KAFKA_TOPIC, 1) -> 5000L,
      new TopicPartition(AppConstant.KAFKA_TOPIC, 2) -> 5000L)

    // Obtain the Kafka data with KafkaUtils.createDirectStream(...); the Kafka settings are given by kafkaParams
    val line = KafkaUtils.createDirectStream(
      ssc,
      PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, offsets))
    // Process the data
    line.foreachRDD(mess => {
      // Offset ranges covered by this batch, one per Kafka partition
      val offsetsList = mess.asInstanceOf[HasOffsetRanges].offsetRanges
      mess.foreachPartition(lines => {
        // Offset range of the partition handled by this task (looked up once per partition, not once per record)
        val o: OffsetRange = offsetsList(TaskContext.get.partitionId)
        println("++++++++++++++++ offset range of this partition ++++++++++++++++")
        println("--topic:" + o.topic + " --partition:" + o.partition +
          " --fromOffset:" + o.fromOffset + " --untilOffset:" + o.untilOffset)
        lines.foreach(line => {
          // Consume the record
          println("The kafka line is " + line)
          LoggerUtil.loggerToBuffer(line.toString())
        })
      })
    })
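    // Note (not in the original example): the code above only prints the offset ranges. If progress
    // needs to survive a restart, one option in the 0-10 integration is to commit the ranges back to
    // Kafka at the end of the foreachRDD body, roughly:
    //   line.asInstanceOf[CanCommitOffsets].commitAsync(offsetsList)
    // (this requires importing org.apache.spark.streaming.kafka010.CanCommitOffsets); another option
    // is to store fromOffset/untilOffset in an external store, as sketched after this listing.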
    // Kick it off
    ssc.checkpoint("/user/root/spark/checkpoint")
    ssc.start()
    ssc.awaitTermination()
    println("KafkaExample - finished.................................")
  }
}
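// The helper classes referenced above (Utilities, AppConstant, LoggerUtil, IpMapper, CommUtil) are
// project-local and are not shown in the original post. To keep the listing self-contained, here is a
// minimal sketch (an assumption, not the original implementation) of the Utilities object covering the
// two calls used in main():
object Utilities {
  import java.util.regex.Pattern
  import org.apache.log4j.{ Level, Logger }

  /** Quiet the noisy INFO logging so the streaming output stays readable. */
  def setupLogging(): Unit = {
    Logger.getRootLogger.setLevel(Level.ERROR)
  }

  /** Regex that splits an Apache combined-format access log line into its fields. */
  def apacheLogPattern(): Pattern = {
    val ip = "(\\S+) "
    val client = "(\\S+) "
    val user = "(\\S+) "
    val dateTime = "(\\[.+?\\]) "
    val request = "\"(.*?)\" "
    val status = "(\\d{3}) "
    val bytes = "(\\S+) "
    val referer = "\"(.*?)\" "
    val agent = "\"(.*?)\""
    Pattern.compile(ip + client + user + dateTime + request + status + bytes + referer + agent)
  }
}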
object SQLContextSingleton2 {
  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
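The example above fixes the starting offset at 5000 for every partition, which is fine for a demo but not for a restartable job. Since the pom.xml already pulls in Jedis, one common variant is to keep the latest untilOffset of each partition in Redis and rebuild the offsets map from it on startup. The sketch below only illustrates that idea and is not part of the original code; the Redis host, key names and the RedisOffsetStore helper are assumptions.

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.Jedis
import scala.collection.JavaConverters._

object RedisOffsetStore {
  // Assumed Redis location and key layout; adjust to the real environment.
  private val RedisHost = "localhost"
  private val RedisPort = 6379
  private def key(topic: String) = s"kafka:offsets:$topic"

  /** Save the end of each processed offset range; call this after a batch has been handled. */
  def save(ranges: Array[OffsetRange]): Unit = {
    val jedis = new Jedis(RedisHost, RedisPort)
    try {
      ranges.foreach { r =>
        jedis.hset(key(r.topic), r.partition.toString, r.untilOffset.toString)
      }
    } finally {
      jedis.close()
    }
  }

  /** Rebuild the Map[TopicPartition, Long] expected by ConsumerStrategies.Subscribe. */
  def load(topic: String): Map[TopicPartition, Long] = {
    val jedis = new Jedis(RedisHost, RedisPort)
    try {
      jedis.hgetAll(key(topic)).asScala.map {
        case (partition, offset) => new TopicPartition(topic, partition.toInt) -> offset.toLong
      }.toMap
    } finally {
      jedis.close()
    }
  }
}

With something like this in place, the hard-coded offsets map in main() could be replaced by RedisOffsetStore.load(AppConstant.KAFKA_TOPIC), and RedisOffsetStore.save(offsetsList) could be called at the end of the foreachRDD body once the batch has been processed.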
Note: the code above has passed testing and can be modified as needed. If you have any questions, please leave a comment!