kafka保存偏移量到zookeeper(实测可用)

最新推荐文章于 2024-06-22 11:09:29 发布

滚滚长江东逝矿泉水

最新推荐文章于 2024-06-22 11:09:29 发布

阅读量1.4k

点赞数

文章标签： spark hadoop kafka zookeeper scala

本文链接：https://blog.csdn.net/weixin_42896013/article/details/99854105

版权

kafka保存偏移量到zookeeper(实测可用)

1.pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
     xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
     
<modelVersion>4.0.0</modelVersion>
<groupId>1612</groupId>
<artifactId>spark</artifactId>
<version>2.0</version>
<properties>
    <!-- Java source/target level. The maven-compiler-plugin configuration in <build>
         explicitly sets 1.8, so these properties use the same value to avoid two
         conflicting declarations of the Java level. -->
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <encoding>UTF-8</encoding>
    <!-- Versions referenced by the dependency declarations below. -->
    <scala.version>2.11.8</scala.version>
    <spark.version>2.1.0</spark.version>
    <hadoop.version>2.6.4</hadoop.version>
</properties>
<dependencies>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/junit/junit -->
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.25</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.json/json -->
    <dependency>
        <groupId>org.json</groupId>
        <artifactId>json</artifactId>
        <version>20090211</version>
    </dependency>
    <!-- Scala standard library; must match the _2.11 suffix of the artifacts below. -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.35</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>net.sf.json-lib</groupId>
        <artifactId>json-lib</artifactId>
        <version>2.4</version>
        <classifier>jdk15</classifier><!-- selects the JDK-specific build of json-lib -->
    </dependency>
    <!-- Kafka 0.8 direct-stream integration (KafkaUtils, HasOffsetRanges, OffsetRange). -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kafka/kafka -->
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>0.8.2.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming-flume_2.11</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <!-- Newly added. Uses the _2.11 build: a _2.10 artifact is binary-incompatible
         with the Scala 2.11 library and Spark artifacts declared above. -->
    <dependency>
        <groupId>org.scalatest</groupId>
        <artifactId>scalatest_2.11</artifactId>
        <version>2.2.6</version>
    </dependency>
</dependencies>
<build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
        <!-- Compiles the Scala sources (main and test). -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                    <configuration>
                        <!-- "-make:transitive" is intentionally not passed: scalac 2.11
                             rejects it as a bad option and the compilation fails. -->
                        <args>
                            <arg>-dependencyfile</arg>
                            <arg>${project.build.directory}/.scala_dependencies</arg>
                        </args>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <!-- Builds a single runnable jar containing all dependencies. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                    <configuration>
                        <filters>
                            <!-- Strip signature files; stale signatures from merged jars
                                 make the shaded jar fail verification at startup. -->
                            <filter>
                                <artifact>*:*</artifact>
                                <excludes>
                                    <exclude>META-INF/*.SF</exclude>
                                    <exclude>META-INF/*.DSA</exclude>
                                    <exclude>META-INF/*.RSA</exclude>
                                </excludes>
                            </filter>
                        </filters>
                        <transformers>
                            <transformer
                                    implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <!-- Fully-qualified entry point of this project's main object. -->
                                <mainClass>spark1.gx.kafkazookeeper</mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
            </configuration>
        </plugin>
    </plugins>
</build>
</project>

2.工具类

package spark1.gx

import java.text.SimpleDateFormat
import java.util.{Calendar, Date}
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.{ZKGroupTopicDirs, ZkUtils}
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils, OffsetRange}

/** Utility for creating a Kafka direct stream whose offsets are kept in ZooKeeper. */
object othersUtil {

  /**
   * Creates a Kafka direct stream for topic "apkmsg" (consumer group "DirectAndZk")
   * and saves the consumed offsets to ZooKeeper after every batch.
   *
   * If the group's offset directory in ZooKeeper already has children, one stored
   * offset is read per child (nodes named `0` to `n-1`) and the stream starts from
   * those offsets. Otherwise the stream is created from the topic set and
   * `auto.offset.reset` (set to "largest") applies.
   *
   * NOTE(review): the offsets are written in an output operation registered inside
   * this method, i.e. before any output operation the caller adds to the returned
   * stream. A failure while processing a batch may therefore leave that batch
   * skipped after a restart; confirm this is acceptable for the use case.
   *
   * @param ssc streaming context the Kafka stream is attached to
   * @return a stream containing only the message values (keys are dropped)
   */
  def kafkaAndZookeeper(ssc: StreamingContext): DStream[String] = {
    val group = "DirectAndZk"
    val topic = "apkmsg"
    val brokerList = "hadoop1:9092"
    val zkQuorum = "hadoop1:2181,hadoop2:2181,hadoop3:2181"

    // ZooKeeper directory under which one node per partition holds the saved offset.
    val topicDirs = new ZKGroupTopicDirs(group, topic)
    val zkTopicPath = topicDirs.consumerOffsetDir

    val kafkaParams = Map(
      "metadata.broker.list" -> brokerList,
      "group.id" -> group,
      "auto.offset.reset" -> kafka.api.OffsetRequest.LargestTimeString
    )

    val zkClient = new ZkClient(zkQuorum)
    val savedPartitions = zkClient.countChildren(zkTopicPath)

    val kafkaStream: InputDStream[(String, String)] =
      if (savedPartitions > 0) {
        // Resume: read the stored offset of every partition node 0 until savedPartitions.
        val fromOffsets: Map[TopicAndPartition, Long] =
          (0 until savedPartitions).map { partition =>
            val storedOffset = zkClient.readData[String](s"$zkTopicPath/$partition")
            TopicAndPartition(topic, partition) -> storedOffset.toLong
          }.toMap
        val messageHandler =
          (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message())
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](
          ssc, kafkaParams, fromOffsets, messageHandler)
      } else {
        // Nothing saved yet: let Kafka decide the start position via auto.offset.reset.
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
          ssc, kafkaParams, Set(topic))
      }

    // The offset ranges must be taken from the RDD produced directly by the Kafka
    // stream, hence this runs on kafkaStream itself and not on a derived stream.
    kafkaStream.foreachRDD { kafkaRDD =>
      val offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
      offsetRanges.foreach { range =>
        val zkPath = s"$zkTopicPath/${range.partition}"
        ZkUtils.updatePersistentPath(zkClient, zkPath, range.untilOffset.toString)
      }
    }

    kafkaStream.map(_._2)
  }
}

3.主类

package spark1.gx

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/** Word-count driver reading from the Kafka stream provided by `othersUtil`. */
object kafkazookeeper {

  /**
   * Starts a local streaming job with 5-second batches that splits every message
   * on single spaces, counts the words of each batch and prints the counts.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Keep the console readable: only warnings and above from "org" loggers.
    Logger.getLogger("org").setLevel(Level.WARN)

    val sparkConf = new SparkConf()
      .setAppName("KafkaDirectWordCount")
      .setMaster("local[6]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    val messages = othersUtil.kafkaAndZookeeper(ssc)
    val wordCounts = messages
      .flatMap(line => line.split(" "))
      .map(word => (word, 1))
      .reduceByKey((left, right) => left + right)

    // Print each batch's counts between two separator lines.
    wordCounts.foreachRDD { batch =>
      val separator = "****************************************"
      println(separator)
      println(batch.collect().mkString("\n"))
      println(separator)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}

手动向 Kafka 输入的源数据

369 963 666
5 5 5
6 6 6
0 0 0
5 5 5
6 6 6
0 0 0
zw zw zw zw
zw zw zw zw
55 55 55

结果

19/08/20 14:53:46 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
19/08/20 14:54:00 INFO utils.VerifiableProperties: Verifying properties
19/08/20 14:54:00 INFO utils.VerifiableProperties: Property auto.offset.reset is overridden to largest
19/08/20 14:54:00 INFO utils.VerifiableProperties: Property group.id is overridden to DirectAndZk
19/08/20 14:54:00 INFO utils.VerifiableProperties: Property zookeeper.connect is overridden to

(0,6)
(5,6)
(6,6)
(zw,8)
(55,3)
(963,1)
(666,1)
(369,1)