Our cluster is fairly old, so we run Spark 2.1.3. While consuming from Kafka, the streaming job started to fall behind and data piled up. During tuning, raising the launch parameter spark.streaming.concurrentJobs above 1 immediately produced a "KafkaConsumer is not safe for multi-threaded access" exception (a ConcurrentModificationException). After a lot of digging, this turned out to be an issue in the upstream Spark source: the cached-consumer key ignores which task thread is using the consumer, so concurrent jobs can end up sharing one KafkaConsumer. The fix is to patch the source so that the thread id becomes part of the cache key. Two classes need to change; a minimal sketch of the kind of setup that triggers the problem follows, and then the two modified classes.
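For context, this is roughly the shape of the job in which the exception shows up. It is a minimal sketch, not the original application; the broker address, topic, group id, batch interval, and class name are placeholders:

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object ConcurrentJobsJob {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("kafka-backlog-demo")
      // Running more than one job at a time is what exposes the thread-safety
      // problem in the cached KafkaConsumer on the executors.
      .set("spark.streaming.concurrentJobs", "4")
    val ssc = new StreamingContext(conf, Seconds(10))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "broker1:9092",               // placeholder
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "demo-group",                          // placeholder
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq("demo-topic"), kafkaParams)
    )

    // Business logic would go here; a count is enough to drive consumption.
    stream.foreachRDD(rdd => println(rdd.count()))

    ssc.start()
    ssc.awaitTermination()
  }
}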
CachedKafkaConsumer
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka010
import java.{util => ju}
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer}
import org.apache.kafka.common.{KafkaException, TopicPartition}
import org.apache.spark.internal.Logging
/**
* Consumer of single topicpartition, intended for cached reuse.
* Underlying consumer is not threadsafe, so neither is this,
* but processing the same topicpartition and group id in multiple threads is usually bad anyway.
*/
private[kafka010]
class CachedKafkaConsumer[K, V] private(
val groupId: String,
val topic: String,
val partition: Int,
val kafkaParams: ju.Map[String, Object]) extends Logging {
assert(groupId == kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG),
"groupId used for cache key must match the groupId in kafkaParams")
val topicPartition = new TopicPartition(topic, partition)
protected val consumer = {
val c = new KafkaConsumer[K, V](kafkaParams)
val tps = new ju.ArrayList[TopicPartition]()
tps.add(topicPartition)
c.assign(tps)
c
}
// TODO if the buffer was kept around as a random-access structure,
// could possibly optimize re-calculating of an RDD in the same batch
protected var buffer = ju.Collections.emptyList[ConsumerRecord[K, V]]().iterator
protected var nextOffset = -2L
def close(): Unit = consumer.close()
/**
* Get the record for the given offset, waiting up to timeout ms if IO is necessary.
* Sequential forward access will use buffers, but random access will be horribly inefficient.
*/
def get(offset: Long, timeout: Long): ConsumerRecord[K, V] = {
logDebug(s"Get $groupId $topic $partition nextOffset $nextOffset requested $offset")
if (offset != nextOffset) {
logInfo(s"Initial fetch for $groupId $topic $partition $offset")
seek(offset)
poll(timeout)
}
if (!buffer.hasNext()) { poll(timeout) }
assert(buffer.hasNext(),
s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout")
var record = buffer.next()
if (record.offset != offset) {
logInfo(s"Buffer miss for $groupId $topic $partition $offset")
seek(offset)
poll(timeout)
assert(buffer.hasNext(),
s"Failed to get records for $groupId $topic $partition $offset after polling for $timeout")
record = buffer.next()
assert(record.offset == offset,
s"Got wrong record for $groupId $topic $partition even after seeking to offset $offset")
}
nextOffset = offset + 1
record
}
private def seek(offset: Long): Unit = {
logDebug(s"Seeking to $topicPartition $offset")
consumer.seek(topicPartition, offset)
}
private def poll(timeout: Long): Unit = {
val p = consumer.poll(timeout)
val r = p.records(topicPartition)
logDebug(s"Polled ${p.partitions()} ${r.size}")
buffer = r.iterator
}
}
private[kafka010]
object CachedKafkaConsumer extends Logging {
// private case class CacheKey(groupId: String, topic: String, partition: Int)
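// Modification: the cache key now also carries the id of the task thread, so
// tasks running on different threads never share the same underlying KafkaConsumer.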
private case class CacheKey(groupId: String, topic: String, partition: Int, threadId: Long)
// Don't want to depend on guava, don't want a cleanup thread, use a simple LinkedHashMap
private var cache: ju.LinkedHashMap[CacheKey, CachedKafkaConsumer[_, _]] = null
/** Must be called before get, once per JVM, to configure the cache. Further calls are ignored */
def init(
initialCapacity: Int,
maxCapacity: Int,
loadFactor: Float): Unit = CachedKafkaConsumer.synchronized {
if (null == cache) {
logInfo(s"Initializing cache $initialCapacity $maxCapacity $loadFactor")
cache = new ju.LinkedHashMap[CacheKey, CachedKafkaConsumer[_, _]](
initialCapacity, loadFactor, true) {
override def removeEldestEntry(
entry: ju.Map.Entry[CacheKey, CachedKafkaConsumer[_, _]]): Boolean = {
if (this.size > maxCapacity) {
try {
entry.getValue.consumer.close()
} catch {
case x: KafkaException =>
logError("Error closing oldest Kafka consumer", x)
}
true
} else {
false
}
}
}
}
}
/**
* Get a cached consumer for groupId, assigned to topic and partition.
* If matching consumer doesn't already exist, will be created using kafkaParams.
*/
def get[K, V](
groupId: String,
topic: String,
partition: Int,
threadId: Long,
kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer[K, V] =
CachedKafkaConsumer.synchronized {
// val k = CacheKey(groupId, topic, partition)
val k = CacheKey(groupId, topic, partition, threadId)
val v = cache.get(k)
if (null == v) {
logInfo(s"Cache miss for $k")
logDebug(cache.keySet.toString)
val c = new CachedKafkaConsumer[K, V](groupId, topic, partition, kafkaParams)
cache.put(k, c)
c
} else {
// any given topicpartition should have a consistent key and value type
v.asInstanceOf[CachedKafkaConsumer[K, V]]
}
}
/**
* Get a fresh new instance, unassociated with the global cache.
* Caller is responsible for closing
*/
def getUncached[K, V](
groupId: String,
topic: String,
partition: Int,
kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer[K, V] =
new CachedKafkaConsumer[K, V](groupId, topic, partition, kafkaParams)
/** remove consumer for given groupId, topic, and partition, if it exists */
// def remove(groupId: String, topic: String, partition: Int): Unit = {
// val k = CacheKey(groupId, topic, partition)
def remove(groupId: String, topic: String, partition: Int, threadId: Long): Unit = {
val k = CacheKey(groupId, topic, partition, threadId)
logInfo(s"Removing $k from cache")
val v = CachedKafkaConsumer.synchronized {
cache.remove(k)
}
if (null != v) {
v.close()
logInfo(s"Removed $k from cache")
}
}
}
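The substance of the change is that CacheKey now carries the id of the task thread that requests the consumer. The snippet below is a self-contained illustration of the effect, not part of the patch itself; it uses a local copy of the case class, since the real one is private to the kafka010 package:

object CacheKeyDemo {
  // Local stand-in for the package-private CacheKey above.
  case class CacheKey(groupId: String, topic: String, partition: Int, threadId: Long)

  def main(args: Array[String]): Unit = {
    // Two concurrent jobs can schedule tasks for the same topic-partition on the
    // same executor. With the old three-field key both tasks would look up the
    // same entry and share one KafkaConsumer, which is not thread safe.
    val taskOnThreadA = CacheKey("demo-group", "demo-topic", 0, threadId = 101L)
    val taskOnThreadB = CacheKey("demo-group", "demo-topic", 0, threadId = 102L)

    println(taskOnThreadA == taskOnThreadB) // false: each thread gets its own consumer
  }
}

One side effect to keep in mind: because keys are now per-thread, an executor can hold more cached consumers for the same topic-partition than before, so spark.streaming.kafka.consumer.cache.maxCapacity may need to be raised when running many concurrent jobs.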
KafkaRDD
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.kafka010
import java.{util => ju}
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.TopicPartition
import org.apache.spark.internal.Logging
import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.scheduler.ExecutorCacheTaskLocation
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{Partition, SparkContext, TaskContext}
import scala.collection.mutable.ArrayBuffer
private[spark] class KafkaRDD[K, V](
sc: SparkContext,
val kafkaParams: ju.Map[String, Object],
val offsetRanges: Array[OffsetRange],
val preferredHosts: ju.Map[TopicPartition, String],
useConsumerCache: Boolean
) extends RDD[ConsumerRecord[K, V]](sc, Nil) with Logging with HasOffsetRanges {
assert("none" ==
kafkaParams.get(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG).asInstanceOf[String],
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG +
" must be set to none for executor kafka params, else messages may not match offsetRange")
assert(false ==
kafkaParams.get(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG).asInstanceOf[Boolean],
ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG +
" must be set to false for executor kafka params, else offsets may commit before processing")
// TODO is it necessary to have separate configs for initial poll time vs ongoing poll time?
private val pollTimeout = conf.getLong("spark.streaming.kafka.consumer.poll.ms",
conf.getTimeAsMs("spark.network.timeout", "120s"))
private val cacheInitialCapacity =
conf.getInt("spark.streaming.kafka.consumer.cache.initialCapacity", 16)
private val cacheMaxCapacity =
conf.getInt("spark.streaming.kafka.consumer.cache.maxCapacity", 64)
private val cacheLoadFactor =
conf.getDouble("spark.streaming.kafka.consumer.cache.loadFactor", 0.75).toFloat
override def persist(newLevel: StorageLevel): this.type = {
logError("Kafka ConsumerRecord is not serializable. " +
"Use .map to extract fields before calling .persist or .window")
super.persist(newLevel)
}
override def getPartitions: Array[Partition] = {
offsetRanges.zipWithIndex.map { case (o, i) =>
new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset)
}.toArray
}
override def count(): Long = offsetRanges.map(_.count).sum
override def countApprox(
timeout: Long,
confidence: Double = 0.95
): PartialResult[BoundedDouble] = {
val c = count
new PartialResult(new BoundedDouble(c, 1.0, c, c), true)
}
override def isEmpty(): Boolean = count == 0L
override def take(num: Int): Array[ConsumerRecord[K, V]] = {
val nonEmptyPartitions = this.partitions
.map(_.asInstanceOf[KafkaRDDPartition])
.filter(_.count > 0)
if (num < 1 || nonEmptyPartitions.isEmpty) {
return new Array[ConsumerRecord[K, V]](0)
}
// Determine in advance how many messages need to be taken from each partition
val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) =>
val remain = num - result.values.sum
if (remain > 0) {
val taken = Math.min(remain, part.count)
result + (part.index -> taken.toInt)
} else {
result
}
}
val buf = new ArrayBuffer[ConsumerRecord[K, V]]
val res = context.runJob(
this,
(tc: TaskContext, it: Iterator[ConsumerRecord[K, V]]) =>
it.take(parts(tc.partitionId)).toArray, parts.keys.toArray
)
res.foreach(buf ++= _)
buf.toArray
}
private def executors(): Array[ExecutorCacheTaskLocation] = {
val bm = sparkContext.env.blockManager
bm.master.getPeers(bm.blockManagerId).toArray
.map(x => ExecutorCacheTaskLocation(x.host, x.executorId))
.sortWith(compareExecutors)
}
protected[kafka010] def compareExecutors(
a: ExecutorCacheTaskLocation,
b: ExecutorCacheTaskLocation): Boolean =
if (a.host == b.host) {
a.executorId > b.executorId
} else {
a.host > b.host
}
/**
* Non-negative modulus, from java 8 math
*/
private def floorMod(a: Int, b: Int): Int = ((a % b) + b) % b
override def getPreferredLocations(thePart: Partition): Seq[String] = {
// The intention is best-effort consistent executor for a given topicpartition,
// so that caching consumers can be effective.
// TODO what about hosts specified by ip vs name
val part = thePart.asInstanceOf[KafkaRDDPartition]
val allExecs = executors()
val tp = part.topicPartition
val prefHost = preferredHosts.get(tp)
val prefExecs = if (null == prefHost) allExecs else allExecs.filter(_.host == prefHost)
val execs = if (prefExecs.isEmpty) allExecs else prefExecs
if (execs.isEmpty) {
Seq()
} else {
// execs is sorted, tp.hashCode depends only on topic and partition, so consistent index
val index = this.floorMod(tp.hashCode, execs.length)
val chosen = execs(index)
Seq(chosen.toString)
}
}
private def errBeginAfterEnd(part: KafkaRDDPartition): String =
s"Beginning offset ${part.fromOffset} is after the ending offset ${part.untilOffset} " +
s"for topic ${part.topic} partition ${part.partition}. " +
"You either provided an invalid fromOffset, or the Kafka topic has been damaged"
override def compute(thePart: Partition, context: TaskContext): Iterator[ConsumerRecord[K, V]] = {
val part = thePart.asInstanceOf[KafkaRDDPartition]
assert(part.fromOffset <= part.untilOffset, errBeginAfterEnd(part))
if (part.fromOffset == part.untilOffset) {
logInfo(s"Beginning offset ${part.fromOffset} is the same as ending offset " +
s"skipping ${part.topic} ${part.partition}")
Iterator.empty
} else {
new KafkaRDDIterator(part, context)
}
}
/**
* An iterator that fetches messages directly from Kafka for the offsets in partition.
* Uses a cached consumer where possible to take advantage of prefetching
*/
private class KafkaRDDIterator(
part: KafkaRDDPartition,
context: TaskContext) extends Iterator[ConsumerRecord[K, V]] {
logInfo(s"Computing topic ${part.topic}, partition ${part.partition} " +
s"offsets ${part.fromOffset} -> ${part.untilOffset}")
val threadId = Thread.currentThread().getId
val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String]
context.addTaskCompletionListener{ context => closeIfNeeded() }
val consumer = if (useConsumerCache) {
CachedKafkaConsumer.init(cacheInitialCapacity, cacheMaxCapacity, cacheLoadFactor)
if (context.attemptNumber >= 1) {
// just in case the prior attempt failures were cache related
CachedKafkaConsumer.remove(groupId, part.topic, part.partition, threadId)
}
CachedKafkaConsumer.get[K, V](groupId, part.topic, part.partition, threadId, kafkaParams)
} else {
CachedKafkaConsumer.getUncached[K, V](groupId, part.topic, part.partition, kafkaParams)
}
var requestOffset = part.fromOffset
def closeIfNeeded(): Unit = {
if (!useConsumerCache && consumer != null) {
consumer.close
}
}
override def hasNext(): Boolean = requestOffset < part.untilOffset
override def next(): ConsumerRecord[K, V] = {
assert(hasNext(), "Can't call getNext() once untilOffset has been reached")
val r = consumer.get(requestOffset, pollTimeout)
requestOffset += 1
r
}
}
}
PR: https://github.com/apache/spark/pull/19819
After modifying the source, I created a package with the same path in my own project and copied the two classes above into it with the changes applied (see the layout sketch below). The pom no longer declares the spark-streaming-kafka-0-10_2.11 dependency itself; instead it declares the dependencies that spark-streaming-kafka-0-10_2.11 needs, as follows.
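Assuming a standard Maven/Scala project layout, the copied files end up at paths like these (illustrative, matching the package declaration above):

src/main/scala/org/apache/spark/streaming/kafka010/CachedKafkaConsumer.scala
src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala

Because the package path matches and the original artifact is no longer on the classpath, the application jar supplies the modified classes.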
Dependencies required by spark-streaming-kafka-0-10_2.11
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.1.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.1.3</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<version>0.10.0.1</version>
<exclusions>
<exclusion>
<groupId>com.sun.jmx</groupId>
<artifactId>jmxri</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jdmk</groupId>
<artifactId>jmxtools</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.jopt-simple</groupId>
<artifactId>jopt-simple</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>net.sf.jopt-simple</groupId>
<artifactId>jopt-simple</artifactId>
<version>3.2</version>
</dependency>
<dependency>
<groupId>org.scalacheck</groupId>
<artifactId>scalacheck_2.11</artifactId>
<version>1.14.3</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-tags_2.11</artifactId>
<version>2.1.3</version>
</dependency>
</dependencies>
I then rebuilt the jar and tuned spark.streaming.concurrentJobs, and the Kafka consumption backlog was resolved.
Reference blog: https://blog.csdn.net/u011707542/article/details/90734378