1. Preparation
Download the consumer.properties and producer.properties files into the project's resources directory.
1.1 producer.properties
1. Set bootstrap.servers=host01:9092,host03:9092,host04:9092
2. Set batch.size=16384 (tune the size to your needs)
3. Set buffer.memory=33554432 (tune the size to your needs)
4. Configure the serializers:
key.serializer=org.apache.kafka.common.serialization.StringSerializer
value.serializer=org.apache.kafka.common.serialization.StringSerializer
1.2 consumer.properties
1. Set bootstrap.servers=host01:9092,host03:9092,host04:9092
2. Set auto.offset.reset=earliest (choose where consumption should start)
3. Configure the deserializers:
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer
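After these edits, the two files under resources contain at least the following (the host names are placeholders for your own brokers; note that the stock consumer.properties also ships a group.id, which the consumer requires when subscribing):

# producer.properties
bootstrap.servers=host01:9092,host03:9092,host04:9092
batch.size=16384
buffer.memory=33554432
key.serializer=org.apache.kafka.common.serialization.StringSerializer
value.serializer=org.apache.kafka.common.serialization.StringSerializer

# consumer.properties
bootstrap.servers=host01:9092,host03:9092,host04:9092
group.id=test-consumer-group
auto.offset.reset=earliest
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer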
2. Encapsulating a utility class
import java.util.Properties

import org.apache.kafka.clients.consumer.KafkaConsumer
import org.apache.kafka.clients.producer.KafkaProducer

import scala.collection.JavaConverters._

/**
 * Kafka utility class.
 *
 * Provides a Producer and a Consumer.
 *
 */
object KafkaHelper {
  private val PATH_PROPERTIES_PRODUCER: String = "producer.properties"
  private val PATH_PROPERTIES_CONSUMER: String = "consumer.properties"

  // Get a producer with the default configuration
  def getProducer: KafkaProducer[String, String] =
    new KafkaProducer[String, String](loadProperties(PATH_PROPERTIES_PRODUCER))

  // Instantiate a producer with custom configuration overrides
  def getProducer(config: Map[String, String]): KafkaProducer[String, String] = {
    val properties: Properties = loadProperties(PATH_PROPERTIES_PRODUCER)
    properties.putAll(config.asJava)
    new KafkaProducer[String, String](properties)
  }

  // Get a consumer with the default configuration
  def getConsumer: KafkaConsumer[String, String] =
    new KafkaConsumer[String, String](loadProperties(PATH_PROPERTIES_CONSUMER))

  // Instantiate a consumer with custom configuration overrides
  def getConsumer(config: Map[String, String]): KafkaConsumer[String, String] = {
    val properties: Properties = loadProperties(PATH_PROPERTIES_CONSUMER)
    properties.putAll(config.asJava)
    new KafkaConsumer[String, String](properties)
  }

  /**
   * Load a properties file from the classpath by its path.
   * @param url path of the properties file
   * @return the loaded Properties
   */
  private def loadProperties(url: String): Properties = {
    val properties: Properties = new Properties()
    properties.load(KafkaHelper.getClass.getClassLoader.getResourceAsStream(url))
    properties
  }
}
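As a quick sanity check, the helper can be exercised like this (a minimal sketch; the topic name test is an assumption and must already exist, and the file is assumed to sit in the same package as KafkaHelper):

import java.util.Collections

import org.apache.kafka.clients.producer.ProducerRecord

object KafkaHelperDemo {
  def main(args: Array[String]): Unit = {
    // Send one message to the (assumed) topic "test"
    val producer = KafkaHelper.getProducer
    producer.send(new ProducerRecord[String, String]("test", "hello", "world"))
    producer.close()

    // Read messages back
    val consumer = KafkaHelper.getConsumer
    consumer.subscribe(Collections.singletonList("test"))
    val records = consumer.poll(3000)
    val iterator = records.iterator()
    while (iterator.hasNext) {
      val record = iterator.next()
      println(s"key: ${record.key}, value: ${record.value}")
    }
    consumer.close()
  }
}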
3. Offsets
* Problem:
* While a consumer is running, it consumes records in offset order.
* Each consumed record advances the offset by one, which keeps consumption ordered.
* However:
* If the consumer crashes while consuming, there are three options when it restarts:
* 1. Consume from the beginning: some records are read twice, i.e. duplicate consumption.
* 2. Consume from the latest position: some records are lost (the producer may have kept producing while the consumer was down).
* 3. Record the offset before the crash, and resume from exactly that offset on the next start.
* Automatic offset commits (Kafka manages the offset):
* Kafka commits the offset automatically and periodically; we do not have to save it by hand. Every interval, Kafka persists the current offset.
* On the next start, the program resumes reading from the committed offset.
* Drawback:
* If the consumer crashes inside a commit interval (the previous offset was committed, more records were consumed, but the next commit has not happened yet),
* the committed offset lags behind the real consumption position.
* On restart, some records may be consumed again, though far fewer than when starting from the beginning.
* Manual offset management:
* Manual offset management is covered in the SparkStreaming part.
* To manage offsets manually, first extract the offset from Kafka,
* then store it on every consumption (in the filesystem or in a database).
* On the next consumer start, read the saved offset and resume from that exact position.
* This avoids both duplicate and lost data.
Example: automatic offset management
import java.util
import java.util.Collections

import Day15_Kafka.KafkaHelper
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}

object OffsetOperation {
  def main(args: Array[String]): Unit = {
    // 1. Get a consumer
    val consumer: KafkaConsumer[String, String] = KafkaHelper.getConsumer(Map[String, String](
      ("enable.auto.commit", "true"),       // enable automatic offset commits
      ("auto.commit.interval.ms", "10000")  // commit interval, in milliseconds
    ))
    // 2. Subscribe to the topic
    consumer.subscribe(Collections.singletonList("test"))
    // 3. Poll and consume in a loop
    while (true) {
      // Fetch data
      val records: ConsumerRecords[String, String] = consumer.poll(1000)
      // Iterate over the records
      val iterator: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (iterator.hasNext) {
        val record: ConsumerRecord[String, String] = iterator.next()
        println(s"partition: ${record.partition}, offset: ${record.offset}, key: ${record.key}, value: ${record.value}")
      }
    }
  }
}
For manual management, see the final example: reading messages from Kafka into Redis (Kafka + SparkStreaming + Redis).
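That case stores offsets in Redis; for completeness, the Kafka client itself also supports manual commits without any external store. A minimal sketch (auto-commit turned off, commitSync() called only after a batch has been fully processed):

import java.util
import java.util.Collections

import Day15_Kafka.KafkaHelper
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}

object ManualCommitSketch {
  def main(args: Array[String]): Unit = {
    // Disable automatic commits so the offset only advances when we commit
    val consumer: KafkaConsumer[String, String] = KafkaHelper.getConsumer(Map[String, String](
      ("enable.auto.commit", "false")
    ))
    consumer.subscribe(Collections.singletonList("test"))
    while (true) {
      val records: ConsumerRecords[String, String] = consumer.poll(1000)
      val iterator: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (iterator.hasNext) {
        val record: ConsumerRecord[String, String] = iterator.next()
        println(s"partition: ${record.partition}, offset: ${record.offset}, value: ${record.value}")
      }
      // Commit only after the whole batch is processed:
      // a crash before this line re-reads the batch, but never loses it
      consumer.commitSync()
    }
  }
}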
4. Custom partitioners
4.1 Random partitioning
package Day15_Kafka._03_partitions

import java.util
import java.util.Collections

import Day15_Kafka.KafkaHelper
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}
import org.apache.kafka.clients.producer.{KafkaProducer, Partitioner, ProducerRecord}
import org.apache.kafka.common.Cluster
import org.junit.Test

import scala.util.Random

class PartitionTest1 {
  @Test def producer(): Unit = {
    val producer: KafkaProducer[String, String] = KafkaHelper.getProducer(Map[String, String](
      // Specify the custom partitioner
      ("partitioner.class", "Day15_Kafka._03_partitions.RandomPartitioner")
    ))
    for (i <- 100 to 150) {
      val record: ProducerRecord[String, String] = new ProducerRecord[String, String]("test", s"$i message")
      producer.send(record)
      Thread.sleep(300)
    }
  }

  @Test def consumer(): Unit = {
    println("Consumer started")
    // 1. Get a consumer
    val consumer: KafkaConsumer[String, String] = KafkaHelper.getConsumer(Map[String, String](
      ("auto.offset.reset", "latest")
    ))
    // 2. Subscribe to the topic
    consumer.subscribe(Collections.singletonList("test"))
    // 3. Poll and consume in a loop
    while (true) {
      // Fetch data
      val records: ConsumerRecords[String, String] = consumer.poll(1000)
      // Iterate over the records
      val iterator: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (iterator.hasNext) {
        val record: ConsumerRecord[String, String] = iterator.next()
        println(s"partition: ${record.partition}, offset: ${record.offset}, key: ${record.key}, value: ${record.value}")
      }
    }
  }
}

// Random partitioning
// A custom partitioner implements the interface org.apache.kafka.clients.producer.Partitioner.
// It declares three methods, but only partition carries the partitioning logic.
class RandomPartitioner extends Partitioner {
  override def partition(topic: String, key: Any, keyBytes: Array[Byte], value: Any, valueBytes: Array[Byte], cluster: Cluster): Int = {
    // The cluster parameter describes the Kafka cluster;
    // use it to look up the number of partitions of this topic
    val partitions: Integer = cluster.partitionCountForTopic(topic)
    Random.nextInt(partitions)
  }
  override def close(): Unit = {}
  override def configure(configs: util.Map[String, _]): Unit = {}
}
4.2 Hash partitioning
package Day15_Kafka._03_partitions

import java.util
import java.util.Collections

import Day15_Kafka.KafkaHelper
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}
import org.apache.kafka.clients.producer.{KafkaProducer, Partitioner, ProducerRecord}
import org.apache.kafka.common.Cluster
import org.junit.Test

import scala.util.Random

class PartitionTest2 {
  @Test def producer(): Unit = {
    val producer: KafkaProducer[String, String] = KafkaHelper.getProducer(Map[String, String](
      // Specify the custom partitioner
      ("partitioner.class", "Day15_Kafka._03_partitions.MyHashPartitioner")
    ))
    for (i <- 100 to 150) {
      val record: ProducerRecord[String, String] = new ProducerRecord[String, String]("test", s"${Random.nextInt(100)} key", s"$i message")
      producer.send(record)
      Thread.sleep(300)
    }
  }

  @Test def consumer(): Unit = {
    println("Consumer started")
    // 1. Get a consumer
    val consumer: KafkaConsumer[String, String] = KafkaHelper.getConsumer(Map[String, String](
      ("auto.offset.reset", "latest")
    ))
    // 2. Subscribe to the topic
    consumer.subscribe(Collections.singletonList("test"))
    // 3. Poll and consume in a loop
    while (true) {
      // Fetch data
      val records: ConsumerRecords[String, String] = consumer.poll(1000)
      // Iterate over the records
      val iterator: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (iterator.hasNext) {
        val record: ConsumerRecord[String, String] = iterator.next()
        println(s"partition: ${record.partition}, offset: ${record.offset}, key: ${record.key}, value: ${record.value}")
      }
    }
  }
}

// Hash partitioning
// A custom partitioner implements the interface org.apache.kafka.clients.producer.Partitioner.
// It declares three methods, but only partition carries the partitioning logic.
class MyHashPartitioner extends Partitioner {
  override def partition(topic: String, key: Any, keyBytes: Array[Byte], value: Any, valueBytes: Array[Byte], cluster: Cluster): Int = {
    // Get the total number of partitions for this topic
    val partitions: Integer = cluster.partitionCountForTopic(topic)
    // Mask off the sign bit rather than using Math.abs: Math.abs(Int.MinValue) is still negative
    if (key == null) 0 else (key.hashCode() & Int.MaxValue) % partitions
  }
  override def close(): Unit = {}
  override def configure(configs: util.Map[String, _]): Unit = {}
}
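For comparison, Kafka's built-in DefaultPartitioner hashes the serialized key bytes with murmur2 instead of calling hashCode. A minimal sketch of the same computation using the client's own utility methods:

import org.apache.kafka.common.utils.Utils

object DefaultStyleHash {
  // What DefaultPartitioner computes for a non-null key:
  // murmur2 over the serialized key bytes, sign bit cleared, modulo the partition count
  def partitionFor(keyBytes: Array[Byte], numPartitions: Int): Int =
    Utils.toPositive(Utils.murmur2(keyBytes)) % numPartitions
}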
4.3 Round-robin partitioning
package Day15_Kafka._03_partitions

import java.util
import java.util.Collections
import java.util.concurrent.atomic.AtomicInteger

import Day15_Kafka.KafkaHelper
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}
import org.apache.kafka.clients.producer.{KafkaProducer, Partitioner, ProducerRecord}
import org.apache.kafka.common.Cluster
import org.junit.Test

import scala.util.Random

class PartitionTest3 {
  @Test def producer(): Unit = {
    val producer: KafkaProducer[String, String] = KafkaHelper.getProducer(Map[String, String](
      // Specify the custom partitioner
      ("partitioner.class", "Day15_Kafka._03_partitions.RoundRobinPartitioner")
    ))
    for (i <- 100 to 150) {
      val record: ProducerRecord[String, String] = new ProducerRecord[String, String]("test", s"${Random.nextInt(100)} key", s"$i message")
      producer.send(record)
      Thread.sleep(300)
    }
  }

  @Test def consumer(): Unit = {
    println("Consumer started")
    // 1. Get a consumer
    val consumer: KafkaConsumer[String, String] = KafkaHelper.getConsumer(Map[String, String](
      ("auto.offset.reset", "latest")
    ))
    // 2. Subscribe to the topic
    consumer.subscribe(Collections.singletonList("test"))
    // 3. Poll and consume in a loop
    while (true) {
      // Fetch data
      val records: ConsumerRecords[String, String] = consumer.poll(1000)
      // Iterate over the records
      val iterator: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (iterator.hasNext) {
        val record: ConsumerRecord[String, String] = iterator.next()
        println(s"partition: ${record.partition}, offset: ${record.offset}, key: ${record.key}, value: ${record.value}")
      }
    }
  }
}

// Round-robin partitioning
// A custom partitioner implements the interface org.apache.kafka.clients.producer.Partitioner.
// It declares three methods, but only partition carries the partitioning logic.
class RoundRobinPartitioner extends Partitioner {
  // Counts how many records have been partitioned so far
  private val count: AtomicInteger = new AtomicInteger()

  // getAndIncrement: returns the current value, then increments it by 1
  override def partition(topic: String, key: Any, keyBytes: Array[Byte], value: Any, valueBytes: Array[Byte], cluster: Cluster): Int = {
    // Get the total number of partitions for this topic
    val partitions: Integer = cluster.partitionCountForTopic(topic)
    // Mask off the sign bit so the index stays valid after the counter overflows
    (count.getAndIncrement() & Int.MaxValue) % partitions
  }
  override def close(): Unit = {}
  override def configure(configs: util.Map[String, _]): Unit = {}
}
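As a side note, kafka-clients 2.4+ ships a ready-made org.apache.kafka.clients.producer.RoundRobinPartitioner implementing the same strategy; on those versions it can be set as partitioner.class directly instead of writing your own.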
4.4 Example: a grouped partitioner
package Day15_Kafka._03_partitions

import java.net.URL
import java.util
import java.util.Collections

import Day15_Kafka.KafkaHelper
import org.apache.kafka.clients.consumer.{ConsumerRecord, ConsumerRecords, KafkaConsumer}
import org.apache.kafka.clients.producer.{KafkaProducer, Partitioner, ProducerRecord}
import org.apache.kafka.common.Cluster
import org.junit.Test

import scala.io.Source

/**
 * Example: a grouped partitioner.
 * Goal:
 * Read the file access.txt and route all records belonging to the same subject (site) into the same partition.
 *
 */
class PartitionTest4 {
  @Test def producer(): Unit = {
    val producer: KafkaProducer[String, String] = KafkaHelper.getProducer(Map[String, String](
      // Specify the custom partitioner
      ("partitioner.class", "Day15_Kafka._03_partitions.GroupedPartitioner")
    ))
    val all: String = Source.fromFile("file/access.txt").mkString
    val lines: Array[String] = all.split("\n|\r\n")
    for (line <- lines) {
      val record: ProducerRecord[String, String] = new ProducerRecord[String, String]("access", line)
      producer.send(record)
      Thread.sleep(500)
    }
    producer.close()
  }

  @Test def consumer(): Unit = {
    println("Consumer started")
    // 1. Get a consumer
    val consumer: KafkaConsumer[String, String] = KafkaHelper.getConsumer
    // 2. Subscribe to the topic
    consumer.subscribe(Collections.singletonList("access"))
    // 3. Poll and consume in a loop
    while (true) {
      // Fetch data
      val records: ConsumerRecords[String, String] = consumer.poll(1000)
      // Iterate over the records
      val iterator: util.Iterator[ConsumerRecord[String, String]] = records.iterator()
      while (iterator.hasNext) {
        val record: ConsumerRecord[String, String] = iterator.next()
        println(s"partition: ${record.partition}, offset: ${record.offset}, key: ${record.key}, value: ${record.value}")
      }
    }
  }
}

class GroupedPartitioner extends Partitioner {
  private val map: Map[String, Int] = Map(
    ("java.learn.com", 0),
    ("ui.learn.com", 1),
    ("bigdata.learn.com", 2),
    ("android.learn.com", 3),
    ("h5.learn.com", 4)
  )

  // A partitioner cannot decide how many partitions the topic has;
  // it only decides which partition each record goes to.
  // Prerequisite: the topic must already be created with enough partitions.
  override def partition(topic: String, key: Any, keyBytes: Array[Byte], value: Any, valueBytes: Array[Byte], cluster: Cluster): Int = {
    // The producer writes each line of the file as-is, without splitting it,
    // so the partitioner has to parse the line to extract the host
    try {
      val url: String = value.toString.split("\\s+")(1)
      val host: String = new URL(url).getHost
      // Unknown hosts, and malformed lines, fall through to partition 5
      map.getOrElse(host, 5)
    } catch {
      case _: Exception => 5
    }
  }
  override def close(): Unit = {}
  override def configure(configs: util.Map[String, _]): Unit = {}
}
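Since the partitioner targets partitions 0 through 5, the access topic needs at least 6 partitions before the producer runs. A minimal sketch that creates it with the client's AdminClient (available in kafka-clients 0.11+; the host names and replication factor below are assumptions):

import java.util.{Collections, Properties}

import org.apache.kafka.clients.admin.{AdminClient, NewTopic}

object CreateAccessTopic {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "host01:9092,host03:9092,host04:9092")
    val admin: AdminClient = AdminClient.create(props)
    // 6 partitions: the 5 known subjects plus one catch-all; replication factor 2 is assumed
    val topic = new NewTopic("access", 6, 2.toShort)
    admin.createTopics(Collections.singletonList(topic)).all().get()
    admin.close()
  }
}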