-
Producer API
-
Basic API
package bigdata.producer;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.Properties;

public class CustomProducer {
    public static void main(String[] args) {
        // Kafka cluster configuration
        Properties props = new Properties();
        // Broker to connect to
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop001:9092");
        // ack mechanism
        props.put(ProducerConfig.ACKS_CONFIG, "all");
        // Number of retries
        props.put(ProducerConfig.RETRIES_CONFIG, 1);
        // Batch size
        props.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);
        // Linger time
        props.put(ProducerConfig.LINGER_MS_CONFIG, 1);
        // Buffer memory
        props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 33554432);
        // Serializers
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");

        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);
        for (int i = 0; i < 100; i++) {
            kafkaProducer.send(new ProducerRecord<>("test", "test-" + i, "test-" + i));
        }
        kafkaProducer.close();
    }
}
-
API with a callback
package bigdata.producer;

import org.apache.kafka.clients.producer.*;

import java.util.Properties;

public class CallBackProducer {
    public static void main(String[] args) {
        // Kafka cluster configuration
        Properties props = new Properties();
        // Broker to connect to
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop001:9092");
        // ack mechanism
        props.put(ProducerConfig.ACKS_CONFIG, "all");
        // Number of retries
        props.put(ProducerConfig.RETRIES_CONFIG, 1);
        // Batch size
        props.put(ProducerConfig.BATCH_SIZE_CONFIG, 16384);
        // Linger time
        props.put(ProducerConfig.LINGER_MS_CONFIG, 1);
        // Buffer memory
        props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, 33554432);
        // Serializers
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");

        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(props);
        for (int i = 0; i < 10; i++) {
            kafkaProducer.send(new ProducerRecord<>("first", "test-" + i, "test-" + i + "-kafka"), new Callback() {
                @Override
                public void onCompletion(RecordMetadata recordMetadata, Exception e) {
                    if (e == null) {
                        // Success: print the partition and offset the record landed on
                        System.out.println(recordMetadata.partition() + "----" + recordMetadata.offset());
                    } else {
                        e.printStackTrace();
                    }
                }
            });
        }
        kafkaProducer.close();
    }
}
-
Partitioner API
package bigdata.patitioner;

import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;

import java.util.Map;

public class MyPatitioner implements Partitioner {
    @Override
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        // Send every record to partition 1 (demo only)
        return 1;
    }

    @Override
    public void close() { }

    @Override
    public void configure(Map<String, ?> configs) { }
}
The custom partitioner must be registered in the producer configuration:
// Register the custom partitioner
props.put(ProducerConfig.PARTITIONER_CLASS_CONFIG, MyPatitioner.class);
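MyPatitioner above pins every record to partition 1, which is only useful for demonstration. A slightly more realistic sketch (a hypothetical class, not from the original) spreads records across partitions by hashing the key:

package bigdata.patitioner;

import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;

import java.util.Arrays;
import java.util.Map;

// Hypothetical example: route records by key hash instead of a fixed partition
public class KeyHashPartitioner implements Partitioner {
    @Override
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        int numPartitions = cluster.partitionCountForTopic(topic);
        if (keyBytes == null) {
            return 0; // keyless records all go to partition 0 in this sketch
        }
        // Mask the sign bit so the modulo result is non-negative
        return (Arrays.hashCode(keyBytes) & Integer.MAX_VALUE) % numPartitions;
    }

    @Override
    public void close() { }

    @Override
    public void configure(Map<String, ?> configs) { }
}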
-
Producer synchronous send
The producer sends data using two threads: the sender thread and the main thread.
Synchronous sending means that after a message is sent, the main thread blocks until the ack is returned.
Since the send method returns a Future object, we can achieve the same synchronous effect by calling the Future object's get method:
// get() blocks until the send completes; note that it throws InterruptedException
// and ExecutionException, which the surrounding method must catch or declare.
for (int i = 0; i < 10; i++) {
    kafkaProducer.send(new ProducerRecord<>("first", "test-" + i, "test-" + i + "-kafka"), new Callback() {
        @Override
        public void onCompletion(RecordMetadata recordMetadata, Exception e) {
            if (e == null) {
                System.out.println(recordMetadata.partition() + "----" + recordMetadata.offset());
            } else {
                e.printStackTrace();
            }
        }
    }).get();
}
-
-
Consumer API
-
Basic consumer API
package bigdata.consumer;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.util.Collections;
import java.util.Properties;

public class CustomConsumer {
    public static void main(String[] args) {
        // Configuration
        Properties props = new Properties();
        // Broker to connect to
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "hadoop001:9092");
        // Consumer group id
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "123");
        // Enable automatic offset commits
        props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, true);
        // Interval between automatic commits
        props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, 1000);
        // Deserializers
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");

        KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<>(props);
        kafkaConsumer.subscribe(Collections.singletonList("first"));
        while (true) {
            ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
            for (ConsumerRecord<String, String> record : records) {
                System.out.printf("offset = %d, key = %s, value = %s%n", record.offset(), record.key(), record.value());
            }
        }
    }
}
-
Consumer API: resetting the offset
Consumer-side reliability is easy to guarantee, because the data is persisted in Kafka, so there is no risk of losing it.
However, a consumer may fail mid-consumption (power loss, crash), and after recovering it must resume from where it left off. The consumer therefore needs to record, in real time, which offset it has consumed, so it can continue after a failure.
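The committed position can also be inspected from code. A minimal sketch, assuming the kafkaConsumer from above and partition 0 of topic "first" (requires imports for org.apache.kafka.common.TopicPartition and org.apache.kafka.clients.consumer.OffsetAndMetadata):

TopicPartition tp = new TopicPartition("first", 0);
OffsetAndMetadata committed = kafkaConsumer.committed(tp);
// null means this group has never committed an offset for the partition
System.out.println("last committed offset: " + (committed == null ? "none" : committed.offset()));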
To reset the offset (the equivalent of the --from-beginning flag when running the console consumer), add the following parameter to the consumer configuration:
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
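Note that "earliest" only takes effect when the group has no committed offset (or the committed offset no longer exists), so to re-read a topic from the beginning you typically also switch to a group id that has never consumed it. A minimal sketch (the group id value here is arbitrary):

// A fresh group id with no committed offsets, so "earliest" applies
props.put(ConsumerConfig.GROUP_ID_CONFIG, "new-group-1");
props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");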
-
Committing offsets manually
A consumer has two ways to commit the offset after reading data:
-
commitSync (synchronous commit)
Blocks the current thread until the commit succeeds. However, if the machine goes down after the data has been written but before the commit completes, the data will be consumed again, producing duplicates.
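A minimal sketch of a synchronous commit loop, assuming the consumer from above with ENABLE_AUTO_COMMIT_CONFIG set to false:

// props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, false);
while (true) {
    ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
    for (ConsumerRecord<String, String> record : records) {
        System.out.printf("offset = %d, key = %s, value = %s%n", record.offset(), record.key(), record.value());
    }
    // Blocks until the commit succeeds (retrying transient errors) or fails fatally
    kafkaConsumer.commitSync();
}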
-
commitAsync (asynchronous commit)
Does not block the current thread: the commit is issued and the consumer keeps reading. But if the offset commit succeeds before the data has been fully written, that data will be lost.
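A minimal sketch of the asynchronous variant, under the same assumption that auto-commit is disabled (requires imports for OffsetCommitCallback, TopicPartition, OffsetAndMetadata, and java.util.Map):

while (true) {
    ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
    for (ConsumerRecord<String, String> record : records) {
        System.out.printf("offset = %d, key = %s, value = %s%n", record.offset(), record.key(), record.value());
    }
    // Returns immediately; the callback reports the commit result later
    kafkaConsumer.commitAsync(new OffsetCommitCallback() {
        @Override
        public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception e) {
            if (e != null) {
                System.err.println("commit failed for " + offsets);
            }
        }
    });
}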
Therefore, the offset commit and the data write are usually placed in the same transaction so that the two stay consistent, as sketched below.
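A sketch of that pattern, storing the offset next to the data in an external store. Here saveToDbInTransaction and readOffsetFromDb are hypothetical helpers standing in for your transactional store, not Kafka APIs (requires imports for ConsumerRebalanceListener, TopicPartition, and java.util.Collection):

// Hypothetical sketch: keep the data write and the offset in one external transaction
kafkaConsumer.subscribe(Collections.singletonList("first"), new ConsumerRebalanceListener() {
    @Override
    public void onPartitionsRevoked(Collection<TopicPartition> partitions) { }

    @Override
    public void onPartitionsAssigned(Collection<TopicPartition> partitions) {
        // On (re)assignment, resume from the offset stored alongside the data
        for (TopicPartition tp : partitions) {
            kafkaConsumer.seek(tp, readOffsetFromDb(tp));
        }
    }
});
while (true) {
    ConsumerRecords<String, String> records = kafkaConsumer.poll(100);
    for (ConsumerRecord<String, String> record : records) {
        // Write the record and its next offset atomically; if either fails, both roll back
        saveToDbInTransaction(record, record.offset() + 1);
    }
}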
-
-
-
Interceptor API
-
Timestamp interceptor
package bigdata.interceptor;

import org.apache.kafka.clients.producer.ProducerInterceptor;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

import java.util.Map;

public class TimeInterceptor implements ProducerInterceptor {
    // Configuration hook
    @Override
    public void configure(Map<String, ?> map) { }

    // Processing logic: rewrite each record before it is sent
    @Override
    public ProducerRecord onSend(ProducerRecord producerRecord) {
        // Prepend the current timestamp to the record value
        return new ProducerRecord(producerRecord.topic(), producerRecord.partition(), producerRecord.timestamp(), producerRecord.key(),
                "TimeInterceptor: " + System.currentTimeMillis() + "," + producerRecord.value().toString());
    }

    @Override
    public void onAcknowledgement(RecordMetadata recordMetadata, Exception e) { }

    @Override
    public void close() { }
}
-
Interceptor that counts successful and failed sends
package bigdata.interceptor;

import org.apache.kafka.clients.producer.ProducerInterceptor;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

import java.util.Map;

public class CounterInterceptor implements ProducerInterceptor {
    private long successCounter = 0L;
    private long errorCounter = 0L;

    @Override
    public void configure(Map<String, ?> map) { }

    @Override
    public ProducerRecord onSend(ProducerRecord producerRecord) {
        // Pass records through unchanged
        return producerRecord;
    }

    @Override
    public void onAcknowledgement(RecordMetadata recordMetadata, Exception e) {
        // Count successes and failures
        if (e == null) {
            successCounter++;
        } else {
            errorCounter++;
        }
    }

    @Override
    public void close() {
        System.out.println("Successful set : " + successCounter);
        System.out.println("Failed set : " + errorCounter);
    }
}
-
Registering the interceptors in the producer
// Register the custom interceptor chain (executed in list order)
ArrayList<String> interceptors = new ArrayList<>();
interceptors.add("bigdata.interceptor.TimeInterceptor");
interceptors.add("bigdata.interceptor.CounterInterceptor");
props.put(ProducerConfig.INTERCEPTOR_CLASSES_CONFIG, interceptors);
Note that an interceptor's close() is only invoked when the producer itself is closed, so CounterInterceptor prints its counts only if kafkaProducer.close() is called.
-
-
Configuring Kafka in Flume
To connect Flume to Kafka, you only need to define the sink as a Kafka sink, and configure an interceptor when events need to be routed.
Note that the key-value pair the interceptor adds to the event headers must be "topic"-"topic_name"; Flume then routes each event into the corresponding topic based on that header's value. For example:
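A minimal sketch of such a header-setting interceptor; the class name and the routing rule here (events containing "hello" go to topic first, all others to second) are arbitrary examples:

package bigdata.flume;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;

import java.util.List;
import java.util.Map;

public class TopicHeaderInterceptor implements Interceptor {
    @Override
    public void initialize() { }

    @Override
    public Event intercept(Event event) {
        Map<String, String> headers = event.getHeaders();
        String body = new String(event.getBody());
        // The "topic" header decides which Kafka topic the sink writes to
        if (body.contains("hello")) {
            headers.put("topic", "first");
        } else {
            headers.put("topic", "second");
        }
        return event;
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        for (Event event : events) {
            intercept(event);
        }
        return events;
    }

    @Override
    public void close() { }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new TopicHeaderInterceptor();
        }

        @Override
        public void configure(Context context) { }
    }
}

It would be registered in the agent configuration via a1.sources.r1.interceptors = i1 and a1.sources.r1.interceptors.i1.type = bigdata.flume.TopicHeaderInterceptor$Builder.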
The Flume configuration file is as follows:
# define
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# source
a1.sources.r1.type = exec
a1.sources.r1.command = tail -F -c +0 /opt/app/datas/flume.log
a1.sources.r1.shell = /bin/bash -c
# sink
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.kafka.bootstrap.servers = hadoop001:9092,hadoop002:9092,hadoop003:9092
a1.sinks.k1.kafka.topic = first
a1.sinks.k1.kafka.flumeBatchSize = 20
a1.sinks.k1.kafka.producer.acks = 1
a1.sinks.k1.kafka.producer.linger.ms = 1
# channel
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# bind
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1