1. Kafka Java API Operations
1. Add Maven dependencies
<dependencies>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.10.0.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-streams</artifactId>
        <version>0.10.0.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <!-- Java compiler plugin -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>
2. Producer code
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
public class MyProducer {
    /**
     * Produce data into the Kafka topic "test"
     * @param args
     */
    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("acks", "all"); // acknowledgement level: wait for all in-sync replicas
        props.put("retries", 0); // number of retries after a failed send
        props.put("batch.size", 16384); // maximum size of one batch, in bytes
        props.put("linger.ms", 1); // how long to wait for more records before sending a batch
        props.put("buffer.memory", 33554432); // total memory available for buffering records
        // use StringSerializer for both keys and values
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // create the KafkaProducer
        Producer<String, String> kafkaProducer = new KafkaProducer<>(props);
        // send messages in a loop
        for (int i = 0; i < 100; i++) {
            Thread.sleep(1200);
            kafkaProducer.send(new ProducerRecord<String, String>("test", "mymessage" + i)); // send "mymessage" + i to the topic "test"
        }
        // close the producer
        kafkaProducer.close();
    }
}
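The send() calls above are fire-and-forget: acks and retries only help if delivery results are actually checked. Below is a minimal sketch, assuming the same broker and topic as above, of sending with a Callback to inspect the per-record result (the class name CallbackProducerSketch and the single record value are illustrative, not part of the original code).
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import java.util.Properties;
public class CallbackProducerSketch {
    public static void main(String[] args) {
        // same connection and serializer settings as MyProducer above
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("acks", "all");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        Producer<String, String> producer = new KafkaProducer<>(props);
        producer.send(new ProducerRecord<String, String>("test", "callback-message"), new Callback() {
            @Override
            public void onCompletion(RecordMetadata metadata, Exception exception) {
                if (exception != null) {
                    // the send failed after any configured retries
                    exception.printStackTrace();
                } else {
                    // the broker accepted the record; metadata tells us where it landed
                    System.out.println("sent to partition " + metadata.partition() + " at offset " + metadata.offset());
                }
            }
        });
        producer.close(); // close() waits for pending sends, so the callback fires before the program exits
    }
}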
2.1 Producer partitioning strategies
- If a partition number is specified on the record, the data is written directly to that partition.
- If no partition number is given but the record has a key, the partition is chosen by hashing the key and taking it modulo the number of partitions (the default partitioner hashes the serialized key bytes with murmur2).
- If neither a partition number nor a key is given, records are distributed round-robin across the partitions.
The producer below demonstrates each of the three strategies, plus a custom partitioner:
package it.yuge;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
public class PartitionProducer {
    /**
     * Produce data to Kafka using different partitioning strategies
     * @param args
     */
    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("acks", "all"); // acknowledgement level: wait for all in-sync replicas
        props.put("retries", 0); // number of retries after a failed send
        props.put("batch.size", 16384); // maximum size of one batch, in bytes
        props.put("linger.ms", 1); // how long to wait for more records before sending a batch
        props.put("buffer.memory", 33554432); // total memory available for buffering records
        // use StringSerializer for both keys and values
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // register the custom partitioner class
        props.put("partitioner.class", "it.yuge.MyPartition");
        // create the KafkaProducer
        Producer<String, String> kafkaProducer = new KafkaProducer<>(props);
        // send messages in a loop
        for (int i = 0; i < 100; i++) {
            // Strategy 1: no partition number and no key; the default partitioner would spread these records round-robin
            ProducerRecord<String, String> producerRecord1 = new ProducerRecord<>("mypartition", "message" + i);
            // Strategy 2: no partition number but a key; the default partitioner would hash the key modulo the number of partitions
            ProducerRecord<String, String> producerRecord2 = new ProducerRecord<>("mypartition", "mykey", "mymessage" + i);
            // Strategy 3: an explicit partition number; the record is written directly to that partition
            ProducerRecord<String, String> producerRecord3 = new ProducerRecord<>("mypartition", 0, "mykey", "mymessage" + i);
            // Strategy 4: custom partitioning; because partitioner.class is set above, any record without an explicit partition number is routed by MyPartition
            ProducerRecord<String, String> producerRecord4 = new ProducerRecord<>("mypartition", "mykey", "mymessage" + i);
            kafkaProducer.send(producerRecord1); // only producerRecord1 is sent here; substitute the others to try the other strategies
        }
        // close the producer
        kafkaProducer.close();
    }
}
Custom partitioner class
package it.yuge;
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import java.util.Map;
public class MyPartition implements Partitioner {
    // This method decides which partition a record is written to.
    // Returning a fixed value writes every record to that partition: return 0 sends everything to partition 0, return 2 would send everything to partition 2.
    @Override
    public int partition(String s, Object o, byte[] bytes, Object o1, byte[] bytes1, Cluster cluster) {
        return 0;
    }
    @Override
    public void close() {
    }
    @Override
    public void configure(Map<String, ?> map) {
    }
}
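MyPartition above pins every record to partition 0. As a sketch of a slightly more realistic partitioner (the class name HashPartitionSketch and the simple hashCode-based scheme are assumptions for illustration, not Kafka's built-in implementation, which uses murmur2 over the serialized key bytes), the key can be spread across however many partitions the topic actually has:
package it.yuge;
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import java.util.Map;
public class HashPartitionSketch implements Partitioner {
    @Override
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        // how many partitions the topic currently has
        int numPartitions = cluster.partitionsForTopic(topic).size();
        if (key == null) {
            // no key: fall back to partition 0 (the built-in partitioner would round-robin instead)
            return 0;
        }
        // keyed record: positive hash of the key modulo the partition count
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
    @Override
    public void close() {
    }
    @Override
    public void configure(Map<String, ?> configs) {
    }
}
To try it, point partitioner.class at it.yuge.HashPartitionSketch instead of it.yuge.MyPartition.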
3. Consumer code
- offset: records how far consumption has progressed, so that the next run resumes from where the previous one left off
- automatic commit
- manual commit
(1) Automatic offset commits
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.Arrays;
import java.util.Properties;
public class MyConsumer {
    /**
     * Consume with automatic offset commits
     * @param args
     */
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group"); // consumer group
        props.put("enable.auto.commit", "true"); // enable automatic offset commits
        props.put("auto.commit.interval.ms", "1000"); // interval between automatic commits
        props.put("session.timeout.ms", "30000"); // session timeout
        // use StringDeserializer for both keys and values
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        // subscribe to the topic to consume
        consumer.subscribe(Arrays.asList("test"));
        // consume the "test" topic in an endless loop
        while (true) {
            // records holds everything fetched by this poll
            ConsumerRecords<String, String> records = consumer.poll(1000); // block for up to 1000 ms waiting for data
            for (ConsumerRecord<String, String> record : records) {
                long offset = record.offset();
                String value = record.value();
                System.out.println("offset = " + offset + ", value = " + value);
            }
        }
    }
}
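The while (true) loop above never releases the consumer. One common pattern, shown here as a sketch under the assumption that the configuration stays the same (the class name ShutdownAwareConsumer and the hook wiring are illustrative additions), is to call wakeup() from a JVM shutdown hook so that a blocked poll() throws WakeupException and the consumer can be closed cleanly:
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.errors.WakeupException;
import java.util.Arrays;
import java.util.Properties;
public class ShutdownAwareConsumer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "true");
        props.put("auto.commit.interval.ms", "1000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        final KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        final Thread mainThread = Thread.currentThread();
        // wakeup() is the one consumer method that is safe to call from another thread;
        // it makes a blocked poll() throw WakeupException
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                consumer.wakeup();
                try {
                    mainThread.join(); // wait for the main loop to finish closing the consumer
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        });
        try {
            consumer.subscribe(Arrays.asList("test"));
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(1000);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println("offset = " + record.offset() + ", value = " + record.value());
                }
            }
        } catch (WakeupException e) {
            // expected during shutdown; nothing to do
        } finally {
            consumer.close();
        }
    }
}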
(2) Manual offset commits
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
public class ManualConsumer {
    /**
     * Commit offsets manually
     * @param args
     */
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "false"); // disable automatic commits; offsets are committed manually below
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        // subscribe to the "test" topic
        consumer.subscribe(Arrays.asList("test"));
        final int minBatchSize = 100; // process in batches of 100, committing the offset after each batch
        // collect the fetched ConsumerRecord objects until a full batch is available
        List<ConsumerRecord<String, String>> consumerRecordList = new ArrayList<>();
        while (true) {
            ConsumerRecords<String, String> consumerRecords1 = consumer.poll(1000);
            for (ConsumerRecord<String, String> consumerRecord : consumerRecords1) {
                consumerRecordList.add(consumerRecord); // buffer the fetched records
                if (consumerRecordList.size() >= minBatchSize) {
                    // once the buffer holds at least minBatchSize records, process them as one batch,
                    // e.g. persist them to a database
                    // insertToDb(consumerRecordList); // pseudo-code: JDBC insert
                    // then commit the offset to mark the whole batch as processed
                    // consumer.commitAsync(); // asynchronous commit: more efficient, does not block the loop
                    // synchronous commit: blocks until the broker acknowledges, which is safer but slower
                    consumer.commitSync();
                    System.out.println("commit finished");
                    // clear the buffer for the next batch
                    consumerRecordList.clear();
                }
            }
        }
    }
}
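The commented-out commitAsync() above is the non-blocking alternative mentioned in the code. A minimal sketch of how it could be used with a callback follows (the logging is illustrative; commitAsync does not retry failed commits, so production code usually pairs it with a final commitSync() on shutdown):
// Extra imports ManualConsumer would need for this variant:
// import org.apache.kafka.clients.consumer.OffsetAndMetadata;
// import org.apache.kafka.clients.consumer.OffsetCommitCallback;
// import org.apache.kafka.common.TopicPartition;
// import java.util.Map;

// Drop-in replacement for the consumer.commitSync() call inside the batch-processing branch:
consumer.commitAsync(new OffsetCommitCallback() {
    @Override
    public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
        if (exception != null) {
            // the commit failed and will not be retried; a later commit (or a closing commitSync) covers it
            System.err.println("offset commit failed: " + exception.getMessage());
        }
    }
});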
(3) Commit once after processing each partition's data (safer than the two approaches above)
package it.yuge;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import java.util.*;
public class ConsumerPartition {
    /**
     * Commit the offset for each partition as soon as that partition's data has been processed
     * @param args
     */
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "false"); // disable automatic commits; offsets are committed manually below
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<>(props);
        kafkaConsumer.subscribe(Arrays.asList("mypartition"));
        while (true) {
            // consume in an endless loop
            ConsumerRecords<String, String> consumerRecords = kafkaConsumer.poll(1000);
            // the partitions of the "mypartition" topic that this poll returned data for
            Set<TopicPartition> partitions = consumerRecords.partitions();
            // handle each partition's records separately, committing after each partition has been processed
            for (TopicPartition partition : partitions) {
                // the records belonging to this partition
                List<ConsumerRecord<String, String>> records = consumerRecords.records(partition);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record.value() + "===" + record.offset());
                }
                // the offset of the last record in this partition, i.e. how far we have consumed
                long offset = records.get(records.size() - 1).offset();
                // commit offset + 1 so the next fetch starts at the first record that has not been consumed yet
                kafkaConsumer.commitSync(Collections.singletonMap(partition, new OffsetAndMetadata(offset + 1)));
            }
        }
    }
}
(4) Consume only specific partitions of a topic
package it.yuge;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import java.util.Arrays;
import java.util.Properties;
public class ConsumerSomePartition {
    // consume only some of a topic's partitions
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "true"); // enable automatic offset commits
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // create the KafkaConsumer
        KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<String, String>(props);
        // subscribe() would consume every partition of a topic:
        // kafkaConsumer.subscribe();
        // assign() instead consumes only partitions 0 and 1 of the "mypartition" topic
        TopicPartition topicPartition1 = new TopicPartition("mypartition", 0);
        TopicPartition topicPartition2 = new TopicPartition("mypartition", 1);
        kafkaConsumer.assign(Arrays.asList(topicPartition1, topicPartition2));
        while (true) {
            ConsumerRecords<String, String> records = kafkaConsumer.poll(1000);
            // iterate over the fetched records
            for (ConsumerRecord<String, String> record : records) {
                System.out.println("value = " + record.value() + ", offset = " + record.offset());
            }
        }
    }
}
5. Kafka Streams API development
Use case:
Read the data in the test topic with the Streams API, convert every value to upper case, and write the result to the test2 topic.
(1) Create a topic
cd /export/servers/kafka_2.11-0.10.0.0/
bin/kafka-topics.sh --create --partitions 3 --replication-factor 2 --topic test2 --zookeeper node01:2181,node02:2181,node03:2181
--create: create a topic
--partitions 3: the topic has three partitions
--replication-factor 2: each partition has two replicas
--topic test2: the topic name is test2
--zookeeper: the ZooKeeper connection string
(2) Develop the Streams application
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStreamBuilder;
import java.util.Properties;
public class StreamAPI {
    // use the Streams API to read data from "test" and write it to "test2"
    public static void main(String[] args) {
        // configuration
        Properties props = new Properties();
        props.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-application"); // application id
        props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "node01:9092"); // Kafka broker address
        // default serializer/deserializer (serde) for keys and values
        props.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        props.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        // KStreamBuilder is the entry point for building the topology
        KStreamBuilder builder = new KStreamBuilder();
        // stream("test") reads from the "test" topic,
        // mapValues(...) upper-cases every value,
        // to("test2") writes the result to the "test2" topic
        builder.stream("test").mapValues(line -> line.toString().toUpperCase()).to("test2");
        // build the KafkaStreams instance from the topology and the configuration
        KafkaStreams streams = new KafkaStreams(builder, props);
        // start the streams application
        streams.start();
    }
}
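streams.start() leaves the application running until the process is killed. A small, optional addition (a sketch, not part of the original code) is to close the KafkaStreams instance from a shutdown hook at the end of main, so the stream threads and their resources are released cleanly:
// Optional: add at the end of StreamAPI.main()
// (the local variable streams is effectively final, so it can be captured here)
Runtime.getRuntime().addShutdownHook(new Thread() {
    @Override
    public void run() {
        streams.close(); // stops the stream threads and cleans up resources
    }
});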
(3) Produce data
Run the following on node01 to produce data into the test topic:
cd /export/servers/kafka_2.11-0.10.0.0
bin/kafka-console-producer.sh --broker-list node01:9092,node02:9092,node03:9092 --topic test
(4) Consume data
Run the following on node02 to consume the data in the test2 topic:
cd /export/servers/kafka_2.11-0.10.0.0
bin/kafka-console-consumer.sh --from-beginning --topic test2 --zookeeper node01:2181,node02:2181,node03:2181