1. Kafka Java API Operations
1. Add Maven dependencies
<dependencies>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.10.0.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-streams</artifactId>
        <version>0.10.0.0</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <!-- Java compiler plugin -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.2</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>
2. Producer code
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
public class MyProducer {
    /**
     * Produce data into the Kafka topic "test"
     * @param args
     */
    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("acks", "all"); // acknowledgement level: wait for all in-sync replicas
        props.put("retries", 0); // number of retries after a failed send
        props.put("batch.size", 16384); // maximum size of one batch, in bytes
        props.put("linger.ms", 1); // how long to wait for more records before sending a batch
        props.put("buffer.memory", 33554432); // total memory available for buffering records
        // use StringSerializer for both keys and values
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // create the KafkaProducer
        Producer<String, String> kafkaProducer = new KafkaProducer<>(props);
        // send messages in a loop
        for (int i = 0; i < 100; i++) {
            Thread.sleep(1200);
            kafkaProducer.send(new ProducerRecord<String, String>("test", "mymessage" + i)); // send "mymessage" + i to the topic "test"
        }
        // close the producer
        kafkaProducer.close();
    }
}
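The send() calls above are fire-and-forget: acks and retries only help if delivery results are actually checked. Below is a minimal sketch, assuming the same broker and topic as above, of sending with a Callback to inspect the per-record result (the class name CallbackProducerSketch and the single record value are illustrative, not part of the original code).
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import java.util.Properties;
public class CallbackProducerSketch {
    public static void main(String[] args) {
        // same connection and serializer settings as MyProducer above
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("acks", "all");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        Producer<String, String> producer = new KafkaProducer<>(props);
        producer.send(new ProducerRecord<String, String>("test", "callback-message"), new Callback() {
            @Override
            public void onCompletion(RecordMetadata metadata, Exception exception) {
                if (exception != null) {
                    // the send failed after any configured retries
                    exception.printStackTrace();
                } else {
                    // the broker accepted the record; metadata tells us where it landed
                    System.out.println("sent to partition " + metadata.partition() + " at offset " + metadata.offset());
                }
            }
        });
        producer.close(); // close() waits for pending sends, so the callback fires before the program exits
    }
}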
2.1 Producer partitioning strategies
- If a partition number is specified on the record, the data is written directly to that partition.
- If no partition number is given but the record has a key, the partition is chosen by hashing the key and taking it modulo the number of partitions (the default partitioner hashes the serialized key bytes with murmur2).
- If neither a partition number nor a key is given, records are distributed round-robin across the partitions.
The producer below demonstrates each of the three strategies, plus a custom partitioner:
package it.yuge;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
public class PartitionProducer {
    /**
     * Produce data to Kafka using different partitioning strategies
     * @param args
     */
    public static void main(String[] args) throws InterruptedException {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("acks", "all"); // acknowledgement level: wait for all in-sync replicas
        props.put("retries", 0); // number of retries after a failed send
        props.put("batch.size", 16384); // maximum size of one batch, in bytes
        props.put("linger.ms", 1); // how long to wait for more records before sending a batch
        props.put("buffer.memory", 33554432); // total memory available for buffering records
        // use StringSerializer for both keys and values
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // register the custom partitioner class
        props.put("partitioner.class", "it.yuge.MyPartition");
        // create the KafkaProducer
        Producer<String, String> kafkaProducer = new KafkaProducer<>(props);
        // send messages in a loop
        for (int i = 0; i < 100; i++) {
            // Strategy 1: no partition number and no key; the default partitioner would spread these records round-robin
            ProducerRecord<String, String> producerRecord1 = new ProducerRecord<>("mypartition", "message" + i);
            // Strategy 2: no partition number but a key; the default partitioner would hash the key modulo the number of partitions
            ProducerRecord<String, String> producerRecord2 = new ProducerRecord<>("mypartition", "mykey", "mymessage" + i);
            // Strategy 3: an explicit partition number; the record is written directly to that partition
            ProducerRecord<String, String> producerRecord3 = new ProducerRecord<>("mypartition", 0, "mykey", "mymessage" + i);
            // Strategy 4: custom partitioning; because partitioner.class is set above, any record without an explicit partition number is routed by MyPartition
            ProducerRecord<String, String> producerRecord4 = new ProducerRecord<>("mypartition", "mykey", "mymessage" + i);
            kafkaProducer.send(producerRecord1); // only producerRecord1 is sent here; substitute the others to try the other strategies
        }
        // close the producer
        kafkaProducer.close();
    }
}
Custom partitioner class
package it.yuge;
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import java.util.Map;
public class MyPartition implements Partitioner {
    // This method decides which partition a record is written to.
    // Returning a fixed value writes every record to that partition: return 0 sends everything to partition 0, return 2 would send everything to partition 2.
    @Override
    public int partition(String s, Object o, byte[] bytes, Object o1, byte[] bytes1, Cluster cluster) {
        return 0;
    }
    @Override
    public void close() {
    }
    @Override
    public void configure(Map<String, ?> map) {
    }
}
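MyPartition above pins every record to partition 0. As a sketch of a slightly more realistic partitioner (the class name HashPartitionSketch and the simple hashCode-based scheme are assumptions for illustration, not Kafka's built-in implementation, which uses murmur2 over the serialized key bytes), the key can be spread across however many partitions the topic actually has:
package it.yuge;
import org.apache.kafka.clients.producer.Partitioner;
import org.apache.kafka.common.Cluster;
import java.util.Map;
public class HashPartitionSketch implements Partitioner {
    @Override
    public int partition(String topic, Object key, byte[] keyBytes, Object value, byte[] valueBytes, Cluster cluster) {
        // how many partitions the topic currently has
        int numPartitions = cluster.partitionsForTopic(topic).size();
        if (key == null) {
            // no key: fall back to partition 0 (the built-in partitioner would round-robin instead)
            return 0;
        }
        // keyed record: positive hash of the key modulo the partition count
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
    @Override
    public void close() {
    }
    @Override
    public void configure(Map<String, ?> configs) {
    }
}
To try it, point partitioner.class at it.yuge.HashPartitionSketch instead of it.yuge.MyPartition.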
3. Consumer code
- offset: records how far consumption has progressed, so that the next run resumes from where the previous one left off
- automatic commit
- manual commit
(1) Automatic offset commits
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.Arrays;
import java.util.Properties;
public class MyConsumer {
    /**
     * Consume with automatic offset commits
     * @param args
     */
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group"); // consumer group
        props.put("enable.auto.commit", "true"); // enable automatic offset commits
        props.put("auto.commit.interval.ms", "1000"); // interval between automatic commits
        props.put("session.timeout.ms", "30000"); // session timeout
        // use StringDeserializer for both keys and values
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        // subscribe to the topic to consume
        consumer.subscribe(Arrays.asList("test"));
        // consume the "test" topic in an endless loop
        while (true) {
            // records holds everything fetched by this poll
            ConsumerRecords<String, String> records = consumer.poll(1000); // block for up to 1000 ms waiting for data
            for (ConsumerRecord<String, String> record : records) {
                long offset = record.offset();
                String value = record.value();
                System.out.println("offset = " + offset + ", value = " + value);
            }
        }
    }
}
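The while (true) loop above never releases the consumer. One common pattern, shown here as a sketch under the assumption that the configuration stays the same (the class name ShutdownAwareConsumer and the hook wiring are illustrative additions), is to call wakeup() from a JVM shutdown hook so that a blocked poll() throws WakeupException and the consumer can be closed cleanly:
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.errors.WakeupException;
import java.util.Arrays;
import java.util.Properties;
public class ShutdownAwareConsumer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "true");
        props.put("auto.commit.interval.ms", "1000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        final KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        final Thread mainThread = Thread.currentThread();
        // wakeup() is the one consumer method that is safe to call from another thread;
        // it makes a blocked poll() throw WakeupException
        Runtime.getRuntime().addShutdownHook(new Thread() {
            @Override
            public void run() {
                consumer.wakeup();
                try {
                    mainThread.join(); // wait for the main loop to finish closing the consumer
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        });
        try {
            consumer.subscribe(Arrays.asList("test"));
            while (true) {
                ConsumerRecords<String, String> records = consumer.poll(1000);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println("offset = " + record.offset() + ", value = " + record.value());
                }
            }
        } catch (WakeupException e) {
            // expected during shutdown; nothing to do
        } finally {
            consumer.close();
        }
    }
}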
(2) Manual offset commits
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Properties;
public class ManualConsumer {
    /**
     * Commit offsets manually
     * @param args
     */
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "false"); // disable automatic commits; offsets are committed manually below
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props);
        // subscribe to the "test" topic
        consumer.subscribe(Arrays.asList("test"));
        final int minBatchSize = 100; // process in batches of 100, committing the offset after each batch
        // collect the fetched ConsumerRecord objects until a full batch is available
        List<ConsumerRecord<String, String>> consumerRecordList = new ArrayList<>();
        while (true) {
            ConsumerRecords<String, String> consumerRecords1 = consumer.poll(1000);
            for (ConsumerRecord<String, String> consumerRecord : consumerRecords1) {
                consumerRecordList.add(consumerRecord); // buffer the fetched records
                if (consumerRecordList.size() >= minBatchSize) {
                    // once the buffer holds at least minBatchSize records, process them as one batch,
                    // e.g. persist them to a database
                    // insertToDb(consumerRecordList); // pseudo-code: JDBC insert
                    // then commit the offset to mark the whole batch as processed
                    // consumer.commitAsync(); // asynchronous commit: more efficient, does not block the loop
                    // synchronous commit: blocks until the broker acknowledges, which is safer but slower
                    consumer.commitSync();
                    System.out.println("commit finished");
                    // clear the buffer for the next batch
                    consumerRecordList.clear();
                }
            }
        }
    }
}
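The commented-out commitAsync() above is the non-blocking alternative mentioned in the code. A minimal sketch of how it could be used with a callback follows (the logging is illustrative; commitAsync does not retry failed commits, so production code usually pairs it with a final commitSync() on shutdown):
// Extra imports ManualConsumer would need for this variant:
// import org.apache.kafka.clients.consumer.OffsetAndMetadata;
// import org.apache.kafka.clients.consumer.OffsetCommitCallback;
// import org.apache.kafka.common.TopicPartition;
// import java.util.Map;

// Drop-in replacement for the consumer.commitSync() call inside the batch-processing branch:
consumer.commitAsync(new OffsetCommitCallback() {
    @Override
    public void onComplete(Map<TopicPartition, OffsetAndMetadata> offsets, Exception exception) {
        if (exception != null) {
            // the commit failed and will not be retried; a later commit (or a closing commitSync) covers it
            System.err.println("offset commit failed: " + exception.getMessage());
        }
    }
});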
(3) Commit once after processing each partition's data (safer than the two approaches above)
package it.yuge;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import java.util.*;
public class ConsumerPartition {
    /**
     * Commit the offset for each partition as soon as that partition's data has been processed
     * @param args
     */
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "false"); // disable automatic commits; offsets are committed manually below
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<>(props);
        kafkaConsumer.subscribe(Arrays.asList("mypartition"));
        while (true) {
            // consume in an endless loop
            ConsumerRecords<String, String> consumerRecords = kafkaConsumer.poll(1000);
            // the partitions of the "mypartition" topic that this poll returned data for
            Set<TopicPartition> partitions = consumerRecords.partitions();
            // handle each partition's records separately, committing after each partition has been processed
            for (TopicPartition partition : partitions) {
                // the records belonging to this partition
                List<ConsumerRecord<String, String>> records = consumerRecords.records(partition);
                for (ConsumerRecord<String, String> record : records) {
                    System.out.println(record.value() + "===" + record.offset());
                }
                // the offset of the last record in this partition, i.e. how far we have consumed
                long offset = records.get(records.size() - 1).offset();
                // commit offset + 1 so the next fetch starts at the first record that has not been consumed yet
                kafkaConsumer.commitSync(Collections.singletonMap(partition, new OffsetAndMetadata(offset + 1)));
            }
        }
    }
}
(4) Consume only specific partitions of a topic
package it.yuge;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import java.util.Arrays;
import java.util.Properties;
public class ConsumerSomePartition {
    // consume only some of a topic's partitions
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "node01:9092");
        props.put("group.id", "test_group");
        props.put("enable.auto.commit", "true"); // enable automatic offset commits
        props.put("auto.commit.interval.ms", "1000");
        props.put("session.timeout.ms", "30000");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        // create the KafkaConsumer
        KafkaConsumer<String, String> kafkaConsumer = new KafkaConsumer<String, String>(props);
        // subscribe() would consume every partition of a topic:
        // kafkaConsumer.subscribe();
        // assign() instead consumes only partitions 0 and 1 of the "mypartition" topic
        TopicPartition topicPartition1 = new TopicPartition("mypartition", 0);
        TopicPartition topicPartition2 = new TopicPartition("mypartition", 1);
        kafkaConsumer.assign(Arrays.asList(topicPartition1, topicPartition2));
        while (true) {
            ConsumerRecords<String, String> records = kafkaConsumer.poll(1000);
            // iterate over the fetched records
            for (ConsumerRecord<String, String> record : records) {
                System.out.println("value = " + record.value() + ", offset = " + record.offset());
            }
        }
    }
}
5. Kafka Streams API development
Use case:
Read the data in the test topic with the Streams API, convert every value to upper case, and write the result to the test2 topic.
(1) Create a topic
cd /export/servers/kafka_2.11-0.10.0.0/
bin/kafka-topics.sh --create --partitions 3 --replication-factor 2 --topic test2 --zookeeper node01:2181,node02:2181,node03:2181
--create: create a topic
--partitions 3: the topic has three partitions
--replication-factor 2: each partition has two replicas
--topic test2: the topic name is test2
--zookeeper: the ZooKeeper connection string
(2) Develop the Streams application
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.kstream.KStreamBuilder;
import java.util.Properties;
public class StreamAPI {
    // use the Streams API to read data from "test" and write it to "test2"
    public static void main(String[] args) {
        // configuration
        Properties props = new Properties();
        props.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-application"); // application id
        props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "node01:9092"); // Kafka broker address
        // default serializer/deserializer (serde) for keys and values
        props.put(StreamsConfig.KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        props.put(StreamsConfig.VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        // KStreamBuilder is the entry point for building the topology
        KStreamBuilder builder = new KStreamBuilder();
        // stream("test") reads from the "test" topic,
        // mapValues(...) upper-cases every value,
        // to("test2") writes the result to the "test2" topic
        builder.stream("test").mapValues(line -> line.toString().toUpperCase()).to("test2");
        // build the KafkaStreams instance from the topology and the configuration
        KafkaStreams streams = new KafkaStreams(builder, props);
        // start the streams application
        streams.start();
    }
}
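streams.start() leaves the application running until the process is killed. A small, optional addition (a sketch, not part of the original code) is to close the KafkaStreams instance from a shutdown hook at the end of main, so the stream threads and their resources are released cleanly:
// Optional: add at the end of StreamAPI.main()
// (the local variable streams is effectively final, so it can be captured here)
Runtime.getRuntime().addShutdownHook(new Thread() {
    @Override
    public void run() {
        streams.close(); // stops the stream threads and cleans up resources
    }
});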
(3) Produce data
Run the following on node01 to produce data into the test topic:
cd /export/servers/kafka_2.11-0.10.0.0
bin/kafka-console-producer.sh --broker-list node01:9092,node02:9092,node03:9092 --topic test
(4) Consume data
Run the following on node02 to consume the data in the test2 topic:
cd /export/servers/kafka_2.11-0.10.0.0
bin/kafka-console-consumer.sh --from-beginning --topic test2 --zookeeper node01:2181,node02:2181,node03:2181