Kafka Streams -- Word Count (wordCount)

Table of Contents

Program flow

Project structure

pom.xml

Stream topology definition

Producing data

Consuming data

Data source


Program flow:

Send the lines of a .txt data source to topic 1: TextLinesTopic.

A StreamsBuilder topology reads topic 1, applies the word-count processing logic, and writes the result to topic 2: WordsWithCountsTopic.

A consumer reads from topic 2 and prints the records.

The core of the whole process is defining the processing logic that implements the business requirement.

Project structure:

pom.xml 

The Kafka version used is 2.1.0.

        <properties>
            <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            <junit>4.12</junit>
            <kafka>2.1.0</kafka>
        </properties>

        <!-- kafka Producer、Consumer API -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka}</version>
        </dependency>

        <!-- kafka Stream API -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-streams</artifactId>
            <version>${kafka}</version>
        </dependency>

Stream topology definition

package com.i2yun.kafka;

import java.util.Arrays;
import java.util.Properties;

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.KafkaStreams;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.KTable;
import org.apache.kafka.streams.kstream.Materialized;
import org.apache.kafka.streams.kstream.Produced;

/**
 * To run this demo, you need to:<br>
 * 
 * 1. start zookeeper and kafka
 * 
 * <pre>
 * nohup bin/zookeeper-server-start.sh config/zookeeper.properties >>zookeeper.out 2>&1 &
 * nohup bin/kafka-server-start.sh config/server.properties >/dev/null 2>&1 &
 * </pre>
 * 
 * 2. create topic TextLinesTopic and WordsWithCountsTopic
 * 
 * <pre>
 * bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic TextLinesTopic
 * bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic WordsWithCountsTopic
 * </pre>
 * 
 * 3. create the data source file: doc/testFolder/wordCount.txt
 * 
 * 4. run producer and consumer
 * 
 * <pre>
 * @see com.i2yun.kafka.mq.SimpleConsumer
 * @see com.i2yun.kafka.mq.SimpleProducer
 * </pre>
 * 
 * 
 * @author i2kwing
 *
 */
public class WordCountApplication {
	public static void main(final String[] args) throws Exception {
		Properties props = new Properties();
		// Kafka Streams requires at least the following properties "application.id","bootstrap.servers"
		// each stream has unique id @see http://kafka.apache.org/21/documentation/streams/tutorial
		props.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-application");
		// config kafka location
		// {@see CommonClientConfigs#BOOTSTRAP_SERVERS_DOC}, config "ubuntu-02 192.168.78.132" in the hosts file in advance
		props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "ubuntu-02:9092");
		// provide default serdes (serializer and deserializer)
		props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
		props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());

		// define the processing topology with the DSL (Domain Specific Language)
		StreamsBuilder builder = new StreamsBuilder();
		// create a source stream from a Kafka topic named TextLinesTopic
		KStream<String, String> textLines = builder.stream("TextLinesTopic");
		KTable<String, Long> wordCounts = textLines
				// create a new KStream: via a {@link ValueMapper}, split each text line into lower-cased words
				.flatMapValues(textLine -> Arrays.asList(textLine.toLowerCase().split("\\W+")))
				// select value as key
				.selectKey((key, word) -> word)
				.groupByKey()
				// count the number of records in this new stream. use default store
				.count(Materialized.as("counts-store"));

		// write this new kstream into another Kafka topic named WordsWithCountsTopic
		wordCounts.toStream().to("WordsWithCountsTopic", Produced.with(Serdes.String(), Serdes.Long()));

		// inspect what kind of topology was created
		Topology topology = builder.build();
		System.out.println(topology.describe());

		// create kafka stream 
		KafkaStreams streams = new KafkaStreams(topology, props);
		streams.start();
	}

}
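The main method above starts the streams application but never shuts it down cleanly. A common variant (shown in the Kafka Streams tutorial linked in the javadoc) is to block on a CountDownLatch and close the streams instance from a JVM shutdown hook. A minimal sketch, which would replace the streams.start() call above and needs an additional import of java.util.concurrent.CountDownLatch:

		// sketch: block the main thread and close the streams instance on Ctrl+C
		final CountDownLatch latch = new CountDownLatch(1);
		Runtime.getRuntime().addShutdownHook(new Thread("streams-shutdown-hook") {
			@Override
			public void run() {
				streams.close();
				latch.countDown();
			}
		});
		try {
			streams.start();
			latch.await();
		} catch (Throwable e) {
			System.exit(1);
		}
		System.exit(0);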

Producing data

SimpleProducer: the data source (produces data by sending each line of a text file to the specified topic)

package com.i2yun.kafka.mq;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class SimpleProducer {
	public static void main(String[] args) throws Exception {

		// Assign topicName to string variable
		String topicName = "TextLinesTopic";

		// create instance for properties to access producer configs
		Properties props = new Properties();

		// Broker address, see http://kafka.apache.org/documentation/#producerapi
		props.put("bootstrap.servers", "192.168.78.132:9092");

		// Set acknowledgements for producer requests.
		props.put("acks", "all");

		// If the request fails, the producer can automatically retry; 0 disables retries
		props.put("retries", 0);

		// Specify buffer size in config
		props.put("batch.size", 16384);

		// Delay sends slightly so that records sent close together can be batched
		props.put("linger.ms", 1);

		// The buffer.memory controls the total amount of memory available to the
		// producer for buffering.
		props.put("buffer.memory", 33554432);

		props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");

		props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

		Producer<String, String> producer = new KafkaProducer<String, String>(props);

		// read a txt file , send it line by line
		File file = new File("doc\\testFolder\\wordCount.txt");
		BufferedReader reader = new BufferedReader(new FileReader(file));
		String tempString = null;
		while ((tempString = reader.readLine()) != null) {
			producer.send(new ProducerRecord<String, String>(topicName, tempString));
			Thread.sleep(1000);
		}
		reader.close();

		/*
		 * for (int i = 0; i < 10; i++) { producer.send(new ProducerRecord<String, String>(topicName, Integer.toString(i), Integer.toString(i))); }
		 */
		System.out.println("Message sent successfully");
		producer.close();
	}
}
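The send calls above are fire-and-forget, so a failed write only surfaces later (or not at all). kafka-clients also lets you pass a Callback to send; a minimal sketch of sending one line with delivery confirmation (the line content here is just an example):

		// sketch: send a single line and log the delivery result in a callback
		String line = "KStream KTable lambda";
		producer.send(new ProducerRecord<String, String>(topicName, line), (metadata, exception) -> {
			if (exception != null) {
				exception.printStackTrace(); // the record was not written
			} else {
				System.out.printf("sent to %s-%d at offset %d%n",
						metadata.topic(), metadata.partition(), metadata.offset());
			}
		});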

Consuming data

SimpleConsumer: consumes the results (reads from the new topic and prints each key-value pair)

package com.i2yun.kafka.mq;

import java.time.Duration;
import java.util.Arrays;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

public class SimpleConsumer {
	public static void main(String[] args) throws Exception {
		// Kafka consumer configuration settings
		String topicName = "WordsWithCountsTopic";
		Properties props = new Properties();

		props.put("bootstrap.servers", "192.168.78.132:9092");
		props.put("group.id", "test");
		props.put("enable.auto.commit", "true");
		props.put("auto.commit.interval.ms", "1000");
		props.put("session.timeout.ms", "30000");
		props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
		props.put("value.deserializer", "org.apache.kafka.common.serialization.LongDeserializer");
		KafkaConsumer<String, Long> kafkaConsumer = new KafkaConsumer<String, Long>(props);
		// Kafka Consumer subscribes list of topics here.
		kafkaConsumer.subscribe(Arrays.asList(topicName));

		while (true) {
			ConsumerRecords<String, Long> records = kafkaConsumer.poll(Duration.ofMillis(100));
			for (ConsumerRecord<String, Long> record : records) {
				// print the offset,key and value for the consumer records.
				System.out.printf("offset = %d, key = %s, value = %s\n", record.offset(), record.key(), record.value());
			}
		}
	}
}
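Instead of (or in addition to) SimpleConsumer, the result topic can be inspected with the console consumer that ships with Kafka; a sketch of the command, using the broker address from the code above and the LongDeserializer for the count values:

bin/kafka-console-consumer.sh --bootstrap-server 192.168.78.132:9092 \
    --topic WordsWithCountsTopic \
    --from-beginning \
    --formatter kafka.tools.DefaultMessageFormatter \
    --property print.key=true \
    --property print.value=true \
    --property key.deserializer=org.apache.kafka.common.serialization.StringDeserializer \
    --property value.deserializer=org.apache.kafka.common.serialization.LongDeserializer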

Data source

wordCount.txt

Properties KStream KTable lambda 
KStream KTable lambda lambda
KStream
Properties KStream KTable lambda 
KStream KTable lambda lambda
KStream
Properties KStream KTable lambda 
KStream KTable lambda lambda
KStream
Properties KStream KTable lambda 
KStream KTable lambda lambda
KStream
Properties KStream KTable lambda 
KStream KTable lambda lambda
KStream
Properties KStream KTable lambda 
KStream KTable lambda lambda
KStream

Output: because wordCount.txt was modified and the producer was run several times, while the counts in the consumed topic keep accumulating, the output below differs from what a first run would show; this is expected.

offset = 329, key = properties, value = 14
offset = 330, key = ktable, value = 28
offset = 331, key = lambda, value = 42
offset = 332, key = kstream, value = 42
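If you want the counts to start from zero again, resending the input is not enough, because the application's state store and the output topic keep the old totals. A sketch using the application reset tool bundled with Kafka (the application id and input topic match the ones used above; local state would also need to be removed, e.g. via KafkaStreams#cleanUp):

bin/kafka-streams-application-reset.sh --application-id wordcount-application \
    --input-topics TextLinesTopic \
    --bootstrap-servers 192.168.78.132:9092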

 
