文章目录
直接将kafka topic1中的数据传递给topic2
样例1
package kafka;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.KStream;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;
/**
 * Kafka Streams app that expands rows from topic {@code event_attendees_row}
 * (format: {@code event,yes,maybe,invited,no}, each attendance field a
 * space-separated user list) into one record per (event, user, state) and
 * writes them to topic {@code event_attendees_1}.
 */
public class EventAttendStream {

    /**
     * Appends one output record per user found in {@code fields[index]}.
     *
     * @param out    accumulator for the expanded records
     * @param fields the comma-split input row; fields[0] is the event id
     * @param index  which attendance column to expand (1=yes .. 4=no)
     * @param state  attendance label appended to each record ("yes", "maybe", ...)
     */
    private static void addAttendees(List<KeyValue<String, String>> out,
                                     String[] fields, int index, String state) {
        if (fields.length > index && fields[index].trim().length() > 0) {
            // "\\s+" (not " ") so runs of spaces don't yield empty user tokens;
            // consistent with the user_friends sample.
            for (String user : fields[index].split("\\s+")) {
                System.out.println(fields[0] + "," + user + "," + state);
                out.add(new KeyValue<>(null, fields[0] + "," + user + "," + state));
            }
        }
    }

    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "eventattend"); // one application = one consumer group
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.232.211:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 300); // commit interval (ms)
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // disable auto offset commit
        // valid values: earliest latest none
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        StreamsBuilder builder = new StreamsBuilder();
        KStream<Object, Object> source = builder.stream("event_attendees_row");
        source.flatMap((key, value) -> {
            // value -> event,yes,maybe,invited,no
            String[] fields = value.toString().split(",");
            List<KeyValue<String, String>> list = new ArrayList<>();
            // Expand each attendance column into individual records.
            addAttendees(list, fields, 1, "yes");
            addAttendees(list, fields, 2, "maybe");
            addAttendees(list, fields, 3, "invited");
            addAttendees(list, fields, 4, "no");
            return list;
        }).to("event_attendees_1");

        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        CountDownLatch latch = new CountDownLatch(1);
        // Close the streams app cleanly on JVM shutdown, then release main().
        Runtime.getRuntime().addShutdownHook(new Thread("stream") {
            public void run() {
                streams.close();
                latch.countDown();
            }
        });
        streams.start();
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
样例2
package kafka;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.KStream;
import java.util.ArrayList;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;
/**
 * Kafka Streams app that flattens rows from topic {@code user_friends}
 * (format: {@code user,friend1 friend2 ...}) into one "user,friend" record
 * per friend and writes them to topic {@code userfriends}.
 */
public class UserFriendStream {
    public static void main(String[] args) {
        Properties config = new Properties();
        config.put(StreamsConfig.APPLICATION_ID_CONFIG, "userfriend1"); // one application = one consumer group
        config.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.232.211:9092");
        config.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        config.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        config.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 300); // commit interval (ms)
        config.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // disable auto offset commit
        // valid values: earliest latest none
        config.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        StreamsBuilder topologyBuilder = new StreamsBuilder();
        // user,friends => 3238005,47949549 68056805
        KStream<Object, Object> raw = topologyBuilder.stream("user_friends");
        KStream<String, String> expanded = raw.flatMap((k, v) -> {
            ArrayList<KeyValue<String, String>> out = new ArrayList<>();
            String[] parts = v.toString().split(",");
            // Skip malformed rows (need exactly "user,friends") and blank user ids.
            if (parts.length != 2) {
                return out;
            }
            String userId = parts[0];
            if (userId.trim().length() == 0) {
                return out;
            }
            for (String friendId : parts[1].split("\\s+")) {
                System.out.println(userId + "," + friendId);
                out.add(new KeyValue<>(null, userId + "," + friendId));
            }
            return out;
        });
        expanded.to("userfriends");

        Topology topology = topologyBuilder.build();
        KafkaStreams streams = new KafkaStreams(topology, config);
        CountDownLatch shutdownLatch = new CountDownLatch(1);
        // Close the streams app cleanly on JVM shutdown, then release main().
        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
            streams.close();
            shutdownLatch.countDown();
        }, "stream"));
        streams.start();
        try {
            shutdownLatch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
kafka stream 窗口
流式数据是在时间上无界的数据。而聚合操作只能作用在特定的数据集,也即有界的数据集上。因此需要通过某种方式从无界的数据集上按特定的语义选取出有界的数据。窗口是一种非常常用的设定计算边界的方式。不同的流式处理系统支持的窗口类型类似,但不尽相同。
Hopping Time Window
跳跃时间窗口,它有两个属性,一个是Window size,一个是Advance interval。Window size指定了窗口的大小,也即每次计算的数据集的大小。而Advance interval定义输出的时间间隔。应用开始运行的时间就是第一个窗口的起始时间,然后每经过一个advance interval便会创建一个新的窗口,同时每个窗口的宽度都是size(时间上的宽度)
Tumbling time window
滚动时间窗口,是跳跃时间窗口的一种特例,当跳跃时间窗口的size和advance interval值相等时,它就变成了滚动时间窗口。
滚动时间窗口只有一个参数:size,表示窗口的尺寸,一个窗口的结束点会是下一个窗口的起始点。窗口之间没有间隙,也不重叠。
Session window
该窗口用于对Key做Group后的聚合操作中。它需要对Key做分组,然后对组内的数据根据业务需求定义一个窗口的起始点和结束点。一个典型的案例是,希望通过Session Window计算某个用户访问网站的时间。对于一个特定的用户(用Key表示)而言,当发生登录操作时,该用户(Key)的窗口即开始,当发生退出操作或者超时时,该用户(Key)的窗口即结束。窗口结束时,可计算该用户的访问时间或者点击次数等。
Sliding Window
该窗口只用于2个KStream进行Join计算时。该窗口的大小定义了Join两侧KStream的数据记录被认为在同一个窗口的最大时间差。假设该窗口的大小为5秒,则参与Join的2个KStream中,记录时间差小于5秒的记录被认为在同一个窗口中,可以进行Join计算。
package kafka;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.*;
import org.apache.kafka.streams.kstream.*;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Properties;
import java.util.concurrent.CountDownLatch;
/**
 * Kafka Streams demo of session windows: splits each record from topic
 * {@code windowdemo} into words, counts words per 15-second session window,
 * and prints each windowed count.
 */
public class SessionDemo {
    public static void main(String[] args) {
        Properties prop = new Properties();
        prop.put(StreamsConfig.APPLICATION_ID_CONFIG, "windowdemo3"); // one application = one consumer group
        prop.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.232.211:9092");
        prop.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass());
        prop.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 300); // commit interval (ms)
        prop.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "false"); // disable auto offset commit
        // valid values: earliest latest none
        prop.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");

        StreamsBuilder builder = new StreamsBuilder();
        SessionWindowedKStream<String, String> windowdemo1 = builder.stream("windowdemo")
                // BUG FIX: the original flatMap always returned an empty list, so no
                // record ever reached the downstream map/groupBy/count — the topology
                // produced no output. Split each value into whitespace-separated words
                // instead (as the commented-out flatMapValues sketch intended).
                .flatMap((key, value) -> {
                    ArrayList<KeyValue<String, String>> keyValues = new ArrayList<>();
                    for (String word : value.toString().split("\\s+")) {
                        keyValues.add(new KeyValue<>(key == null ? null : key.toString(), word));
                    }
                    return keyValues;
                })
                // Re-key by the word itself so counting groups identical words together.
                .map((key, value) -> new KeyValue<String, String>(value, "1"))
                .groupByKey()
                // .windowedBy(TimeWindows.of(Duration.ofSeconds(15).toMillis())); // tumbling
                // .windowedBy(TimeWindows.of(Duration.ofSeconds(15).toMillis())
                //         .advanceBy(Duration.ofSeconds(5).toMillis())); // hopping
                .windowedBy(SessionWindows.with(Duration.ofSeconds(15).toMillis()));
        // SessionWindow: 15s inactivity gap closes the session.
        KStream<Windowed<String>, Long> windowedLongKStream = windowdemo1.count().toStream();
        windowedLongKStream.foreach((key, value) -> {
            System.out.println("key:" + key + "value:" + value);
        });

        Topology topo = builder.build();
        KafkaStreams streams = new KafkaStreams(topo, prop);
        CountDownLatch latch = new CountDownLatch(1);
        // Close the streams app cleanly on JVM shutdown, then release main().
        Runtime.getRuntime().addShutdownHook(new Thread("stream") {
            public void run() {
                streams.close();
                latch.countDown();
            }
        });
        streams.start();
        try {
            latch.await();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}