【尚硅谷Java版】Flink1.13之自定义Source
上一篇文章(地址: https://blog.csdn.net/junR_980218/article/details/125798564)我们介绍了五种Flink读取数据源的方式,而那些数据源都是现成的,我们这一节将介绍用户自定义数据源。
一、基础环境
1、项目环境搭建可以参考:https://blog.csdn.net/junR_980218/article/details/125366210
2、基础环境搭建好之后,创建Event实体类
package com.atguigu.chapter05;
import java.sql.Timestamp;
/**
 * Simple data holder describing a single click: who clicked, which URL, and when.
 *
 * <p>Points to keep in mind for this class (as noted by the original author):
 * <ol>
 *   <li>the class must be public;</li>
 *   <li>all fields are public;</li>
 *   <li>all field types are serializable.</li>
 * </ol>
 * A public no-argument constructor is also provided.
 *
 * @author potential
 */
public class Event {
    /** Name of the user who produced the click. */
    public String user;
    /** URL that was visited. */
    public String url;
    /** Event time as epoch milliseconds; may be {@code null} until it is set. */
    public Long timestamp;

    /** @return the user name, possibly {@code null} */
    public String getUser() {
        return user;
    }

    /** @param user the user name to store */
    public void setUser(String user) {
        this.user = user;
    }

    /** @return the visited URL, possibly {@code null} */
    public String getUrl() {
        return url;
    }

    /** @param url the visited URL to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the event time in epoch milliseconds, possibly {@code null} */
    public Long getTimestamp() {
        return timestamp;
    }

    /** @param timestamp the event time in epoch milliseconds */
    public void setTimestamp(Long timestamp) {
        this.timestamp = timestamp;
    }

    /** Creates an empty event; every field starts out as {@code null}. */
    public Event() {
    }

    /**
     * Creates a fully populated event.
     *
     * @param user      the user name
     * @param url       the visited URL
     * @param timestamp the event time in epoch milliseconds
     */
    public Event(String user, String url, Long timestamp) {
        this.user = user;
        this.url = url;
        this.timestamp = timestamp;
    }

    /**
     * Renders the event with the timestamp formatted through {@link Timestamp}.
     *
     * <p>A {@code null} timestamp is printed as {@code null} instead of being unboxed,
     * which would otherwise throw a {@link NullPointerException} for an event created
     * with the no-argument constructor.
     *
     * @return a human-readable representation of this event
     */
    @Override
    public String toString() {
        // Guard the unboxing: new Timestamp(long) cannot accept a null Long.
        Object formattedTime = (timestamp == null) ? null : new Timestamp(timestamp);
        return "Event{" +
                "user='" + user + '\'' +
                ", url='" + url + '\'' +
                ", timestamp=" + formattedTime +
                '}';
    }
}
3、自定义数据
(1)自定义串行数据——实现简单的SourceFunction,串行地读取数据,并行度只能为1,吞吐量很小
package com.atguigu.chapter05;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Calendar;
import java.util.Random;
/**
 * Custom source that emits one randomly generated click {@link Event} per second
 * until it is cancelled.
 *
 * @author potential
 */
public class ClickSource implements SourceFunction<Event> {
    /**
     * Loop flag checked by {@link #run} and cleared by {@link #cancel()}.
     * Declared {@code volatile} so that a write made by {@code cancel()} on another
     * thread is visible to the emitting loop (the Flink SourceFunction documentation
     * recommends a volatile flag for this pattern).
     */
    private volatile boolean running = true;

    /**
     * Emits random events until {@link #cancel()} is called.
     *
     * @param sourceContext the context used to emit records
     * @throws Exception if the one-second sleep between records is interrupted
     */
    @Override
    public void run(SourceContext<Event> sourceContext) throws Exception {
        // Generator used to pick random field values.
        Random random = new Random();
        // Candidate values for each field.
        String[] users = {"Mary", "Alice", "Bob", "Cary"};
        // The last two entries were fused into a single literal in the original listing.
        String[] urls = {"./home", "./cart", "./fav", "./prod?id=100", "./prod?id=10"};
        // Keep producing data until cancelled.
        while (running) {
            String user = users[random.nextInt(users.length)];
            String url = urls[random.nextInt(urls.length)];
            // Current wall-clock time in epoch milliseconds.
            long timestamp = Calendar.getInstance().getTimeInMillis();
            sourceContext.collect(new Event(user, url, timestamp));
            // Throttle to one record per second.
            Thread.sleep(1000L);
        }
    }

    /** Requests that the emitting loop in {@link #run} stop after its current iteration. */
    @Override
    public void cancel() {
        running = false;
    }
}
测试:
package com.atguigu.chapter05;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import java.util.OptionalInt;
import java.util.Random;
/**
* @author potential
*/
public class SourceCustomTest {
public static void main(String[] args) throws Exception {
//1、获取执行环境
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//将全局的并行度设置为1
env.setParallelism(1);
//调用addSource方法 里面要传入的是自定义的source方法
DataStreamSource<Event> customStream = env.addSource(new ClickSource());
customStream.print();
env.execute();
}
结果:
(2)自定义并行数据
定义并行数据并且测试——实现ParallelSourceFunction,提高并行度,可以通过setParallelism方法设置并行度,提高吞吐量
package com.atguigu.chapter05;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import java.util.OptionalInt;
import java.util.Random;
/**
 * Runs a job that reads random integers from a parallel custom source
 * (source parallelism 2) and prints every record.
 *
 * @author potential
 */
public class SourceCustomTest {
    /**
     * Builds and executes the streaming job.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if job execution fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Obtain the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Set the global parallelism to 1; the source overrides it below.
        env.setParallelism(1);
        // Attach the custom parallel source and raise its own parallelism to 2.
        DataStreamSource<Integer> customStream =
                env.addSource(new ParallelCustomSource()).setParallelism(2);
        customStream.print();
        env.execute();
    }

    /**
     * Parallel source that emits random integers, with no delay between records,
     * until it is cancelled.
     */
    public static class ParallelCustomSource implements ParallelSourceFunction<Integer> {
        /**
         * Loop flag checked by {@link #run} and cleared by {@link #cancel()}.
         * Declared {@code volatile} so that a write made by {@code cancel()} on another
         * thread is visible to the emitting loop (the Flink SourceFunction documentation
         * recommends a volatile flag for this pattern).
         */
        private volatile boolean running = true;
        /** Generator for the emitted values. */
        private final Random random = new Random();

        /**
         * Emits random integers until {@link #cancel()} is called.
         *
         * @param sourceContext the context used to emit records
         * @throws Exception declared by the interface; nothing in this body throws a checked exception
         */
        @Override
        public void run(SourceContext<Integer> sourceContext) throws Exception {
            while (running) {
                sourceContext.collect(random.nextInt());
            }
        }

        /** Requests that the emitting loop in {@link #run} stop after its current iteration. */
        @Override
        public void cancel() {
            running = false;
        }
    }
}
测试结果: