I. Create the base class
import java.sql.Timestamp;

// Base POJO describing a click event: which user visited which URL, and when.
public class Event {
    public String user;
    public String url;
    public Long timestamp;

    // A public no-argument constructor is required by Flink's POJO serializer.
    public Event() {
    }

    public Event(String user, String url, Long timestamp) {
        this.user = user;
        this.url = url;
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        return "Event{" +
                "user='" + user + '\'' +
                ", url='" + url + '\'' +
                ", timestamp=" + new Timestamp(timestamp) +
                '}';
    }
}
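Because Event is a public class with a public no-argument constructor and public fields, Flink recognizes it as a POJO and can serialize it efficiently. That is why every source below can emit Event objects directly without a custom serializer.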
II. Read data from different sources

1. Read data from a file
// Create the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
env.setParallelism(1);

// 1. Read data from a file
DataStreamSource<String> streamFile = env.readTextFile("input/clicks.txt");
streamFile.print();

env.execute();
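Note that readTextFile has been deprecated in recent Flink releases in favor of the unified FileSource connector. A minimal sketch of the newer API, assuming Flink 1.15+ with the flink-connector-files dependency on the classpath (in older versions the line format class is named TextLineFormat rather than TextLineInputFormat):

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.core.fs.Path;

// Build a bounded source that reads clicks.txt line by line
FileSource<String> fileSource = FileSource
        .forRecordStreamFormat(new TextLineInputFormat(), new Path("input/clicks.txt"))
        .build();

// fromSource attaches the source together with a watermark strategy and a name
DataStreamSource<String> fileStream =
        env.fromSource(fileSource, WatermarkStrategy.noWatermarks(), "file-source");
fileStream.print();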
2. Read data from a collection
// 2. Read data from collections
// Read from a collection of numbers
ArrayList<Integer> nums = new ArrayList<>();
nums.add(2);
nums.add(5);
DataStreamSource<Integer> stream1 = env.fromCollection(nums);

// Read from a collection of objects
ArrayList<Event> events = new ArrayList<>();
events.add(new Event("Mary", "./home", 1000L));
events.add(new Event("Bob", "./cart", 2000L));
DataStreamSource<Event> stream2 = env.fromCollection(events);
3. Read data from elements
// 3. Read data directly from individual elements
DataStreamSource<Event> stream3 = env.fromElements(
        new Event("Mary", "./home", 1000L),
        new Event("Bob", "./cart", 2000L)
);
4. Read from a socket text stream

// 4. Read a text stream from a socket (hostname, port)
DataStreamSource<String> stream4 = env.socketTextStream("hadoop102", 9092);
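For a quick test, something must be listening on that host and port and writing lines of text; with netcat, for example, running nc -lk 9092 on hadoop102 lets you type lines that arrive as individual stream elements. The hostname and port here are just the values used in this example.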
5. Read data from Kafka
// Kafka consumer configuration
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", "hadoop102:9092");
properties.setProperty("group.id", "consumer-group");
properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties.setProperty("auto.offset.reset", "latest");

// Consume the "clicks" topic as a stream of strings
DataStreamSource<String> kafkaStream = env.addSource(
        new FlinkKafkaConsumer<String>("clicks", new SimpleStringSchema(), properties));
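FlinkKafkaConsumer is the legacy connector API. From Flink 1.14 on, the recommended replacement is KafkaSource; a minimal sketch with the same broker, topic, and consumer group as above, assuming the flink-connector-kafka dependency is available:

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;

// Build the source: broker address, topic, consumer group, offsets, deserializer
KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
        .setBootstrapServers("hadoop102:9092")
        .setTopics("clicks")
        .setGroupId("consumer-group")
        .setStartingOffsets(OffsetsInitializer.latest())
        .setValueOnlyDeserializer(new SimpleStringSchema())
        .build();

DataStreamSource<String> kafkaStream2 =
        env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafka-source");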
6. Read data from a custom source

The custom source:
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.Calendar;
import java.util.Random;

public class ClickSource implements SourceFunction<Event> {
    // Flag checked by the generator loop; volatile because cancel()
    // is called from a different thread than run()
    private volatile boolean running = true;

    @Override
    public void run(SourceContext<Event> sourceContext) throws Exception {
        // Random generator for picking users and URLs
        Random random = new Random();
        // Candidate values for the generated fields
        String[] users = {"Mary", "Alice", "Bob", "Cary"};
        String[] urls = {"./home", "./cart", "./fav", "./prod?id=100", "./prod?id=10"};
        // Emit one random event per second until cancelled
        while (running) {
            String user = users[random.nextInt(users.length)];
            String url = urls[random.nextInt(urls.length)];
            long timestamp = Calendar.getInstance().getTimeInMillis();
            sourceContext.collect(new Event(user, url, timestamp));
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}
// Read data from the custom source
DataStreamSource<Event> clickSource = env.addSource(new ClickSource());
clickSource.print();
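Note that a plain SourceFunction always runs with parallelism 1: calling setParallelism() with a value greater than 1 on such a source makes Flink reject the job when it is built. This is what motivates the parallel variant in the next section.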
7. Define a parallel data source
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import java.util.Random;

/**
 * Defines a parallel data source.
 */
public class SourceCustomTest {
    public static void main(String[] args) throws Exception {
        // Create the execution environment; global parallelism 1,
        // the custom source below sets its own parallelism
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // A ParallelSourceFunction may run with parallelism > 1;
        // a plain SourceFunction (like ClickSource) may not.
        // DataStreamSource<Event> customStream = env.addSource(new ClickSource());
        DataStreamSource<Integer> customStream =
                env.addSource(new ParallelCustomSource()).setParallelism(2);
        customStream.print();

        env.execute();
    }

    public static class ParallelCustomSource implements ParallelSourceFunction<Integer> {
        // volatile so that cancel(), called from another thread, stops the loop
        private volatile boolean running = true;
        private Random random = new Random();

        @Override
        public void run(SourceContext<Integer> sourceContext) throws Exception {
            // Each of the two parallel instances emits random integers until cancelled
            while (running) {
                sourceContext.collect(random.nextInt());
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }
}
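If each parallel instance needs to know which subtask it is (for example, to generate disjoint value ranges), RichParallelSourceFunction additionally exposes the runtime context. A minimal sketch; the class name MyIndexedSource is made up for illustration:

import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;

// Hypothetical example: each parallel subtask emits its own index once per second.
public class MyIndexedSource extends RichParallelSourceFunction<Integer> {
    private volatile boolean running = true;

    @Override
    public void run(SourceContext<Integer> ctx) throws Exception {
        // 0-based index of this parallel instance
        int subtask = getRuntimeContext().getIndexOfThisSubtask();
        while (running) {
            ctx.collect(subtask);
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}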
This article has shown how to read data into Apache Flink from different sources: files, collections, individual elements, sockets, Kafka, and custom sources. After defining the basic Event class, it walked through each reading method in turn: reading a text file, building streams from collections and elements, and obtaining real-time streams from a Kafka consumer and from custom SourceFunction implementations.