一、新建maven项目并引入依赖
<!-- Java API dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- NOTE(review): ${scala.version} in these artifactIds is conventionally the Scala
     *binary* version (e.g. 2.12), not the full patch version — verify the property value -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Scala API dependencies -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink logs through SLF4J by default; use log4j as the concrete logging backend -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.25</version>
</dependency>
<dependency>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-to-slf4j</artifactId>
<version>2.13.2</version>
</dependency>
二、Java 版
1、Batch批处理
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class BatchWordCount {
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSource<String> inputDS = env.readTextFile("input/input.txt");
FlatMapOperator<String, Tuple2<String, Long>> wordAndOne = inputDS.flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
String[] words = line.split(" ");
for (String word : words) {
out.collect(Tuple2.of(word, 1L));
}
}).returns(Types.TUPLE(Types.STRING, Types.LONG));
UnsortedGrouping<Tuple2<String, Long>> wordCountUG = wordAndOne.groupBy(0);
AggregateOperator<Tuple2<String, Long>> wordCount = wordCountUG.sum(1);
wordCount.print();
}
}
2、DataStream流处理
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
import java.util.Arrays;
/**
 * Streaming word count built on Flink's DataStream API.
 *
 * Reads input/input.txt as a bounded stream, splits lines on single spaces,
 * and prints a running (word, count) total for every incoming word.
 */
public class DataStreamWordCount {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment environment = StreamExecutionEnvironment.getExecutionEnvironment();
        // One task slot so printed results arrive as a single ordered stream.
        environment.setParallelism(1);

        DataStreamSource<String> lines = environment.readTextFile("input/input.txt");

        // Tokenize, then pair each word with an initial count of 1.
        // Java lambdas lose generic type information to erasure, hence the
        // explicit .returns(...) hints after each lambda-based operator.
        SingleOutputStreamOperator<Tuple2<String, Long>> pairs = lines
                .flatMap((String line, Collector<String> out) -> {
                    for (String token : line.split(" ")) {
                        out.collect(token);
                    }
                })
                .returns(Types.STRING)
                .map(token -> Tuple2.of(token, 1L))
                .returns(Types.TUPLE(Types.STRING, Types.LONG));

        // Key by the word (field f0) and keep a running sum of field 1 per key.
        KeyedStream<Tuple2<String, Long>, String> keyed = pairs.keyBy(pair -> pair.f0);
        SingleOutputStreamOperator<Tuple2<String, Long>> counts = keyed.sum(1);

        counts.print();

        // Streaming jobs only start running once execute() is called.
        environment.execute();
    }
}
三、Scala 版
1、Batch批处理
import org.apache.flink.api.scala._
/** Batch word count with Flink's Scala DataSet API:
  * reads input/input.txt and prints one (word, count) tuple per distinct word.
  */
object BatchWordCount {
  def main(args: Array[String]): Unit = {
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val lines: DataSet[String] = environment.readTextFile("input/input.txt")
    // Tokenize on single spaces and pair each word with an initial count of 1.
    val pairs: DataSet[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))
    // Group by the word (tuple field 0), then sum the counts (tuple field 1).
    val counts: AggregateDataSet[(String, Int)] = pairs.groupBy(0).sum(1)
    counts.print()
  }
}
2、DataStream流处理
import org.apache.flink.streaming.api.scala._
/** Streaming word count with Flink's Scala DataStream API:
  * reads input/input.txt as a bounded stream and prints a running (word, count)
  * total for every incoming word.
  */
object DataStreamWordCount {
  def main(args: Array[String]): Unit = {
    val environment: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // One task slot so printed results arrive as a single ordered stream.
    environment.setParallelism(1)
    val lines: DataStream[String] = environment.readTextFile("input/input.txt")
    // Tokenize on single spaces and pair each word with an initial count of 1.
    val pairs: DataStream[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))
    // Key by the word, then keep a running sum of tuple field 1 per key.
    val keyed: KeyedStream[(String, Int), String] = pairs.keyBy(_._1)
    val counts = keyed.sum(1)
    counts.print()
    // Streaming jobs only start running once execute() is called.
    environment.execute()
  }
}