1.在 IDEA 工程根目录下新建一个 input 文件夹,并在该文件夹下创建文本文件 words.txt
2.在 words.txt 中输入一些文字,例如:
hello world
hello flink
hello java
3.在 com.atguigu.wc 包下新建 Java 类 BatchWordCount,在静态 main 方法中编写测试代码。
我们进行单词频次统计的基本思路是:先逐行读入文件数据,然后将每一行文字拆分成单词;接着按照单词分组,统计每组数据的个数,就是对应单词的频次。
package com.atguigu.wc;
import akka.routing.RoutedActorCell;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.FlatMapOperator;
import org.apache.flink.api.java.operators.UnsortedGrouping;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import java.util.Collection;
import java.util.concurrent.ExecutionException;
/**
 * Batch word-count example using the Flink DataSet API.
 *
 * <p>Reads {@code input/words.txt} line by line, splits every line into words,
 * groups the resulting {@code (word, 1L)} tuples by word and sums the counts,
 * then prints one {@code (word, count)} tuple per distinct word.
 */
public class BatchWordCount {

    /**
     * Runs the word-count job against {@code input/words.txt}.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if reading the input or executing the job fails
     */
    public static void main(String[] args) throws Exception {
        // 1. Obtain the batch execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // 2. Read the input file; each element of the data set is one line of text.
        DataSource<String> lineDataSource = env.readTextFile("input/words.txt");

        // 3. Split every line into words and emit a (word, 1L) tuple per word.
        FlatMapOperator<String, Tuple2<String, Long>> wordAndOneTuple = lineDataSource
                .flatMap((String line, Collector<Tuple2<String, Long>> out) -> {
                    // Split on any run of whitespace so repeated spaces or tabs
                    // do not produce spurious tokens.
                    for (String word : line.split("\\s+")) {
                        // A blank line or leading whitespace still yields one
                        // empty token; skip it so "" is never counted as a word.
                        if (!word.isEmpty()) {
                            out.collect(Tuple2.of(word, 1L));
                        }
                    }
                })
                // Explicitly declare the tuple type produced by the lambda.
                .returns(Types.TUPLE(Types.STRING, Types.LONG));

        // 4. Group by the word (tuple field 0).
        UnsortedGrouping<Tuple2<String, Long>> wordAndOneGroup = wordAndOneTuple.groupBy(0);

        // 5. Sum the counts (tuple field 1) within each group.
        AggregateOperator<Tuple2<String, Long>> sum = wordAndOneGroup.sum(1);

        // 6. Print the (word, count) results.
        sum.print();
    }
}