今天开始学习Flink,入手第一个Flink数据集demo
功能:把文件中的单词进行分词,统计个数
基本思路:读取文件内容 → 数据集 → 数据集统计二元组输出(word, count)
1、构建maven Flink项目
引入Flink依赖
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>1.10.1</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.12</artifactId>
<version>1.10.1</version>
</dependency>
</dependencies>
2、 添加单词文件
文件内容(示例,每行若干以空格分隔的单词):hello world hello flink
3、进行分词处理
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import scala.Int;
/**
* 单次统计
* 批处理
*/
public class WordCount {
public static void main(String[] args) throws Exception{
//创建执行环境
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
//从文件中读取数据
String inputPath = "D:\\idle\\FlinkTest\\src\\main\\resources\\word.txt";
DataSet<String> inputDataSet = env.readTextFile(inputPath);
//对数据集进行处理,按照空格进行分词,转换成(word,1)二元组进行统计
DataSet<Tuple2<String, Integer>> resultSet = inputDataSet.flatMap(new MyflatMapper())
.groupBy(0) //按照第一个位置的word分组
.sum(1); //讲第二个位置上的数据求和
resultSet.print();
}
//自定义类,实现FlatMapFunction接口
public static class MyflatMapper implements FlatMapFunction<String, Tuple2<String, Integer>>{
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
//按空格分词
String[] words = s.split(" ");
//遍历所有word,包成二元组输出
for(String word:words){
collector.collect(new Tuple2<String, Integer>(word, 1));
}
}
}
}