1. Add the dependencies
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-hadoop-compatibility -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hadoop-compatibility_2.12</artifactId>
    <version>1.14.4</version>
    <scope>test</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.2</version>
</dependency>
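Note that these two artifacts alone will not compile the test in step 3; the example also assumes the core Flink batch API and a local client are on the classpath. A minimal sketch, with versions matching the ones above:

<!-- DataSet API (ExecutionEnvironment, Tuple2, operators) -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.14.4</version>
</dependency>
<!-- Local executor, needed to run the job from a test -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.12</artifactId>
    <version>1.14.4</version>
</dependency>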
2. Create the HDFS file
[root@cloudtest02 hadoop-3.2.0]# cat test.txt
java,python,c++
java,python,c#
Create the directory:
[root@cloudtest02 hadoop-3.2.0]# hdfs dfs -mkdir -p /root/hadoop
Put the file into the directory:
[root@cloudtest02 hadoop-3.2.0]# hdfs dfs -put test.txt /root/hadoop
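To confirm the upload worked, you can read the file back from HDFS; given the two-line file above, the expected output is:

[root@cloudtest02 hadoop-3.2.0]# hdfs dfs -cat /root/hadoop/test.txt
java,python,c++
java,python,c#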
3. Read data from HDFS
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.junit.Test;

import java.util.Arrays;

@Test
public void fromHdfsTest() throws Exception {
    // Batch execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // Read the HDFS file uploaded in step 2 as the data source
    env.readTextFile("hdfs://172.16.10.159:9000/root/hadoop/test.txt")
            // Flatten: split each line on "," and emit one word per element
            .flatMap(new FlatMapFunction<String, String>() {
                @Override
                public void flatMap(String value, Collector<String> out) throws Exception {
                    Arrays.stream(value.split(",")).forEach(out::collect);
                }
            })
            // Map: wrap each word into a (word, 1) tuple
            .map(new MapFunction<String, Tuple2<String, Integer>>() {
                @Override
                public Tuple2<String, Integer> map(String value) throws Exception {
                    return Tuple2.of(value, 1);
                }
            })
            // Group by the word (the first tuple field)
            .groupBy((KeySelector<Tuple2<String, Integer>, String>) value -> value.f0)
            // Sum the counts within each group
            .reduce((v1, v2) -> Tuple2.of(v1.f0, v1.f1 + v2.f1))
            // Print the result; print() triggers execution in the DataSet API
            .print();
}
Result:
(c#,1)
(java,2)
(python,2)
(c++,1)
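Since Flink 1.12 the DataSet API has been soft-deprecated in favor of the unified DataStream API, which can run bounded sources like this file in batch mode. A rough equivalent sketch, assuming the same HDFS path as above and flink-streaming-java_2.12 added to the dependencies:

import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class HdfsWordCount {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // The input is a bounded file, so run in batch mode
        env.setRuntimeMode(RuntimeExecutionMode.BATCH);
        env.readTextFile("hdfs://172.16.10.159:9000/root/hadoop/test.txt")
                // Split each line on "," and emit (word, 1) pairs directly
                .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
                        for (String word : value.split(",")) {
                            out.collect(Tuple2.of(word, 1));
                        }
                    }
                })
                // Key by the word and sum the count field
                .keyBy(t -> t.f0)
                .sum(1)
                .print();
        // Unlike the DataSet API, the DataStream API needs an explicit execute()
        env.execute("hdfs wordcount");
    }
}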