示例一:使用 DataStreamSource(流处理 API)读取 HDFS
package com.umetrip.umeflink.connector.hdfs;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
import java.util.List;
/**
 * Streaming example: reads a single text file from HDFS with the DataStream API
 * and writes the lines to a local directory.
 */
public class UmeHdfsSource {

    public static void main(String[] args) throws Exception {
        // Obtain the streaming execution environment.
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Read the HDFS file; each line becomes one String record.
        final DataStreamSource<String> lines =
                env.readTextFile("hdfs://10.5.xxx.xxx:8020/test/seamless/2019-06-25/part-0-0.txt");

        // lines.print();

        // Write the records to a local directory; parallelism 1 keeps the
        // output in a single file instead of one file per subtask.
        lines.writeAsText("/Users/xxxx/testdata/").setParallelism(1);

        // Trigger job execution (lazy until execute() is called).
        env.execute();
    }
}
示例二:使用 DataSet(批处理 API)读取 HDFS
package com.umetrip.umeflink.connector.hdfs;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;
import java.util.List;
public class UmeHdfsSource {
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// create a configuration object
Configuration parameters = new Configuration();
// set the recursive enumeration parameter
parameters.setBoolean("recursive.file.enumeration", true);
// read text file from a HDFS running at nnHost:nnPort
// DataSet<String> hdfsLines = env.readTextFile("hdfs://10.5.xxx.xxx:8020/TestData");
// 如果本地读取HA HDFS,需要把core-site.xml,hdfs-site.xml放到resources目录下,路径为hdfs://namespace/xx/xx
//DataSet<String> hdfsLines = env.readTextFile("hdfs://10.5.xxx.xx:8020/test/seamless/2019-06-25");
DataSet<String> hdfsLines = env.readTextFile("hdfs://10.5.xxx.xxx:8020/test/seamless/2019-06-25").withParameters(parameters).withParameters(parameters);
hdfsLines.print();
}
<!-- Flink/Hadoop dependencies for reading HDFS.
     All Flink artifacts with a Scala suffix MUST share the same Scala binary
     version. Fix: flink-hadoop-compatibility was declared as _2.11 while
     flink-clients / flink-streaming-java / flink-connector-filesystem are
     _2.12 — mixing Scala 2.11 and 2.12 Flink artifacts on one classpath
     breaks the build, so it is aligned to _2.12 here. -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hadoop-compatibility_2.12</artifactId>
    <version>1.8.0</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-java</artifactId>
    <version>1.8.0</version>
    <!--<scope>provided</scope>-->
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.12</artifactId>
    <version>1.8.0</version>
    <!--<scope>provided</scope>-->
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.12</artifactId>
    <version>1.8.0</version>
    <!--<scope>provided</scope>-->
</dependency>
<!-- NOTE(review): the remaining dependencies declare no <version>;
     presumably versions come from a parent POM / dependencyManagement
     section not shown here — verify before building standalone. -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-filesystem_2.12</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hadoop-fs</artifactId>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-shaded-hadoop2</artifactId>
</dependency>