Dependencies
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-hadoop-compatibility_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.7.2</version>
</dependency>
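Note that these four dependencies only cover the Hadoop side. The streaming job at the end also needs the Flink core artifacts on the classpath; assuming a standard Maven setup for Flink 1.7.2 with Scala 2.11, something like:
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-java_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>1.7.2</version>
</dependency>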
Place the following three files under the resources directory.
First: hdfs-site.xml
<configuration>
    <!-- Block replication factor, default is 3 -->
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <!-- Address of the SecondaryNameNode, which assists the NameNode -->
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>bigdata:9001</value>
    </property>
    <!-- A local filesystem path that determines where the NameNode stores the fsimage -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/opt/media/hadoop-2.6.0/dfs/name</value>
    </property>
    <!-- Local path where DataNodes store data files; multiple paths can be given, comma-separated -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/opt/media/hadoop-2.6.0/dfs/data</value>
    </property>
    <!-- The following two settings let an HDFS client outside the cluster network reach the internal DataNodes -->
    <property>
        <name>dfs.client.use.datanode.hostname</name>
        <value>true</value>
    </property>
    <property>
        <name>dfs.datanode.use.datanode.hostname</name>
        <value>true</value>
    </property>
</configuration>
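To confirm that this classpath hdfs-site.xml is actually picked up, here is a minimal sketch (assuming the hadoop-hdfs dependency above, since HdfsConfiguration lives there; ConfigCheck is just an illustrative class name):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class ConfigCheck {
    public static void main(String[] args) {
        // HdfsConfiguration registers hdfs-site.xml as a default resource,
        // so properties placed under src/main/resources are loaded from the classpath
        Configuration conf = new HdfsConfiguration();
        System.out.println(conf.get("dfs.replication"));                  // expect: 3
        System.out.println(conf.get("dfs.client.use.datanode.hostname")); // expect: true
    }
}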
Second: core-site.xml
<configuration>
    <!-- Address of the NameNode -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoop100:8020</value>
    </property>
    <!-- Directory where Hadoop stores its data -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/opt/module/hadoop-3.1.3/data</value>
    </property>
    <!-- Static user for the HDFS web UI -->
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>root</value>
    </property>
</configuration>
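Once fs.defaultFS is set here, plain Hadoop client code no longer needs a hard-coded hdfs:// URI. A minimal sketch of this (assuming the NameNode at hadoop100:8020 is reachable; ListRoot is an illustrative class name):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ListRoot {
    public static void main(String[] args) throws Exception {
        // FileSystem.get resolves fs.defaultFS from core-site.xml on the classpath
        FileSystem fs = FileSystem.get(new Configuration());
        // List the HDFS root directory as a quick connectivity check
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}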
Third: log4j.properties
#Send DEBUG-level log output to the two destinations console and file, which are defined below
log4j.rootLogger=DEBUG,console,file
#Console output settings
log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.Threshold=DEBUG
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=[%c]-%m%n
#File output settings
log4j.appender.file = org.apache.log4j.RollingFileAppender
log4j.appender.file.File=./log/logFile.log
log4j.appender.file.MaxFileSize=10mb
log4j.appender.file.Threshold=DEBUG
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=[%p][%d{yy-MM-dd}][%c]%m%n
#Per-logger output levels
log4j.logger.org.mybatis=DEBUG
log4j.logger.java.sql=DEBUG
log4j.logger.java.sql.Statement=DEBUG
log4j.logger.java.sql.ResultSet=DEBUG
log4j.logger.java.sql.PreparedStatement=DEBUG
Code
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class StreamFromFile {
    public static void main(String[] args) throws Exception {
        // User name of the file's owner; it can be read off the HDFS web UI
        System.setProperty("HADOOP_USER_NAME", "user");
        // 9000 is the NameNode's internal file-communication (RPC) port
        String inputHdfs = "hdfs://ip:9000/bigdata.txt";
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> data = env.readTextFile(inputHdfs);
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne =
                data.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
                        // Split each line on spaces and emit a (word, 1) pair per word
                        for (String word : s.split(" ")) {
                            collector.collect(Tuple2.of(word, 1));
                        }
                    }
                });
        SingleOutputStreamOperator<Tuple2<String, Integer>> result = wordAndOne.keyBy(0).sum(1);
        result.print();
        env.execute();
    }
}
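For an input bigdata.txt containing, say, "flink hadoop flink", the job prints a running count per key, e.g. (flink,1), (hadoop,1), (flink,2), with each line prefixed by the index of the printing subtask when parallelism is greater than 1. keyBy(0) and sum(1) address tuple fields by position, which is the usual style for Tuple2 in Flink 1.7.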