1.1 导入Maven依赖
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the Flink (Scala) WordCount examples. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>flink</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Versions are declared once here and referenced below through ${...} placeholders. -->
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- NOTE(review): Maven's usual source-encoding property is project.build.sourceEncoding; confirm that something actually reads this custom "encoding" property. -->
<encoding>UTF-8</encoding>
<!-- Full Scala version, used for scala-library. -->
<scala.version>2.11.8</scala.version>
<!-- Scala binary version, used as the _2.11 suffix of the Flink artifact ids below. -->
<scala.main.version>2.11</scala.main.version>
<flink.version>1.9.1</flink.version>
<!-- Single place to manage the Hadoop version -->
<hadoop.version>2.7.6</hadoop.version>
</properties>
<dependencies>
<!-- Scala standard library -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- Unit testing (test scope only) -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.1</version>
<scope>test</scope>
</dependency>
<!-- Hadoop / HDFS client libraries -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- Flink Scala APIs: flink-scala (batch) and flink-streaming-scala (streaming) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_${scala.main.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_${scala.main.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink Kafka connector -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.main.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink Table API & SQL -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-scala-bridge_${scala.main.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner_${scala.main.version}</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
<build>
<!-- Scala sources live under src/main/scala and src/test/scala instead of the Java defaults. -->
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<!-- Scala compiler plugin: runs its compile and testCompile goals as part of the build. -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<!-- Extra arguments handed to the Scala compiler; the dependency file is written under the build directory. -->
<args>
<!--<arg>-make:transitive</arg>-->
<arg>-dependencyfile</arg>
<arg>${project.build.directory}/.scala_dependencies</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<!-- Test runner: only files matching *Test.* or *Suite.* are included. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.6</version>
<configuration>
<useFile>false</useFile>
<disableXmlReport>true</disableXmlReport>
<!-- If you hit classpath problems such as NoClassDefFoundError, try enabling the option below. -->
<!-- useManifestOnlyJar>false</useManifestOnlyJar -->
<includes>
<include>**/*Test.*</include>
<include>**/*Suite.*</include>
</includes>
</configuration>
</plugin>
</plugins>
</build>
</project>
1.2 a.txt(输入源文件)
hello word! flink demo,
hello$ flink flink
1.3 wordcount简单实例
package bounded
import org.apache.flink.api.scala.ExecutionEnvironment
/**
 * Batch WordCount over the local file `input/a.txt`.
 *
 * Each line is stripped of punctuation and symbol characters, split on
 * whitespace, and the occurrences of every non-empty word are summed.
 * The resulting `(word, count)` pairs are printed to standard output.
 */
object Demo_01 {
  def main(args: Array[String]): Unit = {
    // Implicit members of the Scala API package object (needed by the
    // transformations below to derive type information).
    import org.apache.flink.api.scala._

    // Batch execution environment.
    val batchEnv = ExecutionEnvironment.getExecutionEnvironment

    // Source: one record per line of the input file.
    val lines = batchEnv.readTextFile("input/a.txt")

    // Tokenise: drop punctuation (\pP) and symbols (\pS), split on runs of
    // whitespace, and discard empty tokens.
    val words = lines
      .map(line => line.replaceAll("\\pP|\\pS", ""))
      .flatMap(line => line.split("\\s+"))
      .filter(word => word.nonEmpty)

    // Count: pair every word with 1, group on the word (tuple field 0)
    // and sum the counters (tuple field 1).
    val counts = words
      .map(word => (word, 1))
      .groupBy(0)
      .sum(1)

    // Sink: print the (word, count) pairs; no separate env.execute call is made here.
    counts.print()
  }
}
1.4 输出
(demo,1)
(flink,3)
(word,1)
(hello,2)
2.1 wordcount输入输出源都在hdfs(常用)
package com.yuanhanhan.buounded
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala.ExecutionEnvironment
/**
 * Batch WordCount whose input and output locations are given on the
 * command line as `--input <path> --output <path>` (for example HDFS URIs).
 *
 * The result is written as text to the output path with a sink parallelism
 * of 1, so that a single output file is produced.
 */
object Demo_02 {
  def main(args: Array[String]): Unit = {
    // Reject malformed invocations and show the expected argument format.
    if (args == null || args.length != 4) {
      println(
        """
          |Warning: Please enter parameters!
          |Example: --input inputPath --output outputPath
          |""".stripMargin)
      sys.exit(-1)
    }

    // Parse the arguments. getRequired fails with a clear message when a key
    // is missing (e.g. a mistyped "--in"), instead of handing a null path on.
    val tool: ParameterTool = ParameterTool.fromArgs(args)
    val inputPath = tool.getRequired("input")
    val outputPath = tool.getRequired("output")

    // Batch execution environment.
    val env = ExecutionEnvironment.getExecutionEnvironment

    // Implicit members of the Scala API package object.
    import org.apache.flink.api.scala._

    // WordCount pipeline.
    env.readTextFile(inputPath)
      // Remove punctuation (\pP) and symbols (\pS). The pattern must be
      // "\\pP|\\pS": with a doubled "||" the empty middle alternative matches
      // before \pS is ever tried, so symbols such as '$' were left in place.
      .map(_.replaceAll("\\pP|\\pS", ""))
      // Split on runs of whitespace and drop empty tokens so that leading or
      // repeated blanks do not yield an empty "word" in the result.
      .flatMap(_.split("\\s+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .groupBy(0) // group by the word (tuple field 0)
      .sum(1)     // sum the counters (tuple field 1)
      .writeAsText(outputPath)
      .setParallelism(1) // sink parallelism 1 => a single output file

    // The pipeline above is only a lazy plan; execute() actually runs the job.
    env.execute(this.getClass.getSimpleName)
  }
}
2.2 把代码打成jar包上传集群,创建hdfs输入输出文件目录
用idea将项目打包
在hdfs上创建输入、输出目录,并把输入文件上传到hdfs
[root@master ~]# hdfs dfs -mkdir -p /flink/input
[root@master ~]# hdfs dfs -put /root/input/a.txt /flink/input/
[root@master ~]# hdfs dfs -mkdir -p /flink/output
用flink-run方式提交job
flink run -m yarn-cluster -yn 2 -c com.yuanhanhan.buounded.Demo_02 -d /root/jar/flink-1.0-SNAPSHOT.jar --input hdfs://cluster/flink/input/a.txt --output hdfs://cluster/flink/output/a_output
//注:上面路径 hdfs://cluster/... 中的 cluster 是高可用hdfs的入口(集群名),不是目录
验证:
[root@master jar]# hdfs dfs -cat /flink/output/a_output
(demo,1)
(flink,3)
(hello,2)
(word,1)
注意:提交任务到集群上运行每个必须添加一个jar包,否则报错
积分下载地址:download
3.1 端口监听输入数据 wordcount实时处理数据
代码
package com.yuanhanhan.unbounded
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
/**
 * Streaming WordCount that reads lines from a socket and prints a running
 * count for every word.
 *
 * Expected arguments: `--host <address> --port <port>`. The key `--hostname`
 * is accepted as well, because earlier versions of this program read that key.
 */
object Demo_03 {
  def main(args: Array[String]): Unit = {
    val usage =
      """
        |Warning: Please enter parameters!
        |Example: --host 192.168.170.152 --port 6666
        |""".stripMargin

    // Reject malformed invocations; stop here instead of continuing into
    // argument parsing with missing values.
    if (args == null || args.length != 4) {
      println(usage)
      sys.exit(-1)
    }

    // Parse the arguments. The usage text documents "--host" while the code
    // historically read "hostname", so both keys are honoured.
    val tool = ParameterTool.fromArgs(args)
    val hostname: String =
      Option(tool.get("hostname"))
        .orElse(Option(tool.get("host")))
        .getOrElse {
          println(usage)
          sys.exit(-1)
        }
    val port = tool.getInt("port")

    // Streaming execution environment.
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Implicit members of the Scala API package object.
    import org.apache.flink.api.scala._

    // WordCount pipeline.
    env.socketTextStream(hostname, port)
      // Remove punctuation (\pP) and symbols (\pS).
      .map(_.replaceAll("\\pP|\\pS", ""))
      // Split on runs of whitespace and drop empty tokens so that blank lines
      // or leading blanks do not produce an empty "word".
      .flatMap(_.split("\\s+"))
      .filter(_.nonEmpty)
      .map((_, 1))
      .keyBy(0) // key by the word (tuple field 0)
      .sum(1)   // running sum of the counters (tuple field 1)
      .print()

    // Start the job.
    env.execute(this.getClass.getSimpleName)
  }
}
让程序执行的时候携带参数
创建端口并往端口输入数据
[root@master ~]# nc -lk 6666
hello word hello flink
hello flink
验证结果