Flink处理数据
目录结构
pom文件
<!-- Flink batch (DataSet) Scala API, compiled against Scala 2.12 -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.12</artifactId>
<version>1.9.1</version>
</dependency>
<!-- Flink streaming (DataStream) Scala API, compiled against Scala 2.12 -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.12</artifactId>
<version>1.9.1</version>
</dependency>
1.模块处理数据
读取本地文件helloword.txt中的数据,并进行wordcount操作
helloword.txt文件中的数据
hello flink
hello scala
hello world
package com.thomas.wc
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.api.scala._
/**
 * Batch word count: reads a local text file (path supplied via the
 * `--path` program argument) and prints per-word counts using the
 * Flink DataSet API.
 *
 * NOTE(review): the object name contains a typo ("WrodCount" instead of
 * "WordCount"); it is kept as-is so existing run configurations and
 * references keep working.
 *
 * @author Thomas
 * @date 2020/2/9 15:27
 * @version 1.0
 */
object WrodCount {
  def main(args: Array[String]): Unit = {
    // Parse program arguments; fail fast with a clear message if --path
    // is missing (ParameterTool.get returns null for absent keys).
    val params: ParameterTool = ParameterTool.fromArgs(args)
    val inputPath: String = params.get("path")
    require(inputPath != null, "Missing required argument: --path <input-file>")
    // Create the batch execution environment.
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // Read the input file as a DataSet of lines.
    val inputDataSet: DataSet[String] = environment.readTextFile(inputPath)
    // Tokenize on single spaces, drop empty tokens (consistent with the
    // streaming StreamWordCount job elsewhere in this file, and avoids a
    // bogus ("", n) entry when lines contain consecutive spaces), then
    // count occurrences per word.
    val wordCount = inputDataSet
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .map((_, 1))
      .groupBy(0) // group by the word (tuple field 0)
      .sum(1)     // sum the counts (tuple field 1)
    // print() both triggers execution and writes the result to stdout.
    wordCount.print()
  }
}
在 IDEA 的运行配置(Program arguments)中设置 --path 参数的值
2.流式处理数据
实时处理数据
数据源:使用 nc -lk 9999 在虚拟机中监听一个端口,向程序发送测试数据
package com.thomas.wc
import java.security.Policy.Parameters
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala._
/**
 * Streaming word count: connects to a text socket source (host and port
 * supplied via `--host` / `--port` program arguments) and maintains a
 * running count per word using the Flink DataStream API.
 *
 * @author Thomas
 * @date 2020/2/9 15:41
 * @version 1.0
 */
object StreamWordCount {
  def main(args: Array[String]): Unit = {
    // Read the socket endpoint from the program arguments.
    val tool: ParameterTool = ParameterTool.fromArgs(args)
    val sourceHost: String = tool.get("host")
    val sourcePort: Int = tool.getInt("port")

    // Set up the streaming execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // Pipeline: read lines from the socket, split on spaces, discard
    // blank tokens, pair each word with 1, key by the word (tuple field 0)
    // and keep a running sum of the counts (tuple field 1).
    val counts: DataStream[(String, Int)] =
      env.socketTextStream(sourceHost, sourcePort)
        .flatMap(_.split(" "))
        .filter(_.nonEmpty)
        .map(word => (word, 1))
        .keyBy(0)
        .sum(1)

    // Emit each updated count to stdout.
    counts.print()

    // The job graph is built lazily; nothing runs until execute() is called.
    env.execute("Hello Flink Stream WordCount")
  }
}