1.Flink读取Text文件,实现Word Count示例
package com.mkluo.flink
import org.apache.flink.api.scala.ExecutionEnvironment
/**
 * @author luomingkui
 * @date 2019/7/7 下午2:44
 * @desc Batch WordCount: reads a text file, splits lines into words,
 *       and counts occurrences of each word using the DataSet API.
 */
object BatchWcApp {
  def main(args: Array[String]): Unit = {
    // 1. Build the batch execution environment.
    val env = ExecutionEnvironment.getExecutionEnvironment
    // flatMap/map on DataSet require the Scala API implicits in scope.
    import org.apache.flink.api.scala._
    // 2. Source: read the input file line by line.
    val lines: DataSet[String] =
      env.readTextFile("/Users/luomingkui/Downloads/CaseData/test/helloworlds.txt")
    // 3. Tokenize each line, pair every word with 1, group by the word
    //    (tuple field 0) and sum the counts (tuple field 1).
    val counts: AggregateDataSet[(String, Int)] =
      lines
        .flatMap(_.split(" "))
        .map(word => (word, 1))
        .groupBy(0)
        .sum(1)
    // 4. Print the result (NOTE: DataSet.print() eagerly triggers a job).
    counts.print()
    // 5. Also write the result as CSV; this sink runs when execute() is called.
    counts.writeAsCsv("/Users/luomingkui/Downloads/data")
    env.execute()
  }
}
2.Flink读取Socket流,实现Word Count示例
package com.mkluo.flink
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
/**
 * @author luomingkui
 * @date 2019/7/7 下午4:36
 * @desc Streaming WordCount: reads a socket text stream and maintains a
 *       running count per word using the DataStream API.
 */
object StreamWcApp {
  def main(args: Array[String]): Unit = {
    // Host/port could instead be taken from the command line, e.g.:
    //val tool: ParameterTool = ParameterTool.fromArgs(args)
    //val host: String = tool.get("hadoop102")
    //val port: Int = tool.get("7777").toInt
    // Build the streaming execution environment.
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Run the job with two parallel task slots.
    env.setParallelism(2)
    // Source: plain-text lines received over a socket.
    val lines: DataStream[String] = env.socketTextStream("hadoop102",7777)
    // flatMap/map on DataStream require the Scala API implicits in scope.
    import org.apache.flink.api.scala._
    // Tokenize, drop empty tokens, pair each word with 1, key by the word
    // (tuple field 0) and keep a rolling sum of the counts (field 1).
    val counts: DataStream[(String, Int)] =
      lines
        .flatMap(_.split(" "))
        .filter(_.nonEmpty)
        .map(word => (word, 1))
        .keyBy(0)
        .sum(1)
    // Emit the running counts to stdout.
    counts.print()
    // A streaming job only starts once execute() is called.
    env.execute()
  }
}
• 测试:在 Linux 系统中用 `nc -lk 7777` 命令发送测试数据进行验证