- FlatMap
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object FlatMapFunction01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some mock test data
    val text = env.fromElements("flink hadoop", "spark hive")
    // Use flatMap to split the data; the function is applied to each input element
    val text2 = text.flatMap(_.split("\\s+"))
    text2.print()
  }
}
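flatMap is not limited to splitting strings; it can emit any number of records of a different type per input element. A minimal sketch, assuming the text DataSet from above, where each line becomes several (word, length) tuples:

    // Emit one (word, length) tuple per word found in the line
    val wordLens = text.flatMap(line => line.split("\\s+").map(word => (word, word.length)))
    wordLens.print()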
- Map
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object MapFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some mock test data
    val text = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    // Convert each word to upper case and compute its length
    val res1 = text.map(str => (str.toUpperCase(), str.trim.length))
    res1.print()
    // The same mapping, but emitting a case class instead of a tuple
    val res2 = text.map(line => LineCount(line.toUpperCase(), line.length))
    res2.print()
  }
}
case class LineCount(line: String, count: Int) {
  override def toString: String = line + " " + count
}
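One practical benefit of mapping into a case class is that downstream operators can refer to fields by name instead of by tuple index. A minimal sketch, assuming the res2 DataSet from above:

    // Key by the case-class field name and keep the first element of each group
    val firstPerLine = res2.groupBy("line").first(1)
    firstPerLine.print()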
- MapPartition
import java.lang
import org.apache.flink.api.common.functions.MapPartitionFunction
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector
object MapPartitionFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some mock test data
    val text = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    // Count elements at partition granularity:
    // mapPartition applies the function once per partition of the DataSet, producing another DataSet.
    // It suits ungrouped data; to transform individual elements, map is the better choice.
    // In MapPartitionFunction[String, Long], String is the input element type and Long the output type (a count).
    val text2 = text.mapPartition(new MapPartitionFunction[String, Long]() {
      override def mapPartition(iterable: lang.Iterable[String], collector: Collector[Long]): Unit = {
        var count = 0
        val iterator = iterable.iterator()
        while (iterator.hasNext) {
          iterator.next()
          count += 1
        }
        collector.collect(count)
      }
    })
    text2.print()
    // Prepend a prefix to every element
    val text3 = text.mapPartition(new MapPartitionFunction[String, String] {
      override def mapPartition(values: lang.Iterable[String], out: Collector[String]): Unit = {
        val iterator = values.iterator()
        while (iterator.hasNext) {
          var str = iterator.next()
          str = "prefix-" + str
          out.collect(str)
        }
      }
    })
    text3.print()
  }
}
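The anonymous MapPartitionFunction above can be written more compactly with the lambda overload of mapPartition in the Scala API, which hands the function an Iterator per partition. A minimal sketch, assuming the same text DataSet:

    // Lambda form: emit one Long count per partition
    val countPerPartition = text.mapPartition(it => Iterator(it.size.toLong))
    countPerPartition.print()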
- Reduce
import org.apache.flink.api.common.functions.ReduceFunction
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
object ReduceFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some mock test data
    val text = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    // reduce() combines the input elements pairwise with user-defined logic and returns a single result
    val text2 = text.reduce((str1, str2) => str1.concat(str2))
    text2.print()
    println("------------------------------------------------")
    val text3 = text.reduce(new ReduceFunction[String] {
      override def reduce(value1: String, value2: String): String = {
        println("The first value to combine:" + value1)
        println("The second value to combine:" + value2)
        value1.concat(value2)
      }
    })
    text3.print()
  }
}
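In practice, reduce is most often applied per key after a groupBy, where it combines the elements of each group pairwise. A minimal sketch, assuming the same text DataSet, that sums a count per word:

    // Per-key reduce: sum the counts of identical words
    val wordCounts = text.map((_, 1)).groupBy(0).reduce((a, b) => (a._1, a._2 + b._2))
    wordCounts.print()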
- ReduceGroup
import java.lang
import org.apache.flink.api.common.functions.GroupReduceFunction
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.util.Collector
object ReduceGroupFunction01 {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    // Create some mock test data
    val text = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    // The usual pattern: group first, then compute over the elements of each group
    val text2 = text.map((_, 1)).groupBy(0).reduceGroup(new GroupReduceFunction[(String, Int), (String, Int)] {
      override def reduce(values: lang.Iterable[(String, Int)], out: Collector[(String, Int)]): Unit = {
        val iterator = values.iterator()
        var word = ""
        var cnt = 0
        while (iterator.hasNext) {
          val item = iterator.next()
          word = item._1
          cnt += item._2
        }
        out.collect((word, cnt))
      }
    })
    text2.print()
  }
}
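The Scala API also accepts a closure for reduceGroup, and for a plain sum the built-in aggregation is shorter still. A minimal sketch under the same setup:

    // Closure form of reduceGroup over each group's iterator
    val viaClosure = text.map((_, 1)).groupBy(0)
      .reduceGroup(values => values.reduce((a, b) => (a._1, a._2 + b._2)))
    viaClosure.print()
    // Equivalent result with the built-in sum aggregation on field 1
    val viaSum = text.map((_, 1)).groupBy(0).sum(1)
    viaSum.print()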
- Join
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
object JoinFunction01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    val stuDataSet = env.fromElements(
      (1, "张三", "男", 21),
      (2, "彭霞", "女", 18),
      (3, "李四", "男", 20),
      (4, "李莉", "女", 23),
      (5, "倩倩", "女", 21)
    )
    val scoreDataSet = env.fromElements(
      (1, 90),
      (2, 84),
      (3, 80),
      (4, 92),
      (5, 87)
    )
    // where() specifies the join key of the left DataSet, equalTo() the join key of the right DataSet
    val res = stuDataSet.join(scoreDataSet)
      .where(0)
      .equalTo(0)
    res.print()
  }
}
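The join above returns pairs of the matched tuples, i.e. ((id, name, gender, age), (id, score)). A join function can project each pair into a flatter shape; a minimal sketch, assuming the two DataSets defined above:

    // Project each matched pair to (name, score) with a join function
    val nameScore = stuDataSet.join(scoreDataSet)
      .where(0)
      .equalTo(0)
      .apply((stu, score) => (stu._2, score._2))
    nameScore.print()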