Flink批处理Transformation
map
将DataSet中的每一个元素转换为另外一个元素
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
object MapTrans {
  /** Demonstrates the DataSet `map` transformation: each "id,name" string
    * is converted into a `User` instance, then printed. */
  def main(args: Array[String]): Unit = {
    // 1. Obtain the batch ExecutionEnvironment.
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // 2. Build the source from an in-memory collection.
    val source: DataSet[String] = env.fromCollection(List("1,张三", "2,李四", "3,王五", "4,赵六"))
    // 4. Convert every "id,name" record into a User via map.
    val users: DataSet[User] = source.map { record =>
      val fields = record.split(",")
      User(fields(0).toInt, fields(1))
    }
    // 5. Print the result (triggers execution).
    users.print()
  }
}
//3. Define the User case class: one parsed record (id + name).
case class User(id:Int,name:String)
flatMap
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
object FlatmapTrans {
  /** Demonstrates `flatMap`: each "name,country,province,city" record is
    * expanded into three tuples of increasing granularity:
    * (name, country), (name, country, province), (name, country, province, city). */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val datas: DataSet[String] = env.fromCollection(List(
      "张三,中国,江西省,南昌市",
      "李四,中国,河北省,石家庄市",
      "Tom,America,NewYork,Manhattan"
    ))
    // Mixing tuple arities widens the element type to Product with Serializable.
    val data: DataSet[Product with Serializable] = datas.flatMap(x => {
      val ss: Array[String] = x.split(",")
      List(
        (ss(0), ss(1)),
        (ss(0), ss(1), ss(2)),
        // BUG FIX: the 4-tuple previously ended with ss(2) (province repeated);
        // the last element must be the city, ss(3).
        (ss(0), ss(1), ss(2), ss(3)))
    })
    data.print()
  }
}
mapPartition
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
object MapPartitionTrans {
  /** Demonstrates `mapPartition`: parses "id,name" strings into `User`
    * instances one partition at a time, so per-partition setup (e.g. a
    * database connection) can be amortized over many elements. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val source: DataSet[String] = env.fromCollection(List("1,张三", "2,李四", "3,王五", "4,赵六"))
    val users: DataSet[User] = source.mapPartition(
      // A Redis/MySQL connection could be opened here, once per partition...
      records => {
        records.map { record =>
          val parts = record.split(",")
          User(parts(0).toInt, parts(1))
        }
        // ...and closed here, instead of once per element.
      })
    users.print()
  }
}
// User case class: one parsed record (id + name).
case class User(id:Int,name:String)
filter
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
object FilterTrans {
  /** Demonstrates `filter`: keeps only the words containing the letter "h". */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val words: DataSet[String] = env.fromCollection(List("hadoop", "hive", "spark", "flink"))
    val wordsWithH: DataSet[String] = words.filter(word => word.contains("h"))
    wordsWithH.print()
  }
}
reduceGroup
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
object reduceGroup {
  /** Demonstrates `reduceGroup`: word-count style aggregation — each group
    * of (word, count) pairs is reduced into a single (word, totalCount). */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val wordCounts: DataSet[(String, Int)] = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1), ("scala", 1), ("flink", 1), ("pxj", 1)))
    // Group on the word, then collapse each group by summing the counts.
    val summed: DataSet[(String, Int)] = wordCounts
      .groupBy(_._1)
      .reduceGroup { group =>
        group.reduce((left, right) => (left._1, left._2 + right._2))
      }
    summed.print()
  }
}
aggregate
package com.ccj.pxj.heima.tran
/**
* 1. 获取 ExecutionEnvironment 运行环境
* 2. 使用 fromCollection 构建数据源
* 3. 使用 groupBy 按照单词进行分组
* 4. 使用 aggregate 对每个分组进行 SUM 统计
* 5. 打印测试
*/
import org.apache.flink.api.java.aggregation.Aggregations
import org.apache.flink.api.scala._
object AggregateTran{
  /** Demonstrates `aggregate`: groups on the word (field index 0) and SUMs
    * the count (field index 1) within each group. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val wordCounts: DataSet[(String, Int)] = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1), ("scala", 1), ("flink", 1), ("pxj", 1)))
    // Group by field 0 (the word), then sum field 1 (the count).
    val summed: AggregateDataSet[(String, Int)] = wordCounts.groupBy(0).aggregate(Aggregations.SUM, 1)
    summed.print()
  }
}
(pxj,1)
(java,2)
(flink,1)
(scala,2)
Process finished with exit code 0
distinct
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
/**
* 1. 获取 ExecutionEnvironment 运行环境
* 2. 使用 fromCollection 构建数据源
* 3. 使用 distinct 指定按照哪个字段来进行去重
* 4. 打印测试
*
* */
object DistinctTran {
  /** Demonstrates `distinct`: deduplicates the records using only the first
    * tuple field (the word) as the key. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val pairs: DataSet[(String, Int)] = env.fromCollection(List(("java", 1), ("java", 1), ("scala", 1)))
    val unique = pairs.distinct(pair => pair._1)
    unique.print()
  }
}
join
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
object JoinTran {
  /** Demonstrates `join`: joins scores with subjects on the subject id
    * (Score field index 2 equals Subject field index 0). */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val subjects: DataSet[Subject] = env.readCsvFile("./data/subject.csv")
    val scores: DataSet[Score] = env.readCsvFile("./data/score.csv")
    // Join key: Score.subjectId (index 2) == Subject.id (index 0).
    val joined: JoinDataSet[Score, Subject] = scores.join(subjects).where(2).equalTo(0)
    joined.print()
  }
}
// Subject record: (subject ID, subject name)
case class Subject(id:Int, name:String)
// Score record: (unique ID, student name, subject ID, score)
case class Score(id:Int, name:String, subjectId:Int, score:Double)
(Score(3,张三,3,89.0),Subject(3,英语))
(Score(9,李四,3,65.0),Subject(3,英语))
(Score(27,小七,3,78.0),Subject(3,英语))
(Score(21,赵六,3,65.0),Subject(3,英语))
(Score(15,王五,3,58.0),Subject(3,英语))
(Score(1,张三,1,98.0),Subject(1,语文))
(Score(11,李四,5,70.0),Subject(5,化学))
(Score(23,赵六,5,70.0),Subject(5,化学))
(Score(5,张三,5,78.0),Subject(5,化学))
(Score(6,张三,6,70.0),Subject(6,生物))
(Score(19,赵六,1,77.5),Subject(1,语文))
(Score(12,李四,6,78.0),Subject(6,生物))
(Score(13,王五,1,70.0),Subject(1,语文))
(Score(24,赵六,6,78.0),Subject(6,生物))
(Score(25,小七,1,78.0),Subject(1,语文))
(Score(29,小七,5,65.0),Subject(5,化学))
(Score(30,小七,6,78.0),Subject(6,生物))
(Score(17,王五,5,78.0),Subject(5,化学))
(Score(18,王五,6,98.0),Subject(6,生物))
(Score(7,李四,1,78.0),Subject(1,语文))
(Score(10,李四,4,78.0),Subject(4,物理))
(Score(22,赵六,4,78.0),Subject(4,物理))
(Score(28,小七,4,58.0),Subject(4,物理))
(Score(4,张三,4,65.0),Subject(4,物理))
(Score(16,王五,4,65.0),Subject(4,物理))
(Score(2,张三,2,77.5),Subject(2,数学))
(Score(20,赵六,2,89.0),Subject(2,数学))
(Score(14,王五,2,78.0),Subject(2,数学))
(Score(26,小七,2,70.0),Subject(2,数学))
(Score(8,李四,2,58.0),Subject(2,数学))
Process finished with exit code 0
union
package com.ccj.pxj.heima.tran
import org.apache.flink.api.scala._
/**
* 1. 构建批处理运行环境
* 2. 使用 fromCollection 创建两个数据源
* 3. 使用 union 将两个数据源关联在一起
* 4. 打印测试
* */
object UnionTran {
  /** Demonstrates `union`: combines two DataSets into one and prints it. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    val first: DataSet[String] = env.fromCollection(List("hadoop", "hive", "flume"))
    val second: DataSet[String] = env.fromCollection(List("hadoop", "hive", "spark"))
    first.union(second).print()
  }
}
rebalance
Flink 也会产生数据倾斜。例如:当前的数据量有 10 亿条,在处理过程中就可能出现部分分区的数据远多于其他分区的状况。
rebalance 会使用轮询的方式将数据均匀打散,这是处理数据倾斜最好的选择。
package com.ccj.pxj.heima.tran
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.scala._
/*
*1. 构建批处理运行环境
2. 使用 env.generateSequence 创建0-100的并行数据
3. 使用 fiter 过滤出来 大于8 的数字
4. 使用map操作传入 RichMapFunction ,将当前子任务的ID和数字构建成一个元组
*/
object RebanlanceTrans {
  /** Demonstrates `rebalance`: tags each surviving element with the index
    * of the subtask that processed it, first without and then with
    * round-robin redistribution, so the two distributions can be compared. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // 0..100 generated in parallel.
    val numbers: DataSet[Long] = env.generateSequence(0, 100)
    // Filtering removes 0..8, leaving some partitions with fewer elements.
    val skewed: DataSet[Long] = numbers.filter(_ > 8)
    // Same filter, followed by a round-robin redistribution.
    val balanced: DataSet[Long] = numbers.filter(_ > 8).rebalance()
    // Pair every value with the subtask index that handled it.
    val taggedSkewed: DataSet[(Long, Long)] = skewed.map(new RichMapFunction[Long, (Long, Long)] {
      override def map(value: Long): (Long, Long) =
        (getRuntimeContext.getIndexOfThisSubtask, value)
    })
    taggedSkewed.print()
    println("-------------------------------")
    val taggedBalanced: DataSet[(Long, Long)] = balanced.map(new RichMapFunction[Long, (Long, Long)] {
      override def map(value: Long): (Long, Long) =
        (getRuntimeContext.getIndexOfThisSubtask, value)
    })
    taggedBalanced.print()
  }
}
(0,61)
(0,62)
(0,63)
(0,64)
(0,65)
(0,66)
(0,67)
(0,68)
(1,77)
(1,78)
(1,79)
(1,80)
(1,81)
(1,82)
(1,83)
(1,84)
(2,36)
(2,37)
(2,38)
(2,39)
(2,40)
(2,41)
(2,42)
(2,43)
(2,44)
(3,93)
(3,94)
(3,95)
(3,96)
(3,97)
(3,98)
(3,99)
(3,100)
(4,53)
(4,54)
(4,55)
(4,56)
(4,57)
(4,58)
(4,59)
(4,60)
(5,27)
(5,28)
(5,29)
(5,30)
(5,31)
(5,32)
(5,33)
(5,34)
(5,35)
(7,9)
(7,10)
(7,11)
(7,12)
(7,13)
(7,14)
(7,15)
(7,16)
(7,17)
(8,45)
(8,46)
(8,47)
(8,48)
(8,49)
(8,50)
(8,51)
(8,52)
(9,85)
(9,86)
(9,87)
(9,88)
(9,89)
(9,90)
(9,91)
(9,92)
(10,69)
(10,70)
(10,71)
(10,72)
(10,73)
(10,74)
(10,75)
(10,76)
(11,18)
(11,19)
(11,20)
(11,21)
(11,22)
(11,23)
(11,24)
(11,25)
(11,26)
-------------------------------
(0,72)
(0,60)
(0,11)
(0,29)
(0,94)
(0,82)
(0,19)
(1,85)
(1,73)
(1,36)
(1,12)
(1,30)
(1,95)
(1,83)
(1,20)
(2,86)
(2,74)
(2,37)
(2,13)
(2,61)
(2,31)
(2,96)
(2,84)
(2,21)
(3,87)
(3,75)
(3,38)
(3,14)
(3,62)
(3,32)
(3,97)
(3,45)
(3,22)
(4,88)
(4,76)
(4,39)
(4,15)
(4,63)
(4,33)
(4,98)
(4,46)
(4,23)
(5,89)
(5,53)
(5,40)
(5,16)
(5,64)
(5,34)
(5,99)
(5,47)
(5,24)
(6,90)
(6,54)
(6,41)
(6,17)
(6,65)
(6,35)
(6,100)
(6,48)
(6,25)
(7,91)
(7,55)
(7,42)
(7,66)
(7,77)
(7,49)
(7,26)
(8,92)
(8,56)
(8,43)
(8,67)
(8,78)
(8,50)
(9,69)
(9,57)
(9,44)
(9,68)
(9,79)
(9,51)
(10,70)
(10,58)
(10,9)
(10,27)
(10,80)
(10,52)
(11,71)
(11,59)
(11,10)
(11,28)
(11,93)
(11,81)
(11,18)
hashPartition
package com.ccj.pxj.heima.tran
import org.apache.flink.api.java.operators.DataSink
import org.apache.flink.api.scala._
/**
* 1. 构建批处理运行环境
* 2. 设置并行度为 2
* 3. 使用 fromCollection 构建测试数据集
* 4. 使用 partitionByHash 按照字符串的hash进行分区
* 5. 调用 writeAsText 写入文件到 data/parition_output 目录中
* 6. 打印测试
*/
object HashPartitionTran {
  /** Demonstrates `partitionByHash`: partitions the elements by the hash of
    * their string form, writes the partitions to disk and prints them. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // Two parallel subtasks so the hash partitioning is observable.
    env.setParallelism(2)
    val datas: DataSet[Int] = env.fromCollection(List(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2))
    val datapart = datas.partitionByHash(_.toString)
    // FIX: the DataSink returned by writeAsText was bound to a val that was
    // never used; register the sink without the dead binding.
    datapart.writeAsText("./data/parition_output")
    datapart.print()
  }
}
sortPartition
package com.ccj.pxj.heima.tran
import org.apache.flink.api.common.operators.Order
import org.apache.flink.api.scala._
/*
1. 构建批处理运行环境
2. 使用 fromCollection 构建测试数据集
3. 设置数据集的并行度为 2
4. 使用 sortPartition 按照字符串进行降序排序
5. 调用 writeAsText 写入文件到 data/sort_output2 目录中
6. 启动执行
*/
object sortPartition {
  /** Demonstrates `sortPartition`: sorts the elements within each partition
    * in descending order, writes the result to disk and prints it. */
  def main(args: Array[String]): Unit = {
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2)
    val datas: DataSet[String] = env.fromCollection(List("hadoop", "hadoop", "hadoop", "hive", "hive", "spark", "spark", "flink"))
    // Sort each partition on the element itself, descending.
    val dastass: DataSet[String] = datas.sortPartition(_.toString, Order.DESCENDING)
    dastass.writeAsText("./data/sort_output2")
    // FIX: print the sorted dataset — the original printed the unsorted
    // `datas`, so the effect of sortPartition was never visible.
    dastass.print()
    // NOTE(review): if print() does not also materialize the writeAsText
    // sink, env.execute("pxj") must be re-enabled — confirm against the
    // Flink DataSet execution semantics.
    // env.execute("pxj")
  }
}
作者:pxj
日期:2021-07-25 23:01:58