DataSource operators
// the Scala API's implicit TypeInformation instances are required by all of the snippets below
import org.apache.flink.api.scala._

val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
// fromCollection supports the various concrete Collection types
val datasource1: DataSet[String] = environment.fromCollection(Array("a", "b", "c", "d"))
// fromElements supports Tuples, custom objects, and other composite forms
val datasource2: DataSet[String] = environment.fromElements("a", "b", "c", "d")
// a DataSet generated from a number sequence
val datasource3: DataSet[Long] = environment.generateSequence(1, 20)
// read a local file
val datasource4: DataSet[String] = environment.readTextFile("word.txt")
// read a file from HDFS
val datasource5: DataSet[String] = environment.readTextFile("hdfs://node01:9000/word.txt")
// read a CSV file; the target type can be given directly as the type parameter
case class WordCount(word: String, num: Int)
val datasource6: DataSet[WordCount] = environment.readCsvFile[WordCount]("word.csv")
// read a compressed file; Flink detects the compression type from the file extension and decompresses accordingly
val datasource7 = environment.readTextFile("word.tar.gz")
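readCsvFile also takes optional named parameters for common CSV layouts. A minimal sketch, where the field delimiter and the header assumption are illustrative rather than taken from word.csv:
// sketch only: fieldDelimiter and ignoreFirstLine are assumptions about the file's layout
val datasource8: DataSet[WordCount] = environment.readCsvFile[WordCount](
  "word.csv",
  fieldDelimiter = ",",   // column separator
  ignoreFirstLine = true  // skip a header row
)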
Flink batch transformation operators
Transformation | Description |
---|---|
map | transforms each element of the DataSet into another element |
flatMap | transforms each element of the DataSet into 0…n elements |
mapPartition | transforms all elements of one partition in a single function call |
filter | keeps only the elements that satisfy a predicate |
reduce | aggregates a DataSet or a group down to a single element |
reduceGroup | aggregates a DataSet or a group into one or more elements |
aggregate | aggregates with a built-in function, e.g. SUM/MIN/MAX… |
distinct | removes duplicate elements |
join | connects two DataSets on a given condition, producing a new DataSet |
union | takes the union of two DataSets, without deduplication |
rebalance | redistributes the data evenly across partitions to avoid data skew |
partitionByHash | hash-partitions by the given key |
sortPartition | sorts the data within each partition by the given fields |
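Several of the operators above do not appear in the worked examples that follow. Here is a minimal sketch of distinct, join, union, and mapPartition; the sample data is made up for illustration:
def main(args: Array[String]): Unit = {
  val environment = ExecutionEnvironment.getExecutionEnvironment
  val scores = environment.fromCollection(List(("flink", 1), ("spark", 2), ("spark", 2)))
  val tags = environment.fromCollection(List(("flink", "stream"), ("spark", "batch")))
  // distinct: drop duplicate records
  val unique = scores.distinct()
  // join: connect two DataSets where field 0 of the left side equals field 0 of the right side
  unique.join(tags).where(0).equalTo(0).print()
  // union: concatenate two DataSets of the same type, without deduplication
  scores.union(environment.fromCollection(List(("hive", 3)))).print()
  // mapPartition: transform all elements of one partition in a single call
  scores.mapPartition(partition => partition.map(_._1)).print()
}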
sum operator
def main(args: Array[String]): Unit = {
  val environment = ExecutionEnvironment.getExecutionEnvironment
  val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
  datasource.flatMap(_.split(" ")) // split each line into words
    .filter(_.length > 1)          // keep only words longer than one character
    .map((_, 1))                   // pair each word with an initial count of 1
    .groupBy(0)                    // group by the word (field 0)
    .sum(1)                        // sum the counts (field 1)
    .print()
}
reduce operator
def main(args: Array[String]): Unit = {
  val environment = ExecutionEnvironment.getExecutionEnvironment
  val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
  datasource.flatMap(_.split(" "))
    .filter(_.length > 1)
    .map((_, 1))
    .groupBy(0)
    // pairwise merge within each group: keep the word, add the counts
    .reduce((a, b) => (b._1, a._2 + b._2))
    .print()
}
aggregate operator
// Aggregations holds the built-in aggregation functions (SUM/MIN/MAX)
import org.apache.flink.api.java.aggregation.Aggregations

def main(args: Array[String]): Unit = {
  val environment = ExecutionEnvironment.getExecutionEnvironment
  val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
  datasource.flatMap(_.split(" "))
    .filter(_.length > 1)
    .map((_, 1))
    .groupBy(0)
    .aggregate(Aggregations.SUM, 1) // built-in SUM over field 1
    .print()
}
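The table also lists reduceGroup, which receives all records of a group as one iterator. A minimal sketch of the same word count expressed with reduceGroup, using the same input as above:
def main(args: Array[String]): Unit = {
  val environment = ExecutionEnvironment.getExecutionEnvironment
  val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
  datasource.flatMap(_.split(" "))
    .filter(_.length > 1)
    .map((_, 1))
    .groupBy(0)
    // the group's records arrive as a single iterator; fold them into one (word, count) pair
    .reduceGroup(group => group.reduce((a, b) => (a._1, a._2 + b._2)))
    .print()
}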
Accumulator
import org.apache.flink.api.common.JobExecutionResult
import org.apache.flink.api.common.accumulators.IntCounter
import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration
import org.apache.flink.core.fs.FileSystem

def main(args: Array[String]): Unit = {
  val environment = ExecutionEnvironment.getExecutionEnvironment
  val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
  val res: DataSet[String] = datasource.flatMap(_.split(" "))
    .filter(_.length > 1)
    .map(new RichMapFunction[String, String] {
      // the accumulator instance
      private val intCounter = new IntCounter()

      override def open(parameters: Configuration): Unit = {
        // register the accumulator: param1 is the accumulator name, param2 is the accumulator instance
        getRuntimeContext.addAccumulator("wordCounter", intCounter)
      }

      override def map(in: String): String = {
        // increment the accumulator for every record
        intCounter.add(1)
        in
      }
    })
  res.writeAsText("data/wordCount", FileSystem.WriteMode.OVERWRITE)
  // the accumulator result only becomes available from the JobExecutionResult after the job finishes
  val result: JobExecutionResult = environment.execute("WordCount")
  println(result.getAccumulatorResult("wordCounter"))
}
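Besides IntCounter, Flink ships LongCounter, DoubleCounter, and Histogram in the same org.apache.flink.api.common.accumulators package; registration follows the same pattern. A one-line sketch with a hypothetical accumulator name:
// "recordCounter" is an illustrative name; registration works exactly as above
getRuntimeContext.addAccumulator("recordCounter", new LongCounter())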
Broadcast variable
def main(args: Array[String]): Unit = {
val env = ExecutionEnvironment.getExecutionEnvironment
val score = env.fromCollection(List(("张三", 100),("李四", 500),("王五",1000)))
val student = env.fromCollection(List(("001", "张三"), ("002", "李四"), ("003", "王五")))
student.map(new RichMapFunction[(String, String), (String, String, Int)] {
  var list: List[(String, Int)] = _

  override def open(parameters: Configuration): Unit = {
    import scala.collection.JavaConverters._
    // fetch the broadcast variable registered under the name "score"
    list = getRuntimeContext.getBroadcastVariable[(String, Int)]("score").asScala.toList
  }

  override def map(in: (String, String)): (String, String, Int) = {
    val sname = in._2
    // look up this student's score in the broadcast list
    val tuples = list.filter((x: (String, Int)) => sname == x._1)
    (in._1, in._2, tuples.head._2)
  }
}).withBroadcastSet(score, "score").print()
}
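Note that a broadcast DataSet is materialized in memory on each TaskManager, so it should stay small; large datasets are better handled with a join.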
Distributed cache files
import org.apache.commons.io.FileUtils

def main(args: Array[String]): Unit = {
  // set up the execution environment
  val environment = ExecutionEnvironment.getExecutionEnvironment
  // load a sample DataSet
  val list: DataSet[String] = environment.fromCollection(List("a", "b", "c", "d"))
  // register the file: param1 is the file path, param2 is the cache name
  environment.registerCachedFile("D://work//train_data//word.txt", "word")
  // fetch the cached file in open() of a rich function
  val result = list.map(new RichMapFunction[String, String] {
    override def open(parameters: Configuration): Unit = {
      val myFile = getRuntimeContext.getDistributedCache.getFile("word")
      val lines = FileUtils.readLines(myFile, "UTF-8")
      val it = lines.iterator()
      while (it.hasNext) {
        val line = it.next()
        println("line:" + line)
      }
    }

    override def map(in: String): String = {
      in
    }
  })
  result.print()
}
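The registered path is not limited to the local filesystem. A minimal sketch with a hypothetical HDFS location:
// the hdfs:// URI below is illustrative; any path the cluster can reach works
environment.registerCachedFile("hdfs://node01:9000/word.txt", "wordOnHdfs")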