Prepare the data
cat data.txt
hadoop spark
hadoop sweet
bomb you have a good book
do you know ha hadoop
Code for the four approaches
import java.lang
import org.apache.flink.api.common.functions.{RichGroupReduceFunction, RichReduceFunction}
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.util.Collector

import scala.collection.JavaConverters._

object WordCount {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // data.txt must be on the classpath, e.g. under src/main/resources
    val path = this.getClass.getResource("/data.txt").getPath
    val text = env.readTextFile(path).setParallelism(2)
    // split each line into words, emit (word, 1), then group by the word (field 0)
    val group_ds = text.flatMap(_.split(" ")).map((_, 1))
      .groupBy(0)

    // Approach 1: the reduce operator with a lambda
    group_ds.reduce((x, y) => (x._1, x._2 + y._2))
      .print()

    // Approach 2: a custom (rich) ReduceFunction
    group_ds.reduce(new RichReduceFunction[(String, Int)] {
      override def open(parameters: Configuration): Unit = super.open(parameters)
      override def reduce(value1: (String, Int), value2: (String, Int)): (String, Int) = {
        (value1._1, value1._2 + value2._2)
      }
    }).print()

    // Approach 3: the reduceGroup operator with a lambda
    group_ds.reduceGroup((input: Iterator[(String, Int)], out: Collector[(String, Int)]) => {
      val output_value = input.reduce((x, y) => (x._1, x._2 + y._2))
      out.collect(output_value)
    }).print()

    // Approach 4: a custom (rich) GroupReduceFunction
    group_ds.reduceGroup(new RichGroupReduceFunction[(String, Int), (String, Int)] {
      override def open(parameters: Configuration): Unit = super.open(parameters)
      override def reduce(values: lang.Iterable[(String, Int)], out: Collector[(String, Int)]): Unit = {
        // convert the java.lang.Iterable to a Scala iterator before reducing
        val output_value = values.iterator().asScala.reduce((x, y) => (x._1, x._2 + y._2))
        out.collect(output_value)
      }
    }).print()
  }
}
Run output (each of the four approaches prints the same counts; line order may vary across runs):
(know,1)
(sweet,1)
(a,1)
(bomb,1)
(hadoop,3)
(you,2)
(book,1)
(ha,1)
(do,1)
(good,1)
(have,1)
(spark,1)
Usage notes
Approaches 1 and 3 are the most commonly used. Approaches 2 and 4 provide richer and more powerful capabilities: for example, when the computation needs to talk to a database, we can create the connections in the open method of the custom rich function and release them in the close method, as sketched below. Choose whichever approach fits the actual requirements.
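Here is a minimal sketch of that open/close lifecycle. The JDBC URL, credentials, table, and the RichMapFunction enrichment step are illustrative placeholders I'm assuming for the example, not part of the word-count job above.

import java.sql.{Connection, DriverManager}

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration

// Hypothetical enrichment step: look up a category for each word in a database.
// The JDBC URL, credentials, and query are placeholders.
class EnrichWithDb extends RichMapFunction[(String, Int), (String, Int, String)] {
  @transient private var conn: Connection = _

  override def open(parameters: Configuration): Unit = {
    // runs once per parallel task instance, before any record is processed
    conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/demo", "user", "password")
  }

  override def map(value: (String, Int)): (String, Int, String) = {
    val stmt = conn.prepareStatement("SELECT category FROM words WHERE word = ?")
    stmt.setString(1, value._1)
    val rs = stmt.executeQuery()
    val category = if (rs.next()) rs.getString(1) else "unknown"
    rs.close()
    stmt.close()
    (value._1, value._2, category)
  }

  override def close(): Unit = {
    // runs once when the task shuts down; release the connection here
    if (conn != null) conn.close()
  }
}

With a MySQL JDBC driver on the classpath, it would plug into the job above as, for example, group_ds.reduce((x, y) => (x._1, x._2 + y._2)).map(new EnrichWithDb).print().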