Common Hadoop algorithms implemented in Spark

import org.apache.spark.{SparkConf, SparkContext}

object MRInSpark {
/**
 * Finding the maximum and minimum has long been a classic Hadoop example. Implementing
 * it in Spark gives a feel for how the MapReduce idea is expressed in Spark.
 */
def maxMin = {
val sconf = new SparkConf().setAppName("avgTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val foo = sc.parallelize(List(1, 6, 4, 22))
val max = foo.reduce((a, b) => Math.max(a, b))
val min = foo.reduce((a, b) => Math.min(a, b))
print(s"max=$max, min=$min")
}
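/**
 * A minimal sketch added for illustration (not part of the original post): maxMin above
 * scans the RDD twice; aggregate can compute both values in a single pass. The method
 * name maxMinOnePass is just an illustrative choice.
 */
def maxMinOnePass = {
val sconf = new SparkConf().setAppName("maxMinOnePassTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val foo = sc.parallelize(List(1, 6, 4, 22))
//the accumulator is a (max, min) pair, starting from the neutral elements
val (max, min) = foo.aggregate((Int.MinValue, Int.MaxValue))(
(acc, v) => (Math.max(acc._1, v), Math.min(acc._2, v)),
(a, b) => (Math.max(a._1, b._1), Math.min(a._2, b._2))
)
println(s"max=$max, min=$min")
}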
/**
 * The average problem.
 * Computing the average value for each key is another common example. In Spark this
 * kind of problem is usually handled with combineByKey; see the Spark documentation
 * for the full signature. The code:
 */
def avg = {
val sconf = new SparkConf().setAppName("avgTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val foo = sc.parallelize(List(("a", 1), ("a", 3), ("b", 2), ("b", 8)))
val result = foo.combineByKey(
//createCombiner: the first value seen for a key within a partition becomes a (sum, count) pair, so the count starts at 1
(v) => (v, 1),
//mergeValue: acc is the (sum, count) pair built so far; when the same key appears again, add its value and bump the count
(acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1),
//mergeCombiners: merge the partial (sum, count) pairs coming from different partitions
(acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)
).map { case (k, v) => (k, v._1 / v._2.toDouble) }
result.collect().foreach(println)
}
/**
 * The same per-key average written with groupByKey, for comparison with combineByKey.
 */
def avgTwo = {
val sconf = new SparkConf().setAppName("avgTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val foo = sc.parallelize(List(("a", 1), ("a", 3), ("b", 2), ("b", 8)))
val result = foo.groupByKey().map { case (k, vs) => (k, vs.sum / vs.size.toDouble) }
result.collect().foreach(println)
}
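/**
 * A minimal sketch added for illustration (not part of the original post): the same
 * per-key average with mapValues + reduceByKey, which, like combineByKey, combines
 * values on the map side instead of shipping every value the way groupByKey does.
 * The method name avgReduceByKey is just an illustrative choice.
 */
def avgReduceByKey = {
val sconf = new SparkConf().setAppName("avgTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val foo = sc.parallelize(List(("a", 1), ("a", 3), ("b", 2), ("b", 8)))
//pair every value with a count of 1, then sum the pairs per key
val sumCount = foo.mapValues(v => (v, 1)).reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
//divide the sum by the count to get the average
val result = sumCount.mapValues { case (sum, count) => sum / count.toDouble }
result.collect().foreach(println)
}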
/**
 * The top-n problem is another classic Hadoop example of the MapReduce idea. Here is a
 * quick and convenient way to solve it in Spark:
 */
def topn = {
val sconf = new SparkConf().setAppName("avgTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val foo = sc.parallelize(List(("a", 1), ("a", 3), ("a", 2), ("b", 1), ("b", 4), ("a", 4), ("b", 2)))
val groupSort = foo.groupByKey().map {
case (k, values) =>
//n is fixed to 2 in this example
val sortValues = values.toList.sortWith(_ > _).take(2)
(k, sortValues)
}
groupSort.flatMap { case (k, vs) => vs.map(k -> _) }.collect().foreach(println)
}
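/**
 * A minimal sketch added for illustration (not part of the original post): groupByKey
 * materialises every value of a key before sorting, which can be costly for large keys;
 * aggregateByKey can instead keep only a bounded buffer of the n largest values.
 * The method name topnAggregate is just an illustrative choice; n is again 2.
 */
def topnAggregate = {
val sconf = new SparkConf().setAppName("topnTest").setMaster("local[2]")
val sc = new SparkContext(sconf)
val n = 2
val foo = sc.parallelize(List(("a", 1), ("a", 3), ("a", 2), ("b", 1), ("b", 4), ("a", 4), ("b", 2)))
val topPerKey = foo.aggregateByKey(List.empty[Int])(
//within a partition keep only the n largest values seen so far
(acc, v) => (v :: acc).sorted(Ordering[Int].reverse).take(n),
//merge the per-partition buffers, again keeping only the n largest
(a, b) => (a ++ b).sorted(Ordering[Int].reverse).take(n)
)
topPerKey.flatMap { case (k, vs) => vs.map(k -> _) }.collect().foreach(println)
}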
def main(args: Array[String]): Unit = {
avg
}
}