package spark.examples.avg

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._

object SparkAvg {
  def main(args: Array[String]) {
    // Point at a local Hadoop installation (only needed when running on Windows)
    System.setProperty("hadoop.home.dir", "E:\\devsoftware\\hadoop-2.5.2\\hadoop-2.5.2")
    val conf = new SparkConf()
    conf.setAppName("SparkAvg")
    conf.setMaster("local[3]")
    conf.set("spark.shuffle.manager", "sort")
    val sc = new SparkContext(conf)

    // 9 elements spread across 3 partitions
    val a = sc.parallelize(1 to 9, 3)

    // Emit a single (count, sum) pair per partition
    def func(iter: Iterator[Int]): Iterator[(Int, Int)] = {
      var res = List[(Int, Int)]()
      var count = 0
      var sum: Int = 0
      while (iter.hasNext) {
        count += 1
        sum += iter.next
      }
      res = (count, sum) :: res
      res.iterator
    }

    // Collect the per-partition (count, sum) pairs and combine them on the driver
    var sum = 0
    var count = 0
    val entries = a.mapPartitions(func).collect
    for (entry <- entries) {
      count += entry._1
      sum += entry._2
    }
    println("count: " + count + ", sum: " + sum)
    sc.stop
  }
}
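
The same count and sum can also be folded in a single pass with RDD.aggregate, which combines elements inside each partition and then merges the per-partition accumulators on the driver. A minimal sketch, assuming the RDD a and the SparkContext from the example above:

// Sketch: the zero value is (count, sum); the first function folds one element
// into the partition-local accumulator, the second merges two accumulators.
val (cnt, total) = a.aggregate((0, 0))(
  (acc, v) => (acc._1 + 1, acc._2 + v),
  (x, y) => (x._1 + y._1, x._2 + y._2)
)
println("avg: " + total.toDouble / cnt)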
Word frequency topK
package spark.examples.avg

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._

object SparkTopK {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SparkTopK").setMaster("local")
    val sc = new SparkContext(conf)

    // Top 4 of a plain list of numbers
    val rdd = sc.parallelize(List(100, 32, 67, 17, 7, 71, 38))
    val results = rdd.top(4)
    results.foreach(println)

    // Word-count TopK: count each word, swap to (count, word), take the 3 largest.
    // top(3) already uses the tuple ordering, so the sortByKey step is optional here.
    val words = sc.parallelize(List("This is a book", "That is a desk", "what is That"))
    val results2 = words.flatMap(_.split(" "))
      .map(x => (x, 1))
      .reduceByKey(_ + _)
      .map(x => (x._2, x._1))
      .sortByKey(true, 1)
      .top(3)
    results2.foreach(println)

    sc.stop
  }
}
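
When only the K heaviest words are needed, the global sortByKey can be skipped entirely: top accepts an explicit Ordering, so each partition only keeps K candidates before the results are merged. A minimal sketch, reusing the words RDD from the example above:

// Sketch: top 3 (word, count) pairs compared by their count, no full sort.
val top3 = words.flatMap(_.split(" "))
  .map(w => (w, 1))
  .reduceByKey(_ + _)
  .top(3)(Ordering.by[(String, Int), Int](_._2))
top3.foreach(println)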