/**
 * 1. Process average.txt and compute the mean of the second column.
 * Sample input:
 * 1 16
 * 2 74
 */
package com.dasenlin.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
object Average {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("average")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/average.txt", 2)
    // Parse the second column as Int; cache so sum() and count() don't each re-read the file
    val res = data.map { _.split(" ")(1).toInt }.cache()
    println(res.sum() / res.count())
  }
}
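/*
 * A possible simplification (not from the original; the object name is
 * illustrative): numeric RDDs gain a mean() method via Spark's
 * DoubleRDDFunctions (the implicits are in scope automatically on Spark 1.3+),
 * computing the average in a single job instead of separate sum() and count():
 */
object AverageWithMean {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("average-mean")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/average.txt", 2)
    // mean() comes from the implicit conversion to DoubleRDDFunctions
    println(data.map { _.split(" ")(1).toInt }.mean())
  }
}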
/**
 * 2. Find the median (odd number of elements).
 * Sample input:
 * 1 20 8 2 5 11 29 10
 * 7 4 45 6 23 17 19
 */
object Median {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("median")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/median.txt", 2)
    // Flatten all numbers into one RDD; cache so the count and the sort don't re-parse the file
    val nums = data.flatMap { _.split(" ") }.cache()
    val numcount = nums.count()
    // Sort ascending and take the middle element (valid for an odd count)
    val res = nums.sortBy { _.toInt }.take((numcount.toInt + 1) / 2).last
    println(res)
  }
}
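/*
 * take() ships the first (n+1)/2 sorted elements to the driver, which does not
 * scale to large inputs. A sketch of a driver-friendly variant (not from the
 * original; the object name is illustrative) that uses zipWithIndex to fetch
 * only the middle element:
 */
object MedianByIndex {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("median-index")
    val sc = new SparkContext(conf)
    val nums = sc.textFile("D://scaladata/median.txt", 2).flatMap { _.split(" ") }.map { _.toInt }
    val n = nums.count()
    // Index each element of the sorted RDD, then keep only the middle index
    val median = nums.sortBy(identity).zipWithIndex().filter { _._2 == (n - 1) / 2 }.map { _._1 }.first()
    println(median)
  }
}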
/**
 * 3. Process MaxMin.txt and find the maximum male height (191).
 * Sample input:
 * 1 M 174
 * 2 F 165
 */
object MaxMin {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("maxmin")
    val sc = new SparkContext(conf)
    // Read the file, keep the male rows, and parse the height column
    val data = sc.textFile("D://scaladata/MaxMin.txt", 2)
    val res = data.filter { _.split(" ")(1).equals("M") }.map { _.split(" ")(2).toInt }
    println(res.max())
  }
}
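/*
 * The file name suggests both extremes. A one-pass variant (not from the
 * original; the object name is illustrative) uses stats(), which returns a
 * StatCounter holding count, mean, max and min together:
 */
object MaxMinStats {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("maxmin-stats")
    val sc = new SparkContext(conf)
    val heights = sc.textFile("D://scaladata/MaxMin.txt", 2)
      .filter { _.split(" ")(1).equals("M") }
      .map { _.split(" ")(2).toInt }
    val st = heights.stats() // single job over the data
    println(s"max=${st.max}, min=${st.min}")
  }
}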
/**
 * 4. Process MaxMin.txt and print the whole row holding the maximum male
 * height, i.e. 8 M 191.
 * Sample input:
 * 1 M 174
 * 2 F 165
 */
object Line {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("line")
    val sc = new SparkContext(conf)
    // Read the file, keep male rows, sort by height descending, take the top row
    val data = sc.textFile("D://scaladata/MaxMin.txt", 2)
    val res = data.filter { _.split(" ")(1).equals("M") }
      .map { _.split(" ") }
      .sortBy(x => -x(2).toInt)
      .map { _.mkString(",") }
      .take(1)
    // take(1) returns an Array; print its element, not the array's toString
    res.foreach(println)
  }
}
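/*
 * Sorting the whole RDD just to pick one row is more work than needed. A
 * sketch of an alternative (not from the original; the object name is
 * illustrative): max() with an Ordering on the height column reduces to the
 * single largest row in one pass.
 */
object LineByMax {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("line-max")
    val sc = new SparkContext(conf)
    val top = sc.textFile("D://scaladata/MaxMin.txt", 2)
      .filter { _.split(" ")(1).equals("M") }
      .max()(Ordering.by((line: String) => line.split(" ")(2).toInt))
    println(top)
  }
}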
/**
 * 5. Process topk.txt and find the three most frequent words,
 * e.g. (hive,10) (hadoop,8) (world,4).
 * Sample input:
 * hello world bye world
 * hello hadoop bye hadoop
 */
object Top3 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("wordcount")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/topk.txt", 2)
    /* Alternative: full sort descending by count, then take the first three
    val res = data.flatMap { _.split(" ") }.map { (_, 1) }.reduceByKey(_ + _).sortBy { x => -x._2 }.take(3)
    res.foreach(println)
    */
    // top(k) returns the k largest elements under the given Ordering, so order
    // by the count itself (negating it would return the *least* frequent words)
    val res2 = data.flatMap { _.split(" ") }.map { (_, 1) }.reduceByKey(_ + _).top(3)(Ordering.by { _._2 })
    res2.foreach(println)
  }
}
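/*
 * Equivalent sketch (not from the original; the object name is illustrative):
 * takeOrdered(k) returns the k *smallest* elements under its Ordering, so
 * ordering by the negated count yields the same three most frequent words as
 * top(3) above. top and takeOrdered are mirror images; pick whichever reads
 * more clearly.
 */
object Top3TakeOrdered {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("wordcount-takeordered")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/topk.txt", 2)
    val res = data.flatMap { _.split(" ") }.map { (_, 1) }.reduceByKey(_ + _)
      .takeOrdered(3)(Ordering.by { x => -x._2 })
    res.foreach(println)
  }
}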
/**
 * 6. Secondary sort: order by the first column ascending, breaking ties by
 * the second column descending.
 * Sample input:
 * aa 12
 * bb 32
 */
package com.dasenlin.ssort
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
object Driver {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("ssort")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/ssort.txt", 2)
    // Key each line by an Ssort built from its two columns, keeping the line as the value
    val r1 = data.map { x =>
      val cols = x.split(" ")
      (new Ssort(cols(0), cols(1).toInt), x)
    }
    // Ssort extends Ordered, so sortByKey can use it; then drop the key
    val r2 = r1.sortByKey(true).map(_._2)
    r2.foreach(println)
  }
}
package com.dasenlin.ssort
class Ssort(val col1: String, val col2: Int) extends Ordered[Ssort] with Serializable {
  def compare(that: Ssort): Int = {
    // Primary: col1 ascending
    val result = this.col1.compareTo(that.col1)
    if (result == 0) {
      // Secondary: col2 descending (note the reversed receiver)
      that.col2.compare(this.col2)
    } else {
      result
    }
  }
}
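/*
 * A lighter-weight sketch (not from the original; the object name is
 * illustrative): Scala provides an Ordering for tuples, so the same ordering
 * (col1 ascending, col2 descending) can be expressed with sortBy on a
 * (String, Int) key, without a custom key class:
 */
object DriverTupleKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("ssort-tuple")
    val sc = new SparkContext(conf)
    val data = sc.textFile("D://scaladata/ssort.txt", 2)
    // Negating the Int column turns ascending tuple order into a descending tie-break
    val sorted = data.sortBy { x =>
      val cols = x.split(" ")
      (cols(0), -cols(1).toInt)
    }
    sorted.foreach(println)
  }
}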