1.1 Basic RDD transformations with Spark
import org.apache.spark.{SparkConf, SparkContext}

object test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Appname")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(List(('x', 100), ('x', 100), ('c', 105), ('f', 120)))
    val rdd2 = sc.parallelize(List(('x', 109), ('c', 105), ('c', 105), ('o', 103), ('c', 150)))
    // union already returns an RDD; collecting and re-parallelizing it is unnecessary
    val rdd3 = rdd1.union(rdd2)
    println(rdd3.collect().toList)                              // union (duplicates kept)
    println(rdd1.cartesian(rdd2).collect().toList)              // Cartesian product
    println(rdd3.filter(_._2 <= 100).collect().toList)          // keep pairs whose value is at most 100
    println(rdd3.distinct().collect().toList)                   // remove duplicates
    println(rdd1.subtract(rdd2).collect().toList)               // difference: pairs in rdd1 absent from rdd2
    println(rdd1.join(rdd2).collect().mkString("\n"))           // inner join on key
    println(rdd1.rightOuterJoin(rdd2).collect().mkString("\n")) // right outer join
    println(rdd1.leftOuterJoin(rdd2).collect().mkString("\n"))  // left outer join
    println(rdd1.keys.collect().toList)                         // keys of the pair RDD
    println(rdd1.values.collect().toList)                       // values of the pair RDD
  }
}
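All of these operations are lazy transformations; each collect() triggers a job and ships the full result back to the driver, which is fine for a local demo of this size but would not scale to a real dataset.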
1.2 Per-key averages with combineByKey
import org.apache.spark.{SparkConf, SparkContext}

object test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Appname")
    val sc = new SparkContext(conf)
    val data = Array(1, 2, 3, 4, 5)
    val disData = sc.parallelize(data, 4)
    println(disData.partitions.size)  // prints the number of partitions, here 4
    val rdd1 = sc.parallelize(List(('x', 100), ('x', 100), ('c', 105), ('f', 120)))
    val rdd2 = sc.parallelize(List(('x', 109), ('c', 105), ('c', 105), ('o', 103), ('c', 150)))
    val rdd3 = rdd1.union(rdd2)  // union already returns an RDD; no need to collect() and re-parallelize
    // combineByKey accumulates a (sum, count) pair per key
    val cb_test = rdd1.combineByKey(
      count => (count, 1),                                      // createCombiner: first value seen for a key
      (acc: (Int, Int), count) => (acc._1 + count, acc._2 + 1), // mergeValue: fold another value into the accumulator
      (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2)) // mergeCombiners: merge partial results across partitions
    println(cb_test.map(x => (x._1, x._2._1.toDouble / x._2._2)).collect().toList) // sum / count = average per key
}
}
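With the sample rdd1 above, the accumulated (sum, count) pairs are x -> (200, 2), c -> (105, 1), and f -> (120, 1), so the final map should print the per-key averages List((x,100.0), (c,105.0), (f,120.0)), possibly in a different order depending on partitioning.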
1.3 Deduplicating words with Spark
Sample input (text02.txt):
13326293050 81
13626594101 50
13326435696 30
13926265119 40
13326564191 2106
13626544191 1432
13919199499 300
import org.apache.spark.{SparkConf, SparkContext}

object two {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Appname")
    val sc = new SparkContext(conf)
    val test = sc.textFile("/Users/Administrator/Desktop/text02.txt")
    println(test.collect().toList)         // the raw lines
    val data = test.flatMap(_.split(" "))  // split every line into tokens
    println(data.collect().toList)
    val data1 = data.distinct()            // drop duplicate tokens
    println(data1.collect().toList)
  }
}
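Note that every token in the sample file (each phone number and each traffic value) happens to be unique, so on this particular input distinct() returns exactly the tokens it received; the deduplication only becomes visible once the file contains repeated words.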
1.4 Using Spark to total the traffic of numbers starting with 133, 136, and 139
Sample input (text02.txt):
13326293050 81
13626594101 50
13326435696 30
13926265119 40
13326564191 2106
13626544191 1432
13919199499 300
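The sheet stops at the data for this exercise, so what follows is a minimal sketch of one possible solution rather than a prescribed one; the object name four and the file path are placeholders, and it assumes the same format as 1.3 (phone number and traffic separated by a single space). It keys each record by its three-digit prefix, keeps only 133, 136, and 139, and sums the traffic per prefix with reduceByKey.

import org.apache.spark.{SparkConf, SparkContext}

object four {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Appname")
    val sc = new SparkContext(conf)
    // Assumed path, matching the file used in 1.3
    val lines = sc.textFile("/Users/Administrator/Desktop/text02.txt")
    val totals = lines
      .map(_.split(" "))                                    // -> Array(phone, traffic)
      .map(f => (f(0).take(3), f(1).toInt))                 // key each record by its 3-digit prefix
      .filter(p => Set("133", "136", "139").contains(p._1)) // keep only the requested prefixes
      .reduceByKey(_ + _)                                   // total traffic per prefix
    println(totals.collect().toList)
  }
}

On the sample data this should print List((133,2217), (136,1482), (139,340)), though the tuple order may vary.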