Chapter 3
1. parallelize()
Create an RDD and check its number of partitions
package test
import org.apache.spark.{SparkConf, SparkContext}
object d {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc = new SparkContext(conf)
    val data = Array(1, 2, 3, 4, 5)
    //val distData = sc.parallelize(data)  // without a second argument, the default number of partitions is used
    val distData = sc.parallelize(data, 4) // explicitly set 4 partitions
    println(distData.partitions.size)
  }
}
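The default mentioned in the commented-out line comes from sc.defaultParallelism; with setMaster("local") it is typically 1. A minimal sketch of checking it, assuming the same sc and data as above:

    //default partition count used when parallelize() gets no explicit count
    println(sc.defaultParallelism)
    //an RDD created without an explicit count picks up that default
    println(sc.parallelize(data).partitions.size)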
2. makeRDD()
Create an RDD and view its values
//use makeRDD() to create an RDD and view the values of each partition
val seq = Seq((1, Seq("iteblog.com", "sparkhost1.com")),
  (3, Seq("iteblog.com", "sparkhost2.com")),
  (2, Seq("iteblog.com", "sparkhost3.com")))
//create the RDD with makeRDD()
val iteblog = sc.makeRDD(seq)
//view the values of the RDD
iteblog.collect.foreach(print)
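This overload of makeRDD() takes pairs of (value, preferred hosts): the RDD holds the first elements (here 1, 3, 2), and each Seq of hostnames becomes the preferred locations of the corresponding partition. A minimal sketch of inspecting those preferences, assuming the iteblog RDD above:

    //print the preferred hosts recorded for each partition
    iteblog.partitions.foreach(p => println(iteblog.preferredLocations(p)))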
3. map() transforms data
map() applies a function to every element of an RDD and returns a new RDD of the results.
//create an RDD
val disData = sc.parallelize(List(1, 3, 45, 3, 76))
//square each element with map()
val sq_list = disData.map(x => x * x)
sq_list.collect.foreach(println)
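map() only describes the transformation; an action such as reduce() or count() is needed to get a result back to the driver. A minimal sketch, assuming the same disData RDD, of computing the mean of the elements:

    //sum the elements, then divide by how many there are
    val sum = disData.map(_.toDouble).reduce(_ + _)
    val mean = sum / disData.count()
    println(mean)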
4. sortBy() sorts data
//create an RDD of tuples
val data = sc.parallelize(List((1, 3), (45, 3), (7, 6)))
//sort by the second element of each tuple in descending order, with the partition count set to 1
val sort_data = data.sortBy(x => x._2, false, 1)
sort_data.collect.foreach(println)
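sortBy() takes the sort-key function, an ascending flag (true by default) and an optional partition count. A minimal sketch, assuming the data RDD above, sorting ascending by the first element of each tuple:

    //ascending order is the default, so the flag can be omitted
    val asc_data = data.sortBy(x => x._1)
    asc_data.collect.foreach(println)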
P58: Get the top three employees by actual salary for the first half of the year
package test
import org.apache.spark.{SparkConf, SparkContext}
object a {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("wordcount").setMaster("local")
    val sc = new SparkContext(conf)
    val first_half = sc.textFile("C:\\Users\\Administrator\\Desktop\\Employee_salary_first_half.csv")
    //drop the header row from the first partition
    val drop_first = first_half.mapPartitionsWithIndex((ix, it) => {
      if (ix == 0) it.drop(1) else it
    })
    //split each line and keep (employee name, actual salary)
    val split_first = drop_first.map(line => {
      val data = line.split(",")
      (data(1), data(6).toInt)
    })
    //sort by salary in descending order and print the top three
    val sort_first = split_first.sortBy(x => x._2, false)
    sort_first.take(3).foreach(println)
  }
}
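When only the top few records are wanted, a full sortBy() can be replaced by top(), which returns the largest elements under a given ordering. A minimal sketch, assuming the split_first RDD above:

    //keep the 3 pairs with the largest salary without sorting the whole RDD
    split_first.top(3)(Ordering.by(_._2)).foreach(println)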
5. collect() retrieves data
filter() filters data
package test
import org.apache.spark.{SparkConf, SparkContext}
object b {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("PartialFunction")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(List(('a', 1), ('b', 2), ('c', 3)))
    //the two filters below are equivalent: keep pairs whose second element is greater than 1
    rdd1.filter(_._2 > 1).collect.foreach(println)
    rdd1.filter(x => x._2 > 1).collect.foreach(println)
  }
}
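filter() keeps only the elements for which the predicate returns true; combining it with count() gives the number of matches without collecting them to the driver. A minimal sketch, assuming the rdd1 above:

    //count the pairs whose value is greater than 1
    println(rdd1.filter(_._2 > 1).count())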
distinct() removes duplicates
package test
import org.apache.spark.{SparkConf, SparkContext}
object c {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("PartialFunction")
    val sc = new SparkContext(conf)
    val rdd3 = sc.makeRDD(List(1, 100, 200, 300, 100))
    //filter() keeps the values greater than 99
    rdd3.filter(x => x > 99).collect().foreach(println)
    //distinct() removes the duplicate 100
    rdd3.distinct().collect().foreach(println)
  }
}
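distinct() shuffles the data to remove duplicates, and it also accepts an explicit partition count for that shuffle. A minimal sketch, assuming the rdd3 above:

    //deduplicate into 2 output partitions
    rdd3.distinct(2).collect().foreach(println)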
intersection() method
package test
import org.apache.spark.{SparkConf, SparkContext}
object e {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("PartialFunction")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(List(('a', 1), ('a', 1), ('b', 1), ('c', 1)))
    val rdd2 = sc.parallelize(List(('a', 1), ('b', 1), ('d', 1)))
    //intersection() returns the elements common to both RDDs
    rdd1.intersection(rdd2).collect.foreach(print)
  }
}
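The complementary set operation is union(), which concatenates two RDDs and keeps duplicates; adding distinct() afterwards yields a set union. A minimal sketch, assuming the rdd1 and rdd2 above:

    //union keeps duplicates, distinct() removes them
    rdd1.union(rdd2).distinct().collect.foreach(print)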
package test
import org.apache.spark.{SparkConf, SparkContext}
object f {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("PartialFunction")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(List(('a', 1), ('b', 1), ('c', 1)))
    val rdd2 = sc.parallelize(List(('d', 1), ('e', 1), ('c', 1)))
    //subtract() returns the elements of one RDD that do not appear in the other
    rdd1.subtract(rdd2).collect.foreach(print)
    rdd2.subtract(rdd1).collect.foreach(print)
    //cartesian() pairs every element of one RDD with every element of the other, i.e. the Cartesian product
    val rdd01 = sc.makeRDD(List(1, 3, 5, 3))
    val rdd02 = sc.makeRDD(List(2, 4, 5, 1))
    rdd01.cartesian(rdd02).collect.foreach(print)
  }
}
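cartesian() produces one pair for every combination of elements, so its result size is the product of the two input sizes. A minimal sketch verifying that, assuming the rdd01 and rdd02 above:

    //4 x 4 input elements give 16 pairs
    println(rdd01.cartesian(rdd02).count() == rdd01.count() * rdd02.count())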