RDD: Resilient Distributed Dataset
The basic data structure of Spark, and an immutable dataset. The data in an RDD is logically partitioned, and each partition can be computed independently on a cluster node. Elements can be of any Java, Scala, Python, or custom type.
An RDD is a read-only collection of partitioned records and has a built-in fault-tolerance mechanism.
Ways to create an RDD: (1) parallelize an existing collection; (2) reference a dataset in external storage, e.g. an HDFS file. A sketch of both follows.
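A minimal sketch of both creation paths, in the same style as the later examples (imports omitted as they are throughout these notes; the file path is a placeholder):

val conf = new SparkConf().setAppName("CreateRDD").setMaster("local[4]")
val sc = new SparkContext(conf)
//(1) parallelize an in-memory collection
val rddA = sc.parallelize(Array(1, 2, 3, 4))
//(2) reference an external dataset
val rddB = sc.textFile("d:/scala/test.txt")
println(rddA.count() + " / " + rddB.count())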
Hadoop spends roughly 90% of its time on HDFS read/write (I/O).
Spark computes in memory and shares data across jobs; memory I/O is on the order of 10 to 100 times faster than network and disk.
An RDD has 5 main internal properties (see the sketch after this list):
1. A list of partitions.
2. A compute function for each split.
3. A list of dependencies on other RDDs.
4. Optionally, a Partitioner for key-value RDDs.
5. Optionally, a list of preferred block locations (HDFS block locations).
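These five properties map onto members of Spark's abstract RDD class; a simplified sketch (condensed from the Spark source, not the full signatures):

abstract class RDD[T] {
  //2. compute function for each split
  def compute(split: Partition, context: TaskContext): Iterator[T]
  //1. list of partitions
  protected def getPartitions: Array[Partition]
  //3. dependencies on other RDDs
  protected def getDependencies: Seq[Dependency[_]]
  //5. preferred block locations for a split
  protected def getPreferredLocations(split: Partition): Seq[String]
  //4. partitioner for key-value RDDs
  val partitioner: Option[Partitioner]
}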
Default parallelism (local mode), derived as follows:
local backend: defaultParallelism() = scheduler.conf.getInt("spark.default.parallelism", totalCores)
taskScheduler.defaultParallelism = backend.defaultParallelism()
sc.defaultParallelism = taskScheduler.defaultParallelism
defaultMinPartitions = math.min(defaultParallelism, 2)
sc.textFile(path, defaultMinPartitions) //so minPartitions is 1 or 2
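A quick check of these defaults (the printed values assume master local[4]):

val conf = new SparkConf().setAppName("Defaults").setMaster("local[4]")
val sc = new SparkContext(conf)
println(sc.defaultParallelism)   //4 : totalCores under local[4]
println(sc.defaultMinPartitions) //2 : math.min(4, 2)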
RDD transformations
Each transformation returns a pointer to a new RDD and creates a dependency between the RDDs. Every RDD carries a compute function and a pointer to its parent RDD.
map() //transform each element by applying a function
//(T) => U
flatMap() //flatten, T => TraversableOnce[U]
reduceByKey(func) //aggregate values by key.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" ")) ;
//val rdd3 = rdd2.map(word=>{println("start") ;val t = (word,1) ;println(t + " : end") ; t})
val rdd3 = rdd2.mapPartitions(it=>{
import scala.collection.mutable.ArrayBuffer ;
val buf = ArrayBuffer[String]()
val tname = Thread.currentThread().getName
println(tname + " : mapPartitions start ");
for (e <- it) {
buf.+=("_" + e);
}
buf.iterator
});
val rdd5 = rdd3.map(word=>{
val tname = Thread.currentThread().getName
println(tname + " : map " + word);
(word,1)});
val rdd4 = rdd5.reduceByKey(_ + _)
val r = rdd4.collect()
r.foreach(println)
}
mapPartitions() //apply a function per partition: receives the partition's Iterator and returns a new Iterator.
//Iterator<T> => Iterator<U>
mapPartitionsWithIndex(func) //same, but also receives the partition index, (Int, Iterator<T>) => Iterator<U>
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" ")) ;
val rdd3 = rdd2.mapPartitionsWithIndex((index,it) => {
import scala.collection.mutable.ArrayBuffer;
val buf = ArrayBuffer[String]()
val tname = Thread.currentThread().getName
println(tname + " : " + index + " : mapPartitions start ");
for (e <- it) {
buf.+=("_" + e);
}
buf.iterator
});
val rdd5 = rdd3.map(word=>{
val tname = Thread.currentThread().getName
println(tname + " : map " + word);
(word,1)});
val rdd4 = rdd5.reduceByKey(_ + _)
val r = rdd4.collect()
r.foreach(println)
}
filter() //keep only the elements matching a predicate, (T) => Boolean
//count the occurrences of "hello"
def wordCount3(args: Array[String]): Unit ={
val conf = new SparkConf();
// conf.setAppName("wc");
// conf.setMaster("local");
val sc = new SparkContext(conf);
val r = sc.textFile(args(0)).flatMap(line=>line.split(" ")).filter(_.equals("hello")).map((_,1)).reduceByKey(_ + _).collect();
r.foreach(println);
}
sample(withReplacement, fraction, seed) //sample a subset of the RDD.
//withReplacement: whether an element may be drawn more than once.
//fraction: expected fraction of the data to sample, in [0, 1].
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" "))
val rdd3 = rdd2.sample(false,0.5)
rdd3.collect().foreach(println)
}
union() //similar to SQL UNION.
//select * from persons where id < 10
//union select * from persons where id > 29;
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/log.txt",4)
//all lines containing error
val errorRDD = rdd1.filter(_.toLowerCase.contains("error"))
//all lines containing warn
val warnRDD = rdd1.filter(_.toLowerCase.contains("warn"));
val allRDD = errorRDD.union(warnRDD);
allRDD.collect().foreach(println)
}
intersection //intersection: keep only the elements present in both RDDs.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/log.txt",4)
//all lines containing error
val errorRDD = rdd1.filter(_.toLowerCase.contains("error"))
//all lines containing warn
val warnRDD = rdd1.filter(_.toLowerCase.contains("warn"));
val intersecRDD = errorRDD.intersection(warnRDD);
intersecRDD.collect().foreach(println)
}
distinct([numTasks]) //deduplicate: remove repeated elements.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/log.txt",4)
val rdd2 = rdd1.flatMap(_.split(" "))
val rdd3 = rdd2.distinct()
rdd3.collect().foreach(println)
}
groupByKey() //(K,V) => (K,Iterable<V>)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/stus.txt",4)
val rdd2 = rdd1.map(line=>{
val key = line.split(" ")(3)
(key,line)
})
val rdd3 = rdd2.groupByKey()
rdd3.collect().foreach(t=>{
val key = t._1;
println(key + " : ====================")
for (e <- t._2){
println(e)
}
})
}
aggregateByKey(zeroValue)(seqOp, combOp, [numTasks]) //aggregate by key: seqOp folds each value into the accumulator within a partition, combOp merges the per-partition accumulators.
//in the example below: key: String, accumulator U: Int, zeroValue = 3
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/stus.txt",4)
val rdd2 = rdd1.flatMap(_.split(" "))
val rdd3 = rdd2.map((_,1))
def seq(a: Int, b: Int): Int = {
math.max(a, b)
}
def comb(a: Int, b: Int): Int = {
a + b
}
//seq takes the max of each key's values within a partition (seeded with zeroValue 3); comb sums the per-partition maxima
val rdd4 = rdd3.aggregateByKey(3)(seq, comb)
rdd4.collect().foreach(println)
}
sortByKey //sort a (K,V) RDD by key (see the sketch below)
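A minimal sortByKey sketch in the same style as the other examples (same placeholder path):

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]")
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
val rdd2 = rdd1.flatMap(_.split(" ")).map((_,1)).reduceByKey(_ + _)
//sort the word counts by word; pass false for descending order
val rdd3 = rdd2.sortByKey(true)
rdd3.collect().foreach(println)
}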
join(otherDataset, [numTasks]) //join, (K,V).join(K,W) => (K,(V,W))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
//roster: id and name
val sc = new SparkContext(conf)
val namesRDD1 = sc.textFile("d:/scala/names.txt");
val namesRDD2 = namesRDD1.map(line=>{
var arr = line.split(" ")
(arr(0).toInt,arr(1))
})
//total scores: id and score
val scoreRDD1 = sc.textFile("d:/scala/scores.txt");
val scoreRDD2 = scoreRDD1.map(line => {
var arr = line.split(" ")
(arr(0).toInt, arr(1).toInt)
})
val rdd = namesRDD2.join(scoreRDD2)
rdd.collect().foreach(t=>{
println(t._1 + " : " + t._2)
})
}
cogroup //co-group
//(K,V).cogroup(K,W) => (K,(Iterable<V>,Iterable<W>))
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/cogroup-1.txt",4)
//K,V
val rdd2 = rdd1.map(line=>{
val arr = line.split(" ")
(arr(0),arr(1))
})
//K,W
val rdd3 = sc.textFile("d:/scala/cogroup-2.txt", 4)
//key,value
val rdd4 = rdd3.map(line => {
val arr = line.split(" ")
(arr(0), arr(1))
})
val rdd = rdd2.cogroup(rdd4)
rdd.collect().foreach(t=>{
println(t._1 + ":=================")
for( e <- t._2._1){
println(e)
}
for (e <- t._2._2) {
println(e)
}
})
}
cartesian(otherDataset) //Cartesian product, RDD[T] x RDD[U] => RDD[(T,U)]
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(Array("tom","tomas","tomasle","tomson"))
val rdd2 = sc.parallelize(Array("1234","3456","5678","7890"))
val rdd = rdd1.cartesian(rdd2);
rdd.collect().foreach(t=>println(t))
}
pipe //pipe each RDD element through a shell script or command; the output lines form a new RDD (works on Linux).
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd = sc.parallelize(Array("file:///d:","file:///e:","file:///f:",3))
val rdd0 = rdd.pipe("ls ")
rdd0.collect().foreach(println)
}
coalesce(numPartitions) //reduce the number of partitions (no shuffle by default, so it cannot increase the count)
repartition //can increase or decrease the number of partitions (always shuffles)
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[8]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
println("rdd1' parti : " + rdd1.partitions.length)
//coalesce can only shrink: coalesce(5) on a 4-partition RDD stays at 4
val rdd11 = rdd1.coalesce(5);
//repartition can increase or decrease
val rdd111 = rdd11.repartition(5)
val rdd2 = rdd111.flatMap(_.split(" "))
println("rdd2' parti : " + rdd2.partitions.length)
//
val rdd3 = rdd2.map((_,1))
println("rdd3' parti : " + rdd3.partitions.length)
}
repartitionAndSortWithinPartitions(partitioner) //repartition by the given partitioner and sort within each partition; more efficient than repartition followed by sorting. See the sketch below.
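A minimal sketch, assuming a HashPartitioner with 2 partitions (the input pairs are made up for illustration):

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]")
val sc = new SparkContext(conf)
val rdd1 = sc.parallelize(Array((3,"c"),(1,"a"),(4,"d"),(2,"b")), 4)
//hash-partition into 2 partitions, sorting by key inside each partition
val rdd2 = rdd1.repartitionAndSortWithinPartitions(new org.apache.spark.HashPartitioner(2))
rdd2.collect().foreach(println)
}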
RDD Action
collect() //gather the RDD's elements into an array on the driver.
count() //count the RDD's elements.
reduce() //aggregate all elements into a single value.
first //return the first element, equivalent to take(1).
take(n) //return the first n elements.
takeSample(withReplacement, num, [seed]) //return a random sample of num elements.
takeOrdered(n, [ordering]) //return the first n elements in sorted order.
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[8]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
//println(rdd1.first())
rdd1.take(3).foreach(println)
}
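The remaining actions from the list above can be sketched against the same placeholder file:

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[8]")
val sc = new SparkContext(conf)
val words = sc.textFile("d:/scala/test.txt",4).flatMap(_.split(" "))
println(words.count()) //number of words
println(words.map(_.length).reduce(_ + _)) //total number of characters
words.takeSample(false, 3).foreach(println) //3 random words, no replacement
words.takeOrdered(3).foreach(println) //first 3 words alphabetically
}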
saveAsTextFile(path) //save the RDD as text files
saveAsSequenceFile(path) //save as a Hadoop SequenceFile
saveAsObjectFile(path) //(Java and Scala) save as serialized Java objects
countByKey() //for (K,V) RDDs: return a local Map of key -> count to the driver; see the sketch below
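A quick countByKey sketch; note it returns a local Map rather than an RDD, so it is only safe when the number of distinct keys is small:

def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]")
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
//countByKey() returns scala.collection.Map[String, Long] on the driver
val counts = rdd1.flatMap(_.split(" ")).map((_,1)).countByKey()
counts.foreach(println)
}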
Data skew:
//salt the key (append a random suffix), aggregate, then strip the salt and aggregate again
import scala.util.Random
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("WordCountScala")
conf.setMaster("local[4]") ;
val sc = new SparkContext(conf)
val rdd1 = sc.textFile("d:/scala/test.txt",4)
rdd1.flatMap(_.split(" ")).map((_,1)).map(t=>{
//salt: append a random 0-99 suffix so a hot key is spread over up to 100 reduce keys
val word = t._1
val r = Random.nextInt(100)
(word + "_" + r,1)
}).reduceByKey(_ + _,4).map(t=>{
//strip the salt and aggregate once more to recover the true counts
val word = t._1;
val count = t._2;
val w = word.split("_")(0)
(w,count)
}).reduceByKey(_ + _,4).saveAsTextFile("d:/scala/out/lean");
}