Operators: WordCount, Rdd, Map, ActionRdd
WordCount
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkConf, SparkContext}

object wordcount {
  def main(args: Array[String]): Unit = {
    // Point Spark at a local Hadoop installation (winutils is needed on Windows)
    System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0")
    val conf = new SparkConf().setMaster("local[2]").setAppName("wordcount")
    val sc: SparkContext = SparkContext.getOrCreate(conf)

    // Classic word count: split lines into words, pair each word with 1, sum by key
    val rdd1: RDD[String] = sc.parallelize(List("hello world", "hello java", "Hello scala java"))
    rdd1.flatMap(x => x.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)

    // parallelize defaults to the 2 cores from local[2], so this prints 2
    val partitions: Array[Partition] = rdd1.partitions
    println(partitions.length)

    println("--------------------")
    // Read from the local filesystem...
    val lines = sc.textFile("in\\word.txt")
    lines.collect.foreach(println)

    // ...and from HDFS
    val linesHDFS: RDD[String] = sc.textFile("hdfs://HadoopY:9000/kb09workspace/word.txt")
    println("--------------------")
    linesHDFS.collect.foreach(println)
  }
}
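Note that "Hello" and "hello" are counted as two different words above. A minimal case-insensitive variant, as a sketch reusing the same sc and rdd1:

rdd1.flatMap(_.toLowerCase.split(" "))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)   // order by count, descending
  .collect()
  .foreach(println)                  // (hello,3), (java,2), then the 1-count words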
Map
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object mapdemo {
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0")
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("mapdemo")
    val sc = SparkContext.getOrCreate(conf)

    // map: apply a function to every element
    val rdd1: RDD[Int] = sc.makeRDD(1 to 9)
    val rdd2: RDD[Int] = rdd1.map(_ * 2)
    rdd2.collect.foreach(println)
    println("-----------------")

    // map to key-value pairs
    val strRdd1: RDD[String] = sc.parallelize(List("kb01", "kb02", "kb03", "kb04", "spark", "study"), 2)
    val strRdd2 = strRdd1.map(x => (x, 1))
    strRdd2.collect.foreach(println)
    println("-----------------")

    // filter: keep only the even numbers
    val filterRdd1: RDD[Int] = sc.makeRDD(List(1, 2, 3, 6, 7, 8), 3)
    val filterRdd2: RDD[Int] = filterRdd1.filter(_ % 2 == 0)
    filterRdd2.collect.foreach(println)
    println("-----------------")

    // mapValues: transform only the value, leaving the key untouched
    val mapValuesRdd1: RDD[String] = sc.parallelize(List("tiger", "lion", "cat", "panther", "eagle"))
    val mapValuesRdd2: RDD[(Int, String)] = mapValuesRdd1.map(x => (x.length, x))
    mapValuesRdd2.collect.foreach(println)
    val mapValuesRdd3: RDD[(Int, String)] = mapValuesRdd2.mapValues(x => "_" + x + "_")
    mapValuesRdd3.collect.foreach(println)

    println("--------reduceByKey--------")
    // words with the same length are concatenated, e.g. (5,tigereagle)
    val reduceByKeyRdd1: RDD[(Int, String)] = mapValuesRdd2.reduceByKey((a, b) => a + b)
    reduceByKeyRdd1.collect.foreach(println)

    println("--------groupByKey--------")
    // words with the same length are collected into an Iterable
    val groupByKeyRdd: RDD[(Int, Iterable[String])] = mapValuesRdd2.groupByKey()
    groupByKeyRdd.collect.foreach(println)

    println("--------sortByKey--------")
    // sort by word length, ascending by default
    val sortByKeyRdd = mapValuesRdd2.sortByKey()
    sortByKeyRdd.collect.foreach(println)

    println("--------union--------")
    val u1 = sc.parallelize(1 to 3)
    val u2 = sc.parallelize(2 to 4)
    u1.union(u2).collect.foreach(x => print(x + "\t"))          // keeps duplicates: 1 2 3 2 3 4
    println
    u1.intersection(u2).collect.foreach(x => print(x + "\t"))   // 2 and 3 (order may vary)
    println

    println("--------join--------")
    val j1 = sc.parallelize(List("a", "b", "c")).map(x => (x, 1))
    val j2 = sc.parallelize(List("apple", "banana", "cylon")).map(x => (x, 1))
    j1.join(j2).collect.foreach(println)            // empty: the two RDDs share no keys
    println
    j1.leftOuterJoin(j2).collect.foreach(println)   // j1's keys, values padded with None
    println
    j1.rightOuterJoin(j2).collect.foreach(println)  // j2's keys, values padded with None
  }
}
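Because the keys of j1 ("a", "b", "c") never match those of j2, the plain join above comes back empty and only the outer joins emit rows. A minimal sketch with overlapping keys, assuming the same sc:

val k1 = sc.parallelize(List(("a", 1), ("b", 2)))
val k2 = sc.parallelize(List(("a", 3), ("c", 4)))
k1.join(k2).collect.foreach(println)            // (a,(1,3))
k1.leftOuterJoin(k2).collect.foreach(println)   // (a,(1,Some(3))), (b,(2,None))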
Rdd
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object rdddemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[3]").setAppName("rdddemo")
    val sc = new SparkContext(conf)

    val rdd1: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 6, 4, 3, 7))
    // distinct() keeps the parent's partition count; distinct(2) repartitions to 2
    val rdd2: RDD[Int] = rdd1.distinct
    println(rdd1.partitions.length)   // 3 (from local[3])
    println(rdd2.partitions.length)   // 3
    val rdd3 = rdd1.distinct(2)
    println(rdd3.partitions.length)   // 2

    rdd2.collect.foreach(x => print(x + "\t"))
    println()
    rdd3.collect.foreach(x => print(x + "\t"))
  }
}
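distinct accepts a partition count because it is built on a shuffling reduceByKey. Roughly what it does under the hood (a sketch, not the exact Spark source):

val manualDistinct: RDD[Int] = rdd1
  .map(x => (x, null))            // turn each element into a key
  .reduceByKey((x, _) => x, 2)    // collapse duplicate keys, 2 output partitions
  .map(_._1)                      // drop the dummy value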
ActionRdd
import org.apache.spark.{SparkConf, SparkContext}

object ActionRddDemo {
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0")
    val conf = new SparkConf().setMaster("local[*]").setAppName("actionrdddemo")
    val sc = new SparkContext(conf)

    val rdd1 = sc.parallelize(1 to 100)
    // reduce is an action: it triggers the job and returns the result to the driver;
    // the println inside shows the intermediate partial sums
    val sum = rdd1.reduce((x, y) => { println(x, y); x + y })
    println("sum = " + sum)   // 5050

    // saveAsTextFile fails if the target directory already exists
    rdd1.saveAsTextFile("hdfs://HadoopY:9000/kb09workspace/ActionDemoRdd1.txt")
  }
}
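reduce is one of several actions that fold an RDD down to a single value. aggregate generalizes it with a zero value and separate per-partition/cross-partition functions; a sketch computing sum and count in one pass, assuming the same rdd1:

val (total, count) = rdd1.aggregate((0, 0))(
  (acc, v) => (acc._1 + v, acc._2 + 1),     // seqOp: within one partition
  (a, b) => (a._1 + b._1, a._2 + b._2)      // combOp: merge partition results
)
println(s"sum=$total count=$count")         // sum=5050 count=100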
Java version
filter
package sparkStu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.List;

public class filterJava {
    public static void main(String[] args) {
        System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0");
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("filtermap");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("in/sample.txt");
        // Keep only the lines that contain "zks"
        JavaRDD<String> filterRdd = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                return s.contains("zks");
            }
        });

        List<String> collect = filterRdd.collect();
        for (String str : collect) {
            System.out.println(str);
        }
    }
}
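For comparison, the same filter is a one-liner in the Scala API (a sketch assuming a SparkContext sc as in the Scala examples above):

sc.textFile("in/sample.txt").filter(_.contains("zks")).collect.foreach(println)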
parallelize
package sparkStu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.List;

public class parallelizeJava {
    public static void main(String[] args) {
        System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0");
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("java1");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Build an RDD from an in-memory collection
        List<String> strings = Arrays.asList("hello world", "hello scala", "hello spark");
        JavaRDD<String> rdd1 = sc.parallelize(strings);
        List<String> collect = rdd1.collect();
        for (String value : collect) {
            System.out.println(value);
        }

        // Build an RDD from a text file
        JavaRDD<String> stringJavaRDD = sc.textFile("in/word.txt");
        List<String> collect1 = stringJavaRDD.collect();
        for (String value : collect1) {
            System.out.println(value);
        }
    }
}