【Spark】Operators: WordCount, Rdd, Map, ActionRdd



WordCount

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkConf, SparkContext}

object wordcount {
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0")
    val conf = new SparkConf().setMaster("local[2]").setAppName("wordcount")
    val sc:SparkContext=SparkContext.getOrCreate(conf)
    val rdd1:RDD[String]=sc.parallelize(List("hello world","hello java","Hello scala java"))
    // split each line into words, pair each word with 1, then sum the counts per key
    rdd1.flatMap(x=>x.split(" ")).map((_,1)).reduceByKey(_+_).collect().foreach(println)
    val partitions:Array[Partition] = rdd1.partitions
    //partitions.foreach(println)
    println(partitions.length)
    println("--------------------")
    val lines = sc.textFile("in\\word.txt")
    lines.collect.foreach(println)
    val linesHDFS:RDD[String]=sc.textFile("hdfs://HadoopY:9000/kb09workspace/word.txt")
    println("--------------------")
    linesHDFS.collect.foreach(println)
  }
}
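
Note that reduceByKey groups by the exact key, so "Hello" and "hello" above end up as two separate counts. A minimal sketch of a case-normalized variant, assuming the same sc and input list:

// Hypothetical variant: lowercase each word before counting,
// so "Hello" and "hello" collapse into one key.
rdd1.flatMap(_.split(" "))
  .map(_.toLowerCase)
  .map((_, 1))
  .reduceByKey(_ + _)
  .collect()
  .foreach(println)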

Map

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object mapdemo {
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0")
    val conf:SparkConf = new SparkConf().setMaster("local[*]").setAppName("mapdemo")
    val sc = SparkContext.getOrCreate(conf)
    val rdd1:RDD[Int] = sc.makeRDD(1 to 9)
    val rdd2:RDD[Int] = rdd1.map(_*2)
    rdd2.collect.foreach(println)

    println("-----------------")
    val strRdd1:RDD[String] = sc.parallelize(List("kb01","kb02","kb03","kb04","spark","study"),2)
    val strRdd2 = strRdd1.map(x=>(x,1))
    strRdd2.collect.foreach(println)

    println("-----------------")
    val filterRdd1:RDD[Int] = sc.makeRDD(List(1,2,3,6,7,8),3)
    val filterRdd2:RDD[Int] = filterRdd1.filter(_%2==0)
    filterRdd2.collect.foreach(println)

    println("-----------------")
    val mapValuesRdd1:RDD[String] = sc.parallelize(List("tiger","lion","cat","panther","eagle"))
    val mapValuesRdd2:RDD[(Int,String)] = mapValuesRdd1.map(x=>(x.length, x))
    mapValuesRdd2.collect.foreach(println)
    val mapvaluesRdd3:RDD[(Int,String)] = mapValuesRdd2.mapValues(x=>"_"+x+"_")
    mapvaluesRdd3.collect.foreach(println)

    println("--------reduceByKey--------")
    val reduceByKeyRdd1:RDD[(Int,String)] = mapValuesRdd2.reduceByKey((a,b)=>a+b)
    reduceByKeyRdd1.collect.foreach(println)

    println("--------groupByKey--------")
    val groupByKeyRdd:RDD[(Int,Iterable[String])] = mapValuesRdd2.groupByKey()
    groupByKeyRdd.collect.foreach(println)

    println("--------sortByKey--------")
    val sortByKeyRdd = mapValuesRdd2.sortByKey()
    sortByKeyRdd.collect.foreach(println)

    println("--------union--------")
    val u1 = sc.parallelize(1 to 3)
    val u2 = sc.parallelize(2 to 4)
    u1.union(u2).collect.foreach(x=>print(x+"\t"))
    println
    u1.intersection(u2).collect.foreach(x=>print(x+"\t"))
    println

    println("--------join--------")
    val j1 = sc.parallelize(List("a","b","c")).map(x=>(x,1))
    val j2 = sc.parallelize(List("apple","banana","cylon")).map(x=>(x,1))
    j1.join(j2).collect.foreach(println)
    println
    j1.leftOuterJoin(j2).collect.foreach(println)
    println
    j1.rightOuterJoin(j2).collect.foreach(println)
  }
}
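
In the join example above, the keys of j1 ("a", "b", "c") never match the keys of j2 ("apple", "banana", "cylon"), so the inner join comes back empty and the outer joins pad the missing side with None. A minimal sketch with partially overlapping keys (hypothetical data, same SparkContext assumed):

// Hypothetical pair RDDs whose keys partially overlap.
val k1 = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3)))
val k2 = sc.parallelize(List(("b", 20), ("c", 30), ("d", 40)))
k1.join(k2).collect.foreach(println)          // only keys present on both sides: (b,(2,20)), (c,(3,30))
k1.leftOuterJoin(k2).collect.foreach(println) // every key of k1, right-side value wrapped in Option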

Rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object rdddemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[3]").setAppName("rdddemo")
    val sc = new SparkContext(conf)

    val rdd1:RDD[Int] = sc.parallelize(List(1,2,3,4,5,6,7,8,9,9,6,4,3,7))
    val rdd2:RDD[Int] = rdd1.distinct
    println(rdd1.partitions.length)
    println(rdd2.partitions.length)
    val rdd3 = rdd1.distinct(2)
    println(rdd3.partitions.length)
    rdd2.collect.foreach(x=>print(x+"\t"))
    println()
    rdd3.collect.foreach(x=>print(x+"\t"))
  }
}
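
distinct can also be rebuilt from the pair operators shown earlier; a rough sketch of the same deduplication expressed with map and reduceByKey, assuming the same rdd1:

// Sketch: pair each element with a dummy value, collapse duplicate keys,
// then drop the dummy value again.
val dedup = rdd1.map(x => (x, null))
  .reduceByKey((x, _) => x)
  .map(_._1)
dedup.collect.foreach(x => print(x + "\t"))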

ActionRdd

import org.apache.spark.{SparkConf, SparkContext}

object ActionRddDemo {
  def main(args: Array[String]): Unit = {
    System.setProperty("hadoop.home.dir", "D:\\Hadoop2.6.0")
    val conf = new SparkConf().setMaster("local[*]").setAppName("actionrdddemo")
    val sc = new SparkContext(conf)
    val rdd1 = sc.parallelize(1 to 100)
    val sum = rdd1.reduce((x,y)=>{println(x,y);x+y})
    println("总和"+sum)

    //rdd1.saveAsTextFile("in/rdd1.txt")
    rdd1.saveAsTextFile("hdfs://HadoopY:9000/kb09workspace/ActionDemoRdd1.txt")
  }
}
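
Besides reduce and saveAsTextFile, a few other common actions, sketched against the same rdd1:

// Each of these triggers a job and returns a result to the driver.
println(rdd1.count())                  // number of elements: 100
println(rdd1.first())                  // first element: 1
println(rdd1.take(5).mkString(","))    // first five elements
println(rdd1.top(3).mkString(","))     // three largest elements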

Java Version

filter

package sparkStu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import java.util.List;

public class filterJava {
    public static void main(String[] args) {
        System.setProperty("hadoop.home.dir","D:\\Hadoop2.6.0");
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("filtermap");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("in/sample.txt");
        JavaRDD<String> filterRdd = lines.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String s) throws Exception {
                return s.contains("zks");
            }
        });

        List<String> collect = filterRdd.collect();
        for (String str : collect) {
            System.out.println(str);
        }
    }
}

parallelize

package sparkStu;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.Arrays;
import java.util.List;

public class parallelizeJava {
    public static void main(String[] args) {
        System.setProperty("hadoop.home.dir","D:\\Hadoop2.6.0");
        SparkConf conf = new SparkConf().setMaster("local[*]").setAppName("java1");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<String> strings = Arrays.asList("hello world", "hello scala", "hello spark");
        JavaRDD<String> rdd1 = sc.parallelize(strings);
        List<String> collect = rdd1.collect();
        for (String value : collect) {
            System.out.println(value);
        }

        JavaRDD<String> stringJavaRDD = sc.textFile("in/word.txt");
        List<String> collect1 = stringJavaRDD.collect();
        for (String value : collect1){
            System.out.println(value);
        }
    }
}