Scala / Java RDD Transformation API

1.mapPartitions

scala

  /*
    map processes one element at a time.
    mapPartitions processes all the data of one partition in a single call and returns
    the results as the corresponding partition of the child RDD.
    Because a single call runs over an entire partition, it is usually faster than map
    and suits cases where expensive objects (e.g. connections) should be created only
    once per partition. The drawback: one call holds a whole partition's data, so when
    memory is tight it can trigger an OOM (out of memory) error.
   */
  def mapPartitions(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.parallelize(Array("hello world", "hi boy", "hello girl", "hi hello"))
    val resRdd = rdd.mapPartitions(it => {
      val buffer = new ArrayBuffer[String]()
      while (it.hasNext) {
        val split = it.next().split(" ")
        for (word <- split) { // iterate the words of the current element, not the partition iterator
          buffer += word + "-@"
        }
      }
      buffer.toIterator
    })
    resRdd.foreach(println(_))
  }
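
The comment above mentions creating expensive objects (such as a database connection) only once per partition. A minimal sketch of that pattern, using a random stand-in string instead of a real connection (the connId value is purely illustrative, not part of the original code):

  def mapPartitionsSetup(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.parallelize(Array("hello world", "hi boy", "hello girl", "hi hello"))
    val resRdd = rdd.mapPartitions(it => {
      // stand-in for an expensive per-partition resource such as a DB connection:
      // created once per partition instead of once per element
      val connId = "conn-" + scala.util.Random.nextInt(1000)
      it.flatMap(line => line.split(" ").map(word => connId + ":" + word))
    })
    resRdd.foreach(println(_))
  }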

Java

private static void mapPartitions(JavaSparkContext sc) {
		JavaRDD<String> pRdd = sc.parallelize(Arrays.asList("hello world", "hi boy", "hello girl", "hi hello"));
		JavaRDD<String> resRdd = pRdd.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			@Override
			public Iterator<String> call(Iterator<String> it) throws Exception {

				ArrayList<String> res = new ArrayList<String>();

				while (it.hasNext()) {
					String[] split = it.next().split(" ");
					for (String words : split) {

						res.add(words + "@");
					}
					// System.out.println(res);
				}
				return res.iterator();
			}
		});
		System.out.println(resRdd.collect());
		// System.out.println(resRdd.toString()); // wrong: toString() does not print the RDD's elements
	}

2.mapPartitionsWithIndex

scala

def mapPartitionsWithIndex(sc: SparkContext): Unit = {

    val arrRdd: RDD[String] = sc.parallelize(Array("hello world", "hi boy", "hello girl", "hi hello"))
    // the extra index argument is the partition number, starting from 0
    val resRdd = arrRdd.mapPartitionsWithIndex((index, it) => {
      // index is the partition number
      val bf = new ArrayBuffer[String]()
      while (it.hasNext) {
        val split = it.next().split(" ")
        for (word <- split) { // iterate the words of the current element, not the partition iterator
          bf += word + "_" + index
        }
      }
      bf.toIterator
    })
    resRdd.foreach(println(_))
  }
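
A common use of the partition index (not shown in the original) is skipping a header line that only lives in the first partition, for example when the input is a CSV-like file. A minimal sketch under that assumption:

  def dropHeader(sc: SparkContext): Unit = {
    val lines: RDD[String] = sc.parallelize(Array("name,age", "tom,20", "jerry,18"), 2)
    val noHeader = lines.mapPartitionsWithIndex((index, it) =>
      // partition 0 holds the header; consume its first element and pass the rest through
      if (index == 0 && it.hasNext) { it.next(); it } else it
    )
    noHeader.foreach(println(_))
  }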

Java

private static void mapPartitionsWithIndex(JavaSparkContext sc) {
		/*
		 * JavaRDD<String> arrRdd =
		 * sc.parallelize(Arrays.asList("hello world","hi boy","hello girl"
		 * ,"hi hello"));
		 */
		JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3);
		JavaRDD<Tuple2<Integer, Integer>> resAdd = rdd1.mapPartitionsWithIndex(
				new Function2<Integer, Iterator<Integer>, Iterator<Tuple2<Integer, Integer>>>() {

					/**
					 * 
					 */
					private static final long serialVersionUID = 1L;

					@Override
					public Iterator<Tuple2<Integer, Integer>> call(Integer index, Iterator<Integer> it)
							throws Exception {
						ArrayList<Tuple2<Integer, Integer>> tuple2s = new ArrayList<Tuple2<Integer, Integer>>();
						while (it.hasNext()) {
							Integer next = it.next();
							tuple2s.add(new Tuple2<Integer, Integer>(index, next));
						}
						return tuple2s.iterator();
					}
				}, false);
		resAdd.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

			@Override
			public void call(Tuple2<Integer, Integer> t) throws Exception {
				System.out.println(t);
			}
		});

	}

println("=======================================================================================")

private static void mapPartitionsWithIndex1(JavaSparkContext sc) {
		JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("hello world", "hi boy", "hello girl", "hi hello"));
		JavaRDD<Tuple2<Integer, String>> resRdd = rdd1
				.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<Tuple2<Integer, String>>>() {

					@Override
					public Iterator<Tuple2<Integer, String>> call(Integer index, Iterator<String> it) throws Exception {
						ArrayList<Tuple2<Integer, String>> tuple2 = new ArrayList<Tuple2<Integer, String>>();
						while (it.hasNext()) {
							String[] split = it.next().split(" ");
							for (String words : split) {
								tuple2.add(new Tuple2<Integer, String>(index, words));
							}
						}
						return tuple2.iterator();
					}
				}, false);
		resRdd.foreach(new VoidFunction<Tuple2<Integer, String>>() {

			@Override
			public void call(Tuple2<Integer, String> t) throws Exception {
				System.out.println(t);
			}
		});

	}


3.sample

Scala

// Sampling: like estimating the average height of a population, a small sample can stand in for the full data set
  def sample(sc: SparkContext): Unit = {
    val numRdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    // withReplacement: sample with or without replacement; fraction: expected sampling fraction; seed: random seed
    //val resRdd1: RDD[Int] = numRdd.sample(false, 0.5)
    val resRdd2: RDD[Int] = numRdd.sample(true, 0.8)
    //resRdd1.foreach(print(_))
    resRdd2.foreach(println(_))
  }
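
The third parameter mentioned above is the random seed; fixing it makes the sample reproducible for the same RDD and partitioning. A small sketch (the seed value 42 is arbitrary):

  def sampleWithSeed(sc: SparkContext): Unit = {
    val numRdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    val a = numRdd.sample(false, 0.5, seed = 42L).collect().toList
    val b = numRdd.sample(false, 0.5, seed = 42L).collect().toList
    println(a == b) // expected: true, since the seed and partitioning are identical
  }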

Java

private static void sample(JavaSparkContext sc) {
		JavaRDD<Integer> numRdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0));
		JavaRDD<Integer> resRdd = numRdd.sample(false, 0.5); // false = without replacement; pass true to sample with replacement
		resRdd.foreach(new VoidFunction<Integer>() {

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			@Override
			public void call(Integer t) throws Exception {
				System.out.println(t.toString());
			}
		});
	}

 

4.repartition

scala
 // Re-plan the number of partitions: repartition; there is also coalesce, which merges partitions
  /*
  def repartition(numPartitions: Int)(implicit ord: Ordering[T] = null): RDD[T] = withScope {
    coalesce(numPartitions, shuffle = true)
  }
  repartition simply calls coalesce with shuffle = true, so it always shuffles;
  def coalesce(numPartitions: Int, shuffle: Boolean = false) does not shuffle by default,
  so repartition is just a special case of coalesce.

  Why can the partition count only grow when shuffle is true?
  A shuffle lets the data of one parent partition be redistributed across several child partitions.

  When to use which operator?
  repartition: mainly for increasing the partition count and thus the degree of parallelism;
    with too little parallelism the job runs slowly.
  coalesce: merge partitions and reduce parallelism. If resources are scarce, a very high degree
    of parallelism is pointless: each partition maps to one task, so extra partitions waste resources.
  filter: after a filter the data is often unevenly distributed; merging partitions afterwards
    reduces the number of tasks and the resource cost (see the sketch after this code block).
   */
  def reparation(sc: SparkContext): Unit = {
    val numRdd: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 3)
    println(numRdd.getNumPartitions) // 3
    // shuffle = true: the partition count (and thus the parallelism) can be increased
    val rdd1 = numRdd.repartition(5)
    println(rdd1.getNumPartitions) // 5
    val c1Rdd: RDD[Int] = numRdd.coalesce(5, false)
    println(c1Rdd.getNumPartitions) // 3: without a shuffle, coalesce cannot increase the partition count
    val c2Rdd: RDD[Int] = numRdd.coalesce(5, true)
    println(c2Rdd.getNumPartitions) // 5
    val c3Rdd: RDD[Int] = numRdd.coalesce(2, false)
    println(c3Rdd.getNumPartitions) // 2

  }
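
As the comment notes, a selective filter often leaves partitions sparse and uneven; a minimal sketch of shrinking the partition count afterwards (the "keep 10%" filter is only an illustrative assumption):

  def filterThenCoalesce(sc: SparkContext): Unit = {
    val bigRdd: RDD[Int] = sc.parallelize(1 to 1000, 10)
    val filtered = bigRdd.filter(_ % 10 == 0) // keeps roughly 10% of the data
    println(filtered.getNumPartitions) // still 10
    val compacted = filtered.coalesce(2, false) // merge down to 2 partitions, no shuffle
    println(compacted.getNumPartitions) // 2
  }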

 

Java

	private static void reparation(JavaSparkContext sc) {
		JavaRDD<Integer> numRdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 3);
		System.out.println(numRdd.getNumPartitions());// 3
		JavaRDD<Integer> rdd1 = numRdd.repartition(5);
		System.out.println(rdd1.getNumPartitions());// 5
		JavaRDD<Integer> c1Rdd = numRdd.coalesce(5, false);
		System.out.println(c1Rdd.getNumPartitions());// 3
		JavaRDD<Integer> c2Rdd = numRdd.coalesce(5, true);
		System.out.println(c2Rdd.getNumPartitions());// 5
		JavaRDD<Integer> c3Rdd = numRdd.coalesce(2, false);
		System.out.println(c3Rdd.getNumPartitions());// 2

	}

 

5.union

scala

 def union(sc: SparkContext): Unit = {
    val numRdd1: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5), 1)
    val numRdd2: RDD[Int] = sc.parallelize(Array(4, 5, 6, 7, 8), 3)
    val resRdd: RDD[Int] = numRdd1.union(numRdd2)
    resRdd.foreach(println(_))
    println("resNumPartitons  :" + resRdd.getNumPartitions)
  }
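
Note that union keeps duplicates (4 and 5 appear twice here) and the result has 1 + 3 = 4 partitions; for a set union in the mathematical sense, follow it with distinct. A small sketch:

  def unionDistinct(sc: SparkContext): Unit = {
    val numRdd1: RDD[Int] = sc.parallelize(Array(1, 2, 3, 4, 5), 1)
    val numRdd2: RDD[Int] = sc.parallelize(Array(4, 5, 6, 7, 8), 3)
    // distinct() removes the duplicated elements (at the cost of a shuffle)
    val setUnion: RDD[Int] = numRdd1.union(numRdd2).distinct()
    println(setUnion.collect().sorted.mkString(", ")) // 1, 2, 3, 4, 5, 6, 7, 8
  }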

java

	private static void union(JavaSparkContext sc) {
		JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5), 1);
		JavaRDD<Integer> rdd2 = sc.parallelize(Arrays.asList(4, 5, 6, 7, 8), 3);
		JavaRDD<Integer> resRdd = rdd1.union(rdd2);
		// System.out.println(resRdd.collect());
		System.out.println(resRdd.getNumPartitions());
		resRdd.foreach(new VoidFunction<Integer>() {

			/**
			 * 
			 */
			private static final long serialVersionUID = 1L;

			@Override
			public void call(Integer t) throws Exception {
				System.out.println(t);

			}
		});
	}

 

6.cartesian

scala

def cartesian(sc: SparkContext): Unit = {
    val rdd01: RDD[String] = sc.parallelize(Array("张学友", "刘德华", "黎明", "郭富城"), 2)
    val rdd02: RDD[String] = sc.parallelize(Array("咖喱", "辣椒", "香肠"), 3)
    val resRdd: RDD[(String, String)] = rdd01.cartesian(rdd02)
    resRdd.foreach(println(_))
    println(resRdd.getNumPartitions)
  }
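
cartesian pairs every element of the first RDD with every element of the second, so both the element count and the partition count multiply (here 4 × 3 = 12 pairs across 2 × 3 = 6 partitions); it grows quickly on large inputs. A small check of those numbers:

  def cartesianSize(sc: SparkContext): Unit = {
    val rdd01: RDD[String] = sc.parallelize(Array("a", "b", "c", "d"), 2)
    val rdd02: RDD[String] = sc.parallelize(Array("x", "y", "z"), 3)
    val res = rdd01.cartesian(rdd02)
    println(res.count()) // 12 = 4 * 3
    println(res.getNumPartitions) // 6 = 2 * 3
  }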

java

private static void cartesian(JavaSparkContext sc) {
		JavaRDD<String> rdd1 = sc.parallelize(Arrays.asList("一色", "雪乃", "团子", "彩加"), 2);
		JavaRDD<String> rdd2 = sc.parallelize(Arrays.asList("比企谷", "八幡", "大老师"), 3);
		JavaPairRDD<String, String> resRdd = rdd1.cartesian(rdd2);
		System.out.println(resRdd.getNumPartitions());
		System.out.println(resRdd.collect());
		resRdd.foreach(new VoidFunction<Tuple2<String, String>>() {

			@Override
			public void call(Tuple2<String, String> t) throws Exception {
				System.out.println(t);
			}
		});

	}

 

 

Scala: package and main

package day03

import org.apache.log4j.{Level, Logger}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

object Demo1{

def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)

    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[2]")
    val sc = new SparkContext(conf)
    // mapPartitions(sc)
    // mapPartitionsWithIndex(sc)
    // sample(sc)
    // reparation(sc)
    // union(sc)    // union: takes the set union, as in mathematics (duplicates kept)
    cartesian(sc)   // Cartesian product

    sc.stop()
  }
}

 

Java: package and main

 

package day03;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class Demo01 {

	public static void main(String[] args) {
		Logger.getLogger("org").setLevel(Level.ERROR);

		SparkConf conf = new SparkConf().setAppName("Demo01").setMaster("local[2]");
		JavaSparkContext sc = new JavaSparkContext(conf);

		// mapPartitions(sc);
		// mapPartitionsWithIndex(sc);
		mapPartitionsWithIndex1(sc);
		// sample(sc);
		// reparation(sc);
		// union(sc); // union: takes the set union, as in mathematics (duplicates kept)
		// cartesian(sc); // Cartesian product

		sc.stop();
	}

	// the private static helper methods shown in the sections above belong in this class
}

 
