Spark Operators (4)

Point 1:MapPartitonsWithIndexOperator

package com.spark.operator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;

public class MapPartitonsWithIndexOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("MapPartitonsWithIndexOperator").setMaster(
                "local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Prepare some sample data
        List<String> names = Arrays
                .asList("xurunyun", "liangyongqi", "wangfei");

        JavaRDD<String> nameRDD = sc.parallelize(names,2);
        // Even if we did not specify a parallelism of 2 here, the default would also be 2 (local[2])

        // When parallelizing the collection we set the parallelism to 2, i.e. numPartitions is 2.
        // That means the three names above will be split across two partitions.
        // How they are split is decided by Spark, not by us.

        // What if we want to know which elements ended up in the same partition?
        // The mapPartitionsWithIndex operator exposes the index of each partition.

        JavaRDD<String> nameWithPartitonIndex = nameRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<String>, Iterator<String>>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterator<String> call(Integer index, Iterator<String> iterator)
                    throws Exception {
                List<String> list = new ArrayList<String>();
                while(iterator.hasNext()){
                    String name = iterator.next();
                    String result = index + " : " + name;
                    list.add(result);
                }
                return list.iterator();
            }
        }, true);

        nameWithPartitonIndex.foreach(new VoidFunction<String>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(String result) throws Exception {
                System.out.println(result);
            }
        });

        sc.close();
    }
}
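
If the goal is only to see which names landed in the same partition, JavaRDD's glom() (which wraps each partition into a single List) gives the same information with less code. A minimal sketch, an alternative to the author's approach, assuming the same nameRDD as above and running before sc.close():

        // glom(): one List per partition, so collect() returns one List per partition
        List<List<String>> partitions = nameRDD.glom().collect();
        for (int i = 0; i < partitions.size(); i++) {
            System.out.println("partition " + i + " -> " + partitions.get(i));
        }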

Point 2:MapPartitionsOperator

package com.spark.operator;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

// Note why final is used below: scoreMap is captured by an anonymous inner class, so it must be final (or effectively final on Java 8+).

public class MapPartitionsOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JoinOperator")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // Prepare some sample data
        List<String> names = Arrays.asList("xurunyun","liangyongqi","wangfei");
        JavaRDD<String> nameRDD = sc.parallelize(names);

        final Map<String, Integer> scoreMap = new HashMap<String, Integer>();
        scoreMap.put("xurunyun", 150);
        scoreMap.put("liangyongqi", 100);
        scoreMap.put("wangfei", 90);

        // mapPartitions
        // The map operator processes a partition's records one at a time.
        // The mapPartitions operator processes all records of a partition in a single call.

        // Recommended usage:
        // If each partition of the RDD is not too large, using mapPartitions instead of map can speed things up.
        // But with, say, 10 billion records and 1 billion records per partition, mapPartitions is not recommended,
        // because buffering a whole partition's output in memory can cause an OutOfMemoryError.

        JavaRDD<Integer> scoreRDD = nameRDD.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Iterable<Integer> call(Iterator<String> iterator)
                    throws Exception {
                List<Integer> list = new ArrayList<Integer>();

                while(iterator.hasNext()){
                    String name = iterator.next();
                    Integer score = scoreMap.get(name);
                    list.add(score);
                }

                return list;
            }
        });

        scoreRDD.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer score) throws Exception {
                System.out.println(score);
            }
        });

        sc.close();
    }
}
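
For contrast with the comments above, here is the same score lookup written with map, which invokes the function once per element rather than once per partition. A minimal sketch, assuming the same nameRDD and scoreMap plus an extra import of org.apache.spark.api.java.function.Function:

        JavaRDD<Integer> scoreByMap = nameRDD.map(new Function<String, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(String name) throws Exception {
                // called once for each name, not once per partition
                return scoreMap.get(name);
            }
        });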

Point 3:MapOperator

package com.spark.operator;

import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

public class MapOperator {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("LineCount")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<Integer> numbers = Arrays.asList(1,2,3,4,5);
        JavaRDD<Integer> numberRDD = sc.parallelize(numbers);
        // map applies the function to each element individually
        JavaRDD<Integer> results = numberRDD.map(new Function<Integer, Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public Integer call(Integer number) throws Exception {
                return number * 10;
            }
        });

        results.foreach(new VoidFunction<Integer>() {

            private static final long serialVersionUID = 1L;

            @Override
            public void call(Integer result) throws Exception {
                System.out.println(result);
            }
        });

        sc.close();
    }
}
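
On Java 8+, the same transformation can be written with a lambda, since Function is a single-method interface. A minimal sketch, assuming the same numberRDD as above:

        // equivalent to the anonymous Function above
        JavaRDD<Integer> timesTen = numberRDD.map(n -> n * 10);
        for (Integer r : timesTen.collect()) {
            System.out.println(r);
        }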