JavaSpark | Operators



1. Transformation Operators

  Transformations are a class of operators (functions) that transform one RDD into another, such as map, flatMap, and reduceByKey. Transformation operators are lazily executed (also called lazy loading): they only describe the computation and do not run until an action triggers them.
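  Lazy execution is easy to observe: calling a transformation alone starts no computation. Below is a minimal sketch (the class name Operator_lazy and the print statements are illustrative additions, not part of the original examples):

package com.shsxt.java_Test.core.transform_operator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class Operator_lazy {

	public static void main(String[] args) {

		SparkConf conf = new SparkConf();
		conf.setMaster("local");
		conf.setAppName("lazy");
		JavaSparkContext jsc = new JavaSparkContext(conf);

		// The map transformation is only recorded here; the function inside
		// does not run yet, because no action has been triggered.
		JavaRDD<String> mapped = jsc.textFile("data/word.txt").map(s -> {
			System.out.println("map is executing: " + s);
			return s;
		});

		System.out.println("--- nothing printed by map so far ---");

		// count() is an action: only now does the map function above execute.
		mapped.count();

		jsc.stop();
	}
}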

1.1 The filter Operator

  filter: filters records by a predicate; records for which the function returns true are kept, and those returning false are dropped.
Operator_filter.java

package com.shsxt.java_Test.core.transform_operator;


import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

/**
 * filter
 * Keeps the records for which the predicate returns true; drops those returning false.
 *
 */
public class Operator_filter {
   
	public static void main(String[] args) {
   
		/**
		 * The SparkConf object sets the environment parameters for running Spark:
		 * 1. the run mode (master)
		 * 2. the application name
		 * 3. the resource requirements of the application
		 */
		SparkConf conf = new SparkConf();
		conf.setMaster("local");
		conf.setAppName("filter");
		/**
		 * The JavaSparkContext object is the context in which Spark runs and the
		 * only channel to the cluster.
		 */
		JavaSparkContext jsc = new JavaSparkContext(conf);
		JavaRDD<String> lines = jsc.textFile("data/word.txt");
		JavaRDD<String> resultRDD = lines.filter(new Function<String, Boolean>() {
   
			/**
			 * Anonymous inner class: override call with the filter predicate.
			 */
			@Override
			public Boolean call(String line) throws Exception {
   
				return !line.contains("shsxt");
			}
			
		});
		
		resultRDD.foreach(new VoidFunction<String>() {
   
			@Override
			public void call(String line) throws Exception {
   
				System.out.println(line);
			}
		});
		jsc.stop();
	}
}

hello tiantian
hello gzsxt
hello Spark
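  For reference, Spark's Java function interfaces are functional interfaces, so the same predicate can also be written as a Java 8 lambda (an equivalent one-line sketch of the anonymous class above):

		JavaRDD<String> resultRDD = lines.filter(line -> !line.contains("shsxt"));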
1.2 The map Operator

  map: maps each element of an RDD to a new element through the function passed to map.
Characteristic: one input record produces exactly one output record.

package com.shsxt.java_Test.core.transform_operator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

/**
 * map
 * Processes each element with the supplied function and returns a new dataset.
 * Characteristic: one input, one output.
 *
 *
 * @author root
 *
 */
public class Operator_map {
   
	public static void main(String[] args) {
   
		SparkConf conf = new SparkConf();
		conf.setMaster("local");
		conf.setAppName("map");
		JavaSparkContext jsc = new JavaSparkContext(conf);
		JavaRDD<String> line = jsc.textFile("data/word.txt");
		JavaRDD<String> mapResult = line.map(new Function<String, String>() {
   
			@Override
			public String call(String s) throws Exception {
   
				return s+"~";
			} 
		});
		
		mapResult.foreach(new VoidFunction<String>() {
   
			@Override
			public void call(String t) throws Exception {
   
				System.out.println(t);
			}
		});
		
		jsc.stop();
	}
}

hello tiantian~
hello shsxt~
hello gzsxt~
hello Spark~
1.3 The flatMap Operator

   flatMap: first map, then flatten. Like map, but each input item can be mapped to zero or more output items.

package com.shsxt.java_Test.core.transform_operator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.VoidFunction;

import java.util.Arrays;
import java.util.Iterator;

/**
 * flatMap
 * One input record yields zero or more output records.
 * @author root
 *
 */
public class Operator_flatMap {
   
	public static void main(String[] args) {
   
		SparkConf conf = new SparkConf();
		conf.setMaster("local");
		conf.setAppName("flatMap");

		JavaSparkContext jsc = new JavaSparkContext(conf);
		JavaRDD<String> lines = jsc.textFile("./data/word.txt",3);
		JavaRDD<String> flatMapResult = lines.flatMap(new FlatMapFunction<String, String>() {
   
			@Override
			public Iterator<String> call(String s) throws Exception {
   
				
				return Arrays.asList(s.split(" ")).iterator();
			}
			
		});

        flatMapResult.foreach(new VoidFunction<String>() {
   
			@Override
			public void call(String t) throws Exception {
   
				System.out.println(t);
			}
		});
		
		jsc.stop();
	}
}

hello
tiantian
hello
shsxt
hello
gzsxt
hello
Spark
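  Had map been used with split(" ") instead, the result would be an RDD whose elements are whole String arrays; flatMap flattens those arrays so that each word becomes its own record, which is what the output above shows.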
1.4 The sample Operator

   sample: random sampling operator; samples the data with or without replacement according to the fraction passed in.

package com.shsxt.java_Test.core.transform_operator;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class Operator_sample {
   
	public static void main(String[] args) {
   
		SparkConf conf = new SparkConf();
		conf.setMaster("local");
		conf.setAppName("sample");
		
		JavaSparkContext jsc = new JavaSparkContext(conf);
		JavaRDD<String> lines = jsc.textFile("data/word.txt");
		JavaPairRDD<String, Integer> flatMapToPair = lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
   
			@Override
			public Iterator<Tuple2<String, Integer>> call(String t)
					throws Exception {
   
				List<Tuple2<String,Integer>> tupleList = new ArrayList<Tuple2<String,Integer>>();
				tupleList.add(new Tuple2<String,Integer>(t,1));
				return tupleList.iterator();
			}
		});

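		// sample(withReplacement, fraction, seed): true = with replacement, 0.3 = expected fraction, 4 = random seed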
		JavaPairRDD<String, Integer> sampleResult = flatMapToPair.sample(true,0.3,4);

		sampleResult.foreach(x-> System.out.println(x));

		jsc.stop();
	}
}
(hello shsxt,1)
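  Note that the fraction (0.3 here) is only the expected proportion of records to sample, not an exact count, so the number of returned records can vary; the fixed seed (4) makes a given run reproducible.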
1.5 The reduceByKey Operator

   reduceByKey: merges all the values of the same key according to the supplied logic.

package com.shsxt.java_Test.core.transform_operator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;


public class Operator_reduceByKey {
   

    public static void main(String[] args) {
   
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("reduceByKey");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaPairRDD<String, Integer> parallelizePairs = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>("a", 1),
                new Tuple2<>("a", 1),
                new Tuple2<>("a", 1),
                new Tuple2<>("a", 1),
                new Tuple2<>("a", 1),
                new Tuple2<>("a", 1),
                new Tuple2<>("a", 1)
        ), 2);
        
        parallelizePairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
   
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
   
                System.out.println("v1: " +v1 + " v2: " + v2);
                return v1 + v2;
            }
        }).foreach(new VoidFunction<Tuple2<String, Integer>>() {
   
            @Override
            public void call(Tuple2<String, Integer> tuple2) throws Exception {
   
                System.out.println(tuple2);
            }
        });
        sc.stop();
    }
}

v1: 1 v2: 1
v1: 2 v2: 1
v1: 1 v2: 1
v1: 2 v2: 1
v1: 3 v2: 1
v1: 3 v2: 4
(a,7)
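  Because the pairs were parallelized into 2 partitions, reduceByKey first combines the values inside each partition (producing the partial sums 3 and 4) and only then merges the partial results across partitions, which is why the last combine printed is v1: 3 v2: 4.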
1.6 The sortByKey and sortBy Operators

   sortByKey: works on RDDs in K,V format and sorts them by key in ascending or descending order.
The Java Spark API provides no sortBy on pair RDDs; the same effect can be achieved by combining sortByKey with mapToPair (swap key and value, sort, swap back), as the example below shows.

package com.shsxt.java_Test.core.transform_operator;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

public class Operator_sortByKey {
   
	public static void main(String[] args) {
   
		SparkConf conf = new SparkConf();
		conf.setMaster("local");
		conf.setAppName("sortByKey");
		JavaSparkContext jsc = new JavaSparkContext(conf);
		JavaRDD<String> lines = jsc.textFile("data/word.txt");
		JavaRDD<String> flatMap = lines.flatMap(new FlatMapFunction<String, String>() {
   
			@Override
			public Iterator<String> call(String t) throws Exception {
   
				return Arrays.asList(t.split(" ")).iterator();
			}
		});
		JavaPairRDD<String, Integer> mapToPair = flatMap.mapToPair(new PairFunction<String, String, Integer>() {
   
			@Override
			public Tuple2<String, Integer> call(String s) throws Exception {
   
				return new Tuple2<String, Integer>(s, 1);
			}
		});

        mapToPair.sortByKey().foreach(new VoidFunction<Tuple2<String, Integer>>() {
   
            @Override
            public void call(Tuple2<String, Integer> tuple2) throws Exception {
   
                System.out.println(tuple2);
            }
        });
		
		JavaPairRDD<String, Integer> reduceByKey = mapToPair.reduceByKey(new Function2<Integer, Integer, Integer>() {
   
			@Override
			public Integer call(Integer v1, Integer v2) throws Exception {
   
				return v1+v2;
			}
		});
		// To sort by value: swap key and value, sort by the new key, then swap back
		reduceByKey.mapToPair(new PairFunction<Tuple2<String,Integer>, Integer, String>() {
   
			@Override
			public Tuple2<Integer, String> call(Tuple2<String, Integer> t)
					throws Exception {
   
				return new Tuple2<Integer, String>(t._2, t._1);
			}
		}).sortByKey(false).mapToPair(new PairFunction<Tuple2<Integer,String>, String, Integer>() {
   
			@Override
			public Tuple2<String, Integer> call(Tuple2<Integer, String> t)
					throws Exception {
   
				return new Tuple2<String, Integer>(t._2, t._1);
			}
		}).foreach(new VoidFunction<Tuple2<String, Integer>>() {

			@Override
			public void call(Tuple2<String, Integer> tuple2) throws Exception {

				System.out.println(tuple2);
			}
		});

		jsc.stop();
	}
}
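  With the word.txt contents used in the earlier examples, this descending sort by count should print (hello,4) first, followed by the words that occur once.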