Simple Spark examples (Scala and Java)

This article collects a number of small examples of working with Spark, including sorting a key-value RDD by key, the subtract and subtractByKey operations, sumApprox, sampling, taking the first n elements, and taking the largest values, illustrating Spark's basic data-processing operations. The Scala versions come first, followed by Java versions of several of them.

import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by spark on 15-1-19.
 * Sort a key-value RDD by key and return the sorted RDD.
 */
object SortByKey {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)
    import org.apache.spark.SparkContext._

    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"))
    val b = sc.parallelize(1 to a.count().toInt)
    val c = a.zip(b)

    // ascending
    c.sortByKey(true).collect().foreach(print)
    // descending
    c.sortByKey(false).collect().foreach(print)
  }
}
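
The Java section later in this post does not cover sortByKey. Below is a minimal Java sketch of the same idea; the class name SortByKeyJavaDemo and the choice of keying each word by its first letter are illustrative assumptions, not part of the original post.

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

public class SortByKeyJavaDemo {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("spark-demo").setMaster("local"));
        JavaRDD<String> words = ctx.parallelize(Arrays.asList("dog", "cat", "owl", "gnu", "ant"));
        // Key each word by its first letter, then sort by key.
        JavaPairRDD<String, String> pairs = words.mapToPair(new PairFunction<String, String, String>() {
            public Tuple2<String, String> call(String w) throws Exception {
                return new Tuple2<String, String>(w.substring(0, 1), w);
            }
        });
        System.out.println(pairs.sortByKey(true).collect());   // ascending: ant, cat, dog, gnu, owl
        System.out.println(pairs.sortByKey(false).collect());  // descending
        ctx.stop();
    }
}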

/**
 * Created by spark on 15-1-19.
 * RDD1.subtract(RDD2): returns a new RDD containing the elements that are in RDD1 but not in RDD2.
 */
object Subtract {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)
    import org.apache.spark.SparkContext._

    val a = sc.parallelize(1 to 10)
    val b = sc.parallelize(1 to 3)
    // prints 45678910
    // a.subtract(b).collect().foreach(print)

    val c = sc.parallelize(1 to 10)
    val d = sc.parallelize(List(1, 2, 3, 11))
    // prints 45678910 (11 exists only in d, so it does not appear)
    c.subtract(d).collect().foreach(print)
  }
}

/**
 * Created by spark on 15-1-19.
 * RDD1.subtractByKey(RDD2): returns a new RDD containing the pairs from RDD1 whose keys do not appear in RDD2.
 */
object SubtractByKey {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)
    import org.apache.spark.SparkContext._

    val a = sc.parallelize(List("dog", "he", "word", "hello"))
    val b = a.keyBy(_.length)
    val c = sc.parallelize(List("cat", "first", "everyone"))
    val d = c.keyBy(_.length)
    // prints (2,he)(4,word) -- the keys 3 and 5 also exist in d
    b.subtractByKey(d).collect().foreach(print)
  }
}

/**
 * Created by spark on 15-1-19.
 * sumApprox did not produce the result I expected here, likely because with a
 * timeout of 0 ms the approximate job is stopped immediately, so the estimate
 * can be far from the true sum.
 */
object SumAndSumApprox {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)
    import org.apache.spark.SparkContext._

    val a = sc.parallelize(1 to 1000000)
    val b = a.sum()
    // sumApprox(timeout in ms, confidence) returns a PartialResult
    val c = a.sumApprox(0L, 0.9).getFinalValue()
    println(b + " *** " + c)
  }
}
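
For comparison, here is a hedged Java sketch of the same experiment with a non-zero timeout, so the estimate has time to converge. The class name and the 1000 ms / 0.9 values are assumptions for illustration only.

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaDoubleRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.partial.BoundedDouble;

public class SumApproxJavaDemo {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("spark-demo").setMaster("local"));
        List<Double> nums = new ArrayList<Double>();
        for (int i = 1; i <= 1000000; i++) {
            nums.add((double) i);
        }
        JavaDoubleRDD a = ctx.parallelizeDoubles(nums);
        double exact = a.sum();
        // Wait up to 1000 ms for an estimate at 90% confidence.
        BoundedDouble approx = a.sumApprox(1000L, 0.9).getFinalValue();
        System.out.println(exact + " *** " + approx.mean()
                + " [" + approx.low() + ", " + approx.high() + "]");
        ctx.stop();
    }
}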

/**
 * Created by spark on 15-1-19.
 * Take the first n elements of an RDD and return them as an array.
 */
object Take {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(1 to 1000000)
    // prints 12345678910
    a.take(10).foreach(print)
  }
}

/**
 * Created by spark on 15-1-19.
 * Sort the RDD elements in ascending order and return the first n as an array.
 */
object TakeOrdered {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(List("ff", "aa", "dd", "cc"))
    // prints aacc
    a.takeOrdered(2).foreach(print)
  }
}
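
A minimal Java sketch of takeOrdered (the class name is an assumption; takeOrdered(n) returns the smallest n elements in natural order as a list):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TakeOrderedJavaDemo {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("spark-demo").setMaster("local"));
        JavaRDD<String> a = ctx.parallelize(Arrays.asList("ff", "aa", "dd", "cc"));
        // prints [aa, cc]
        System.out.println(a.takeOrdered(2));
        ctx.stop();
    }
}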

/**
 * Created by spark on 15-1-19.
 * Sampling data from an RDD.
 */
object TakeSample {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(1 to 10000)
    // Sample output: 9048 5358 5216 7301 6303 6179 6151 5304 8115 3869
    a.takeSample(true, 10, 1).foreach(println)
  }
}
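
The Java API offers the same sampling call. A minimal sketch (the class name and the sample size / seed values mirror the Scala example above):

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class TakeSampleJavaDemo {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("spark-demo").setMaster("local"));
        List<Integer> nums = new ArrayList<Integer>();
        for (int i = 1; i <= 10000; i++) {
            nums.add(i);
        }
        JavaRDD<Integer> a = ctx.parallelize(nums);
        // withReplacement = true, sample size 10, fixed seed 1 for reproducibility
        for (Integer n : a.takeSample(true, 10, 1)) {
            System.out.println(n);
        }
        ctx.stop();
    }
}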

/**
 * Created by spark on 15-1-19.
 * Print detailed debug (lineage) information for an RDD.
 */
object ToDebugString {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(1 to 9)
    val b = sc.parallelize(1 to 3)
    val c = a.subtract(b)
    // toDebugString returns the lineage as a String; print it to see it
    println(c.toDebugString)
  }
}
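
The same lineage string is available from the Java API via toDebugString(). A minimal sketch (class name assumed):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ToDebugStringJavaDemo {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("spark-demo").setMaster("local"));
        JavaRDD<Integer> a = ctx.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9));
        JavaRDD<Integer> b = ctx.parallelize(Arrays.asList(1, 2, 3));
        // subtract() produces a new RDD whose lineage includes both parents.
        System.out.println(a.subtract(b).toDebugString());
        ctx.stop();
    }
}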

/**
 * Created by spark on 15-1-19.
 * Get the n largest elements.
 */
object Top {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(1 to 1000)
    val c = a.top(10)
    // prints 1000 999 998 997 996 995 994 993 992 991 (one per line)
    c.foreach(println)
  }
}

/**
 * Union (same as ++): combine two RDDs into a new RDD. Duplicates are kept;
 * use distinct() afterwards if you need them removed.
 */
object Union {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("spark-demo").setMaster("local")
    val sc = new SparkContext(conf)

    val a = sc.parallelize(1 to 3)
    val b = sc.parallelize(3 to 5)
    val c = a.union(b)
    val d = a ++ b
    // prints 123345
    c.collect().foreach(print)
    // prints 123345
    d.collect().foreach(print)
  }
}
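
The Java section below does not cover union either. A minimal Java sketch (the class name is an assumption); distinct() is shown separately because union() itself keeps duplicates:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class UnionJavaDemo {
    public static void main(String[] args) {
        JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("spark-demo").setMaster("local"));
        JavaRDD<Integer> a = ctx.parallelize(Arrays.asList(1, 2, 3));
        JavaRDD<Integer> b = ctx.parallelize(Arrays.asList(3, 4, 5));
        JavaRDD<Integer> c = a.union(b);
        System.out.println(c.collect());            // [1, 2, 3, 3, 4, 5]
        System.out.println(c.distinct().collect()); // duplicates removed (order not guaranteed)
        ctx.stop();
    }
}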

--Java

package com.demo.sparkWordCount;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/*
 * Ming Z M LI
 */
public class FunctionDemo {

    /*
     * Create the JavaSparkContext shared by the demos below.
     */
    public static JavaSparkContext createContext() {
        SparkConf sparkConf = new SparkConf().setAppName("FunctionDemo").setMaster("local[*]");
        return new JavaSparkContext(sparkConf);
    }

    public static void main(String[] args) {
        // Only demo5() runs as written; switch to demo2(), demo3() or demo4() to try the others.
        demo5();
    }

    /*
     * RDD1.subtract(RDD2): returns a new RDD containing the elements that are
     * in RDD1 but not in RDD2.
     */
    public static void demo2() {
        JavaSparkContext ctx = createContext();

        List<String> list1 = new ArrayList<String>();
        list1.add("hello1");
        list1.add("hello2");
        list1.add("hello3");
        list1.add("hello4");

        List<String> list2 = new ArrayList<String>();
        list2.add("hello3");
        list2.add("hello4");
        list2.add("world5");
        list2.add("world6");

        JavaRDD<String> a = ctx.parallelize(list1);
        JavaRDD<String> b = ctx.parallelize(list2);

        // prints hello1 and hello2
        a.subtract(b).foreach(new VoidFunction<String>() {
            public void call(String t) throws Exception {
                System.out.println(t);
            }
        });
    }

    /**
     * Created by spark on 15-1-19.
     * RDD1.subtractByKey(RDD2): returns a new RDD containing the pairs from RDD1
     * whose keys do not appear in RDD2.
     * Here c holds (3, cat) (5, hello) (4, bird) (4, bird) and d has the keys 3, 5 and 7,
     * so the output is (4, bird) (4, bird).
     */
    public static void demo3() {
        JavaSparkContext ctx = createContext();

        JavaRDD<String> a = ctx.parallelize(new ArrayList<String>(Arrays.asList("cat", "hello", "bird", "bird")));
        JavaRDD<String> b = ctx.parallelize(new ArrayList<String>(Arrays.asList("cat", "hello", "testing")));

        // Key each word by its length.
        JavaPairRDD<Integer, String> c = a.keyBy(new Function<String, Integer>() {
            public Integer call(String v1) throws Exception {
                return v1.length();
            }
        });

        // Uncomment to inspect the keyed pairs in c:
        // c.foreach(new VoidFunction<Tuple2<Integer, String>>() {
        //     public void call(Tuple2<Integer, String> t) throws Exception {
        //         System.out.println("(" + t._1 + ", " + t._2 + ")");
        //     }
        // });

        JavaPairRDD<Integer, String> d = b.keyBy(new Function<String, Integer>() {
            public Integer call(String v1) throws Exception {
                return v1.length();
            }
        });

        c.subtractByKey(d).foreach(new VoidFunction<Tuple2<Integer, String>>() {
            public void call(Tuple2<Integer, String> t) throws Exception {
                System.out.println("(" + t._1 + ", " + t._2 + ")");
            }
        });
    }

    /**
     * Take the first n elements of the RDD and return them as a list.
     */
    public static void demo4() {
        JavaSparkContext ctx = createContext();

        JavaRDD<String> a = ctx.parallelize(new ArrayList<String>(Arrays.asList("1", "4", "2", "3")));
        // prints 1, 4, 2 (the first three elements in partition order)
        List<String> b = a.take(3);
        for (String c : b) {
            System.out.println(c);
        }
    }

    /**
     * Get the n largest elements. Output: hello, then 3
     * (strings are compared lexicographically, so "hello" > "3" > "2" > "1").
     */
    public static void demo5() {
        JavaSparkContext ctx = createContext();

        JavaRDD<String> a = ctx.parallelize(new ArrayList<String>(Arrays.asList("1", "hello", "2", "3")));
        List<String> b = a.top(2);
        for (String c : b) {
            System.out.println(c);
        }
    }
}

Source: https://www.cnblogs.com/MarchThree/p/5059649.html
