直接上代码
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.VoidFunction;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class demo1 {

    /**
     * Creates a local-mode JavaSparkContext used by the demo methods below.
     * Uses all available cores ("local[*]") with app name "sparktest".
     *
     * @return a new JavaSparkContext; callers are responsible for stopping it
     */
    public static JavaSparkContext createContext() {
        SparkConf conf = new SparkConf()
                .setMaster("local[*]")
                .setAppName("sparktest");
        return new JavaSparkContext(conf);
    }

    public static void main(String[] args) {
        // method1();
        // method2();
        method3();
    }

    /**
     * demo1: prints the elements that exist only in the first collection
     * (set difference via RDD.subtract).
     * list1 : hello1,hello2,hello3,hello4
     * list2 : hello3,hello4,world5,world6
     * Expected output: hello1, hello2 (order not guaranteed across partitions).
     */
    public static void method1() {
        JavaSparkContext jsc = createContext();
        try {
            List<String> list1 = Arrays.asList("hello1", "hello2", "hello3", "hello4");
            List<String> list2 = Arrays.asList("hello3", "hello4", "world5", "world6");
            JavaRDD<String> a = jsc.parallelize(list1);
            JavaRDD<String> b = jsc.parallelize(list2);
            // subtract keeps elements of a that do not appear in b
            JavaRDD<String> subtract = a.subtract(b);
            subtract.foreach(new VoidFunction<String>() {
                @Override
                public void call(String s) throws Exception {
                    System.out.println(s);
                }
            });
        } finally {
            // always release the local Spark context and its resources
            jsc.stop();
        }
    }

    /**
     * Takes the first n elements of the RDD and returns them to the driver
     * as a List (RDD.take). Here n = 3, so it prints: 3, 2, 5.
     */
    public static void method2() {
        JavaSparkContext jsc = createContext();
        try {
            JavaRDD<String> rdd = jsc.parallelize(Arrays.asList("3", "2", "5", "6", "8", "0"));
            List<String> take = rdd.take(3);
            for (String s : take) {
                System.out.println(s);
            }
        } finally {
            jsc.stop();
        }
    }

    /**
     * Returns the n largest elements by natural (lexicographic, for String)
     * ordering via RDD.top. Here n = 4, so it prints: 8, 6, 5, 3.
     */
    public static void method3() {
        JavaSparkContext jsc = createContext();
        try {
            JavaRDD<String> rdd = jsc.parallelize(Arrays.asList("3", "2", "5", "6", "8", "0"));
            List<String> top = rdd.top(4);
            for (String s : top) {
                System.out.println(s);
            }
        } finally {
            jsc.stop();
        }
    }
}
pom.xml 依赖与构建配置如下：
<dependencies>
<!-- Unit-testing only; not packaged into the artifact (scope=test) -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- Spark core API used by demo1 (JavaSparkContext, JavaRDD).
     NOTE: "_2.10" is the Scala binary version — all spark-* artifacts
     below must share the same Scala suffix and Spark version. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
<version>1.6.0</version>
</dependency>
<!-- Spark SQL — not used by the demo1 code shown above -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
<version>1.6.0</version>
</dependency>
<!-- Hadoop client libraries; 2.6.0 matches the Hadoop line Spark 1.6 was built against -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.6.0</version>
</dependency>
<!-- Spark Streaming — not used by the demo1 code shown above -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.10</artifactId>
<version>1.6.0</version>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
<plugins>
<!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.1.0</version>
</plugin>
<!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<!-- NOTE(review): no <source>/<target> configuration here — the compiler
     plugin will use its default language level; confirm it matches the
     JDK level this project expects. -->
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.8.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.1</version>
</plugin>
<plugin>
<artifactId>maven-jar-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
<!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
<plugin>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
</plugin>
<plugin>
<artifactId>maven-project-info-reports-plugin</artifactId>
<version>3.0.0</version>
</plugin>
</plugins>
</pluginManagement>
</build>