package day06;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import scala.Tuple2;

import java.util.*;

/**
 * Spark action-operator examples in Java: {@code take}, {@code countByKey},
 * and {@code saveAsTextFile}.
 */
public class myTake {

    /**
     * take(n): fetches the first n elements of the RDD back to the driver
     * and prints them.
     *
     * @param jsc the shared Spark context
     */
    public static void myTake(JavaSparkContext jsc) {
        JavaRDD<String> RDD1 = jsc.parallelize(Arrays.asList("aa", "aa", "bb", "cc", "dd"));
        List<String> take = RDD1.take(3);
        System.out.println(take);
    }

    /**
     * countByKey(): groups pair elements by key and prints how many values
     * each key has.
     *
     * @param jsc the shared Spark context
     */
    public static void myCountByKey(JavaSparkContext jsc) {
        List<Tuple2<String, String>> tuple2s = Arrays.asList(
                new Tuple2<>("class2", "liao"),
                new Tuple2<>("class2", "ao"),
                new Tuple2<>("class2", "li"),
                new Tuple2<>("class1", "lao"));
        // Parameterize the pair RDD: the original used a raw JavaPairRDD, which
        // made the assignment from countByKey() an unchecked conversion.
        JavaPairRDD<String, String> javaPairRDD = jsc.parallelizePairs(tuple2s);
        Map<String, Long> map = javaPairRDD.countByKey();
        for (Map.Entry<String, Long> entry : map.entrySet()) {
            System.out.println("k=" + entry.getKey() + "v=" + entry.getValue());
        }
    }

    /**
     * saveAsTextFile(): splits each line into words and writes the flattened
     * word RDD to HDFS.
     *
     * @param jsc the shared Spark context
     */
    public static void mySaveASTextFile(JavaSparkContext jsc) {
        List<String> list = Arrays.asList("Hello World", "Hello scala", "Hello Java");
        JavaRDD<String> parallelizeRDD = jsc.parallelize(list);
        // Anonymous class (not a lambda) keeps the example compatible with
        // pre-Java-8 toolchains; Iterator return type implies a Spark 2.x API.
        JavaRDD<String> flatMapRDD = parallelizeRDD.flatMap(
                new FlatMapFunction<String, String>() {
                    public Iterator<String> call(String v1) throws Exception {
                        return Arrays.asList(v1.split(" ")).iterator();
                    }
                });
        flatMapRDD.saveAsTextFile("hdfs://hadoop-1707-001:9000/save/test001");
        System.out.println("保存成功");
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("MyAction_Java ");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Disabled by default: requires a reachable HDFS cluster.
        // mySaveASTextFile(jsc);
        myCountByKey(jsc);
        jsc.stop();
    }
}
Spark operator examples implemented in Java (take, countByKey, saveAsTextFile)
Latest recommended article published on 2023-02-04 22:31:47