import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Created by hadoop on 17-9-20.
 */
public class JavaSparkTest {

    // Pattern used to split each line into words.
    private static final Pattern SPACE = Pattern.compile(" ");

    public static void main(String[] args) throws Exception {

        // Check whether an input path was supplied (disabled here because the path is hard-coded below).
        // if (args.length < 1) {
        //     System.err.println("Usage: JavaWordCount <file>");
        //     System.exit(1);
        // }

        // SparkSession is the unified entry point for Spark functionality.
        SparkSession spark = SparkSession
                .builder()
                .master("local")
                .appName("JavaWordCount")
                .getOrCreate();

        System.out.println("start=========");

        // Read the input file as an RDD of lines.
        JavaRDD<String> lines = spark.read()
                .textFile("file:///usr/local/spark/wordcount.txt")
                .javaRDD();
        System.out.println("lines partition count: " + lines.getNumPartitions());
        System.out.println("lines partitioner: " + lines.partitioner());

        // Split each line into words.
        JavaRDD<String> words = lines.flatMap(s -> Arrays.asList(SPACE.split(s)).iterator());
        words.collect().forEach(System.out::println);

        // Map each word to a (word, 1) pair.
        JavaPairRDD<String, Integer> ones = words.mapToPair(s -> new Tuple2<>(s, 1));

        // Repartition with the custom partitioner defined below; this RDD only demonstrates
        // the partitioner and is not used for the count.
        JavaPairRDD<String, Integer> tows = ones.partitionBy(new mupartition());
        System.out.println("custom partitioner: " + tows.getNumPartitions() + "--->" + tows.partitioner());

        ones.collect().forEach(System.out::println);

        // Sum the counts per word; reduceByKey on `ones` uses the default HashPartitioner.
        JavaPairRDD<String, Integer> counts = ones.reduceByKey((i1, i2) -> i1 + i2);
        System.out.println("counts partition count: " + counts.getNumPartitions());
        System.out.println("counts partitioner: " + counts.partitioner());

        List<Tuple2<String, Integer>> output = counts.collect();
        for (Tuple2<?, ?> tuple : output) {
            System.out.println(tuple._1() + ": " + tuple._2());
        }

        spark.stop();
    }

    // Custom partitioner with a fixed number of partitions.
    static class mupartition extends Partitioner {
        @Override
        public int numPartitions() {
            return 3;
        }

        @Override
        public int getPartition(Object key) {
            // Partition by the key's hash code; keys with a negative hash code
            // are clamped to partition 0.
            int index = key.hashCode() % numPartitions();
            if (index < 0) {
                index = 0;
            }
            return index;
        }
    }
}