Spark Streaming: counting lines and computing a TopN from Kafka over a custom time interval, and saving the results to HBase

Part 1: Count the number of lines produced to a Kafka topic within each 10-second interval and store the result in HBase

First create the corresponding table in HBase:
create 'linecount','count'
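
Optionally, the same table can be created from Java with the old HBaseAdmin API, which matches the HTable-based client used in the code further down. This is only a sketch: the class name is made up, and the cluster addresses are assumed from the program below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.HBaseAdmin;

public class CreateLineCountTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.8.71:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.8.71");
        HBaseAdmin admin = new HBaseAdmin(conf);
        // equivalent of: create 'linecount','count' in the HBase shell
        HTableDescriptor desc = new HTableDescriptor(TableName.valueOf("linecount"));
        desc.addFamily(new HColumnDescriptor("count"));  // default MAX VERSIONS is 1, which the job relies on
        if (!admin.tableExists("linecount")) {
            admin.createTable(desc);
        }
        admin.close();
    }
}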

Start the Kafka cluster and create the topic:
[hadoop@h71 kafka_2.10-0.8.2.0]$ bin/kafka-topics.sh --create --zookeeper h71:2181,h72:2181,h73:2181 --replication-factor 3 --partitions 2 --topic test

Start a console producer:

[hadoop@h71 kafka_2.10-0.8.2.0]$ bin/kafka-console-producer.sh --broker-list h71:9092,h72:9092,h73:9092 --topic test 
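
The console producer is enough for this walkthrough; if you would rather feed the topic from Java, a minimal sketch with the old Kafka 0.8 producer API (shipped with the kafka_2.10-0.8.2.0 distribution used here) could look like the following. The class name and the messages are only placeholders.

import java.util.Properties;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

public class SimpleTestProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("metadata.broker.list", "h71:9092,h72:9092,h73:9092");
        props.put("serializer.class", "kafka.serializer.StringEncoder");
        ProducerConfig config = new ProducerConfig(props);
        Producer<String, String> producer = new Producer<String, String>(config);
        // each message becomes one "line" for the streaming job
        producer.send(new KeyedMessage<String, String>("test", "hello world"));
        producer.send(new KeyedMessage<String, String>("test", "ni hao a"));
        producer.close();
    }
}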


Java code:

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import kafka.serializer.StringDecoder;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;

public class KafkaDirectWordCountPersistHBase {
    private static String beginTime = null;
    private static int cishu = 0;       // number of completed batches
    private static int interval = 0;    // batch interval in seconds
    private static String rowkey = null;

    public static Configuration getConfiguration() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.8.71:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.8.71");
        return conf;
    }

    public static void insert(String tableName, String rowKey, String family,
            String quailifer, String value) {
        try {
            HTable table = new HTable(getConfiguration(), tableName);
            Put put = new Put(rowKey.getBytes());
            put.add(family.getBytes(), quailifer.getBytes(), value.getBytes());
            table.put(put);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordcount").setMaster("local[2]");
        // How often to compute, in seconds; the interval used here is 10 seconds
        interval = 10;
//      JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(10000));    // milliseconds
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(interval)); // seconds
        // First build the Kafka parameter map
        Map<String, String> kafkaParams = new HashMap<String, String>();
        // The direct stream does not go through ZooKeeper, so we pass broker.list
        kafkaParams.put("metadata.broker.list", "192.168.8.71:9092,192.168.8.72:9092,192.168.8.73:9092");
        // Then create a set with the topics to read; the direct stream can read several topics in parallel
        Set<String> topics = new HashSet<String>();
        topics.add("test");
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(
            jssc,
            String.class,        // key type
            String.class,        // value type
            StringDecoder.class, // key decoder
            StringDecoder.class, // value decoder
            kafkaParams,
            topics);
        // The very first interval is not necessarily 10 seconds long; it can be shorter
        SimpleDateFormat time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        java.util.Date date = new java.util.Date();
        System.out.println("StreamingContext started->" + time.format(new Date()));
        beginTime = time.format(date);

        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Iterable<String> call(Tuple2<String, String> tuple) throws Exception {
                return Arrays.asList(tuple._2.split("\n")); // split by line
            }
        });

        JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>("line", 1);
            }
        });

        JavaPairDStream<String, Integer> wordcounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        wordcounts.print();
        wordcounts.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(JavaPairRDD<String, Integer> wordcountsRDD) throws Exception {
                SimpleDateFormat time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                java.util.Date date = new java.util.Date();
                System.out.println("endTime1-->" + time.format(new Date()));  // yyyy-MM-dd HH:mm:ss format
                final long endTime1 = System.currentTimeMillis();
                System.out.println("endTime1-->" + endTime1);  // epoch millis
                final String endTime = time.format(date);
                cishu++;
                System.out.println("cishu-->" + cishu);
                if (cishu == 1) {
                    rowkey = beginTime + "__" + endTime;
                    insert("linecount", rowkey, "count", "sum", "0");
                } else {
                    SimpleDateFormat hh1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    Date date1 = hh1.parse(endTime);
                    long hb = date1.getTime();
                    long a2 = hb - interval * 1000;
                    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    Date date2 = new Date(a2);
                    String beginTime1 = simpleDateFormat.format(date2);
                    rowkey = beginTime1 + "__" + endTime;
                    insert("linecount", rowkey, "count", "sum", "0");
                }
                // foreachPartition appears to run once per Kafka partition of the topic:
                // with two partitions it is executed twice
                wordcountsRDD.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public void call(Iterator<Tuple2<String, Integer>> wordcounts) throws Exception {
                        Tuple2<String, Integer> wordcount = null;
                        // Note: this relies on HBase overwriting the previous value when the same
                        // rowkey and column are written again, so the 'linecount' table must keep
                        // only 1 version (the default if you do not change it at table creation).
                        while (wordcounts.hasNext()) {
                            wordcount = wordcounts.next();
                            insert("linecount", rowkey, "count", "sum", wordcount._2.toString());
                        }
                    }
                });
            }
        });
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
}

After running the code in MyEclipse, type some data into the Kafka producer terminal:
hello world
ni hao a
hello spark
Note: if you paste these three lines into the producer, press Enter once more at the end; otherwise only two lines are actually sent.

After a while, enter some more data:
i
love
you
baby
,
come
on


Check the linecount table:

hbase(main):187:0> scan 'linecount'
ROW                                          COLUMN+CELL
 2017-07-26 17:27:56__2017-07-26 17:28:00    column=count:sum, timestamp=1501061244619, value=0
 2017-07-26 17:28:00__2017-07-26 17:28:10    column=count:sum, timestamp=1501061252476, value=3
 2017-07-26 17:28:10__2017-07-26 17:28:20    column=count:sum, timestamp=1501061262405, value=0
 2017-07-26 17:28:20__2017-07-26 17:28:30    column=count:sum, timestamp=1501061272420, value=7
4 row(s) in 0.3150 seconds
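
To check the stored counts from Java instead of the HBase shell, a minimal read-back sketch with the same old HBase client API might look like this. It assumes the configuration, table, and column names used above; the class name is made up and it is not part of the original program.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.util.Bytes;

public class LineCountReader {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.8.71:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.8.71");
        HTable table = new HTable(conf, "linecount");
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("count"), Bytes.toBytes("sum"));
        ResultScanner scanner = table.getScanner(scan);
        for (Result r : scanner) {
            String rowkey = Bytes.toString(r.getRow());   // "begin__end" interval
            String sum = Bytes.toString(r.getValue(Bytes.toBytes("count"), Bytes.toBytes("sum")));
            System.out.println(rowkey + " -> " + sum);
        }
        scanner.close();
        table.close();
    }
}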

Part 2: Compute the TopN of the data produced to a Kafka topic within each 10-second interval and store the result in HBase

Create the corresponding Top3 table in HBase:
create 'KafkaTop','TopN'


Java code:

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import kafka.serializer.StringDecoder;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import scala.Tuple2;

/**
 * @author huiqiang
 * 2017-7-28 11:24
 */
public class KafkaSparkTopN {
    private static String beginTime = null;
    private static String hbasetable = "KafkaTop";  // target HBase table; must exist before the job starts
    private static int cishu = 0;                   // number of completed batches
    private static int interval = 10;               // how often to compute, in seconds (10 here)
    private static int n = 0;
    private static String rowkey = null;
    public static int K = 3;                        // set K to whatever TopN you want

    // A TreeMap holds the running result. TreeMap sorts keys in ascending order,
    // so a custom Comparator is supplied to get descending order.
    public static TreeMap<Integer, String> treeMap = new TreeMap<Integer, String>(new Comparator<Integer>() {
        @Override
        public int compare(Integer x, Integer y) {
            return y.compareTo(x);
        }
    });

    // Connect to HBase
    public static Configuration getConfiguration() {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.rootdir", "hdfs://192.168.8.71:9000/hbase");
        conf.set("hbase.zookeeper.quorum", "192.168.8.71");
        return conf;
    }

    public static void insert2(String tableName, String rowKey, String family, String quailifer, String value) {
        try {
            HTable table1 = new HTable(getConfiguration(), tableName);
            Put put = new Put(rowKey.getBytes());
            put.add(family.getBytes(), quailifer.getBytes(), value.getBytes());
            table1.put(put);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void insert3(String tableName, String rowKey, String family) {
        try {
            HTable table1 = new HTable(getConfiguration(), tableName);
            Put put = new Put(rowKey.getBytes());
            for (int i = 1; i <= K; i++) {
                put.add(family.getBytes(), ("Top" + i).getBytes(), "null".getBytes());
            }
            table1.put(put);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("wordcount").setMaster("local[2]");
//      JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(10000));    // milliseconds
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(interval)); // seconds
        // First build the Kafka parameter map
        Map<String, String> kafkaParams = new HashMap<String, String>();
        // The direct stream does not go through ZooKeeper, so we pass broker.list
        kafkaParams.put("metadata.broker.list", "192.168.8.71:9092,192.168.8.72:9092,192.168.8.73:9092");
        // Then create a set with the topics to read; the direct stream can read several topics in parallel
        Set<String> topics = new HashSet<String>();
        topics.add("test");
        JavaPairInputDStream<String, String> lines = KafkaUtils.createDirectStream(
            jssc,
            String.class,        // key type
            String.class,        // value type
            StringDecoder.class, // key decoder
            StringDecoder.class, // value decoder
            kafkaParams,
            topics);
        // The very first interval is not necessarily 10 seconds long; it can be shorter
        SimpleDateFormat time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        java.util.Date date = new java.util.Date();
        System.out.println("StreamingContext started->" + time.format(new Date()));
        beginTime = time.format(date);

        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<Tuple2<String, String>, String>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Iterable<String> call(Tuple2<String, String> tuple) throws Exception {
                return Arrays.asList(tuple._2.split(" "));  // split on spaces
            }
        });

        JavaPairDStream<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<String, Integer>(word, 1);
            }
        });

        JavaPairDStream<String, Integer> wordcounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        wordcounts.print();
        wordcounts.foreachRDD(new VoidFunction<JavaPairRDD<String, Integer>>() {
            private static final long serialVersionUID = 1L;
            @Override
            public void call(JavaPairRDD<String, Integer> wordcountsRDD) throws Exception {
                n = 0;
                treeMap.clear();
                SimpleDateFormat time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                java.util.Date date = new java.util.Date();
                System.out.println("endTime1-->" + time.format(new Date()));  // yyyy-MM-dd HH:mm:ss format
                final long endTime1 = System.currentTimeMillis();
                System.out.println("endTime1-->" + endTime1);  // epoch millis
                final String endTime = time.format(date);
                cishu++;
                System.out.println("cishu-->" + cishu);
                if (cishu == 1) {
                    rowkey = beginTime + "__" + endTime;
                    insert3(hbasetable, rowkey, "TopN");
                } else {
                    SimpleDateFormat hh1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    Date date1 = hh1.parse(endTime);
                    long hb = date1.getTime();
                    long a2 = hb - interval * 1000;
                    SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                    Date date2 = new Date(a2);
                    String beginTime1 = simpleDateFormat.format(date2);
                    rowkey = beginTime1 + "__" + endTime;
                    insert3(hbasetable, rowkey, "TopN");
                }
                // foreachPartition appears to run once per Kafka partition of the topic:
                // with two partitions it is executed twice
                wordcountsRDD.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Integer>>>() {
                    private static final long serialVersionUID = 1L;
                    @Override
                    public void call(Iterator<Tuple2<String, Integer>> wordcounts) throws Exception {
                        Tuple2<String, Integer> wordcount = null;
                        while (wordcounts.hasNext()) {
                            n++;
                            wordcount = wordcounts.next();
                            // words with the same count share one TreeMap entry, comma-separated
                            if (treeMap.containsKey(wordcount._2)) {
                                String value = treeMap.get(wordcount._2) + "," + wordcount._1;
                                treeMap.remove(wordcount._2);
                                treeMap.put(wordcount._2, value);
                            } else {
                                treeMap.put(wordcount._2, wordcount._1);
                            }
                            // keep only the K largest counts
                            if (treeMap.size() > K) {
                                treeMap.remove(treeMap.lastKey());
                            }
                        }
                    }
                });
                if (n != 0) {
                    int y = 0;
                    for (int num : treeMap.keySet()) {
                        y++;
                        // Note: this relies on HBase overwriting the previous value when the same
                        // rowkey and column are written again, so the 'KafkaTop' table must keep
                        // only 1 version (the default if you do not change it at table creation).
                        insert2(hbasetable, rowkey, "TopN", "Top" + y, treeMap.get(num) + " " + num);
                    }
                }
            }
        });
        jssc.start();
        jssc.awaitTermination();
        jssc.close();
    }
}

After running the code in MyEclipse, type some data into the Kafka producer terminal:
hello world
hello hadoop
hello hive
hello hadoop
hello world
hello world
hbase hive


The MyEclipse console will print:

-------------------------------------------
Time: 1501214340000 ms
-------------------------------------------
(hive,2)
(hello,6)
(world,3)
(hadoop,2)
(hbase,1)
endTime1-->2017-07-28 11:59:00
endTime1-->1501214340455
cishu-->1
...... (omitted)
-------------------------------------------
Time: 1501214350000 ms
-------------------------------------------
endTime1-->2017-07-28 11:59:10
endTime1-->1501214350090
cishu-->2

Check the HBase table:
hbase(main):018:0> scan 'KafkaTop'
ROW                                          COLUMN+CELL
 2017-07-28 11:58:55__2017-07-28 11:59:00    column=TopN:Top1, timestamp=1501101768643, value=hello 6
 2017-07-28 11:58:55__2017-07-28 11:59:00    column=TopN:Top2, timestamp=1501101768661, value=world 3
 2017-07-28 11:58:55__2017-07-28 11:59:00    column=TopN:Top3, timestamp=1501101768679, value=hadoop,hive 2
 2017-07-28 11:59:00__2017-07-28 11:59:10    column=TopN:Top1, timestamp=1501101770921, value=null
 2017-07-28 11:59:00__2017-07-28 11:59:10    column=TopN:Top2, timestamp=1501101770921, value=null
 2017-07-28 11:59:00__2017-07-28 11:59:10    column=TopN:Top3, timestamp=1501101770921, value=null
2 row(s) in 0.3140 seconds

Part 3: The following example is not Spark Streaming; it is an example found online, essentially an offline TopN analysis, included for reference only
Source: http://blog.csdn.net/accptanggang/article/details/52924970
The source data hui.txt below is stored in the spark folder on my Windows desktop; the goal is to extract the 3 largest numbers:
2
4
1
6
8
10
34
89


Java code:

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class SparkTop {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Top3").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        //JavaRDD<String> lines = sc.textFile("hdfs://tgmaster:9000/in/nums2");
        JavaRDD<String> lines = sc.textFile("C:\\Users\\huiqiang\\Desktop\\spark\\hui.txt");

        // Map each line into a (number, number) key-value pair.
        JavaPairRDD<Integer, Integer> mapToPairRDD = lines.mapToPair(new PairFunction<String, Integer, Integer>() {
            private static final long serialVersionUID = 1L;
            public Tuple2<Integer, Integer> call(String num) throws Exception {
                int numObj = Integer.parseInt(num);
                Tuple2<Integer, Integer> tuple2 = new Tuple2<Integer, Integer>(numObj, numObj);
                return tuple2;
            }
        });
        /**
         * 1. Sort by key in descending order with sortByKey().
         * 2. After sorting, use map() to extract the sorted numbers.
         */
        JavaRDD<Integer> resultRDD = mapToPairRDD.sortByKey(false).map(new Function<Tuple2<Integer, Integer>, Integer>() {
            private static final long serialVersionUID = 1L;

            public Integer call(Tuple2<Integer, Integer> v1) throws Exception {
                return v1._1;
            }
        });
        // take() returns the first 3 numbers after sorting
        List<Integer> nums = resultRDD.take(3);
        for (Integer num : nums) {
            System.out.println(num);
        }
        sc.close();
    }
}

Running it in MyEclipse prints:
89
34
10
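
As a side note, the same Top 3 can be obtained without a full sortByKey by using JavaRDD.top(), which returns the N largest elements by natural ordering. This is an alternative sketch rather than the original example; the class name is made up and the input path is the same local file as above.

import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class SparkTop3Alternative {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Top3Alternative").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("C:\\Users\\huiqiang\\Desktop\\spark\\hui.txt");
        // Parse each line into an Integer, then let top() pick the 3 largest values;
        // top() uses natural ordering, so no explicit global sort is required.
        JavaRDD<Integer> nums = lines.map(new Function<String, Integer>() {
            private static final long serialVersionUID = 1L;
            public Integer call(String line) throws Exception {
                return Integer.parseInt(line.trim());
            }
        });
        List<Integer> top3 = nums.top(3);   // 89, 34, 10 for the sample data
        for (Integer num : top3) {
            System.out.println(num);
        }
        sc.close();
    }
}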