MapReduce TopK统计加排序

  • Hadoop技术内幕中指出Top K算法有两步,一是统计词频,二是找出词频最高的前K个词。在网上找了很多MapReduce的Top K案例,这些案例都只有排序功能,所以自己写了个案例。

    这个案例分两个步骤,第一个就是wordCount案例,第二个就是排序功能。

    一,统计词频

    package TopK;

    import java.io.IOException;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    /**
     * Step 1 of the Top-K pipeline: count word frequencies (classic word count).
     *
     * @author zx
     */
    public class WordCount {

        /**
         * Tokenizes each input line and emits (word, 1) for every token.
         *
         * @author zx
         */
        public static class Map extends Mapper<Object, Text, Text, IntWritable> {

            // Reused across map() calls to avoid per-token allocations.
            private final IntWritable one = new IntWritable(1);
            private final Text word = new Text();

            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {
                StringTokenizer st = new StringTokenizer(value.toString());
                while (st.hasMoreTokens()) {
                    // Strip quote and period characters so that e.g. "word." and
                    // word are counted together.
                    // FIX: the original listing dropped the second argument of each
                    // replace call (replaceAll("\"""")...) and did not compile.
                    String token = st.nextToken()
                            .replace("\"", "")
                            .replace("'", "")
                            .replace(".", "");
                    // Skip tokens that became empty after stripping (e.g. a bare ".").
                    if (!token.isEmpty()) {
                        word.set(token);
                        context.write(word, one);
                    }
                }
            }

        }

        /**
         * Sums the counts emitted for each word.
         *
         * @author zx
         */
        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

            @Override
            protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                    throws IOException, InterruptedException {
                // FIX: sum the actual values instead of counting elements (count++).
                // Counting elements is only correct when every value is exactly 1 and
                // silently breaks if a combiner pre-aggregates (word, n) pairs.
                int sum = 0;
                for (IntWritable v : values) {
                    sum += v.get();
                }
                context.write(key, new IntWritable(sum));
            }

        }

        /**
         * Configures and runs the word-count job, blocking until it finishes.
         *
         * @param in  HDFS input path
         * @param out HDFS output directory (must not already exist)
         * @return true if the job completed successfully
         * @throws IOException            on HDFS/job-submission failure
         * @throws ClassNotFoundException if a job class cannot be loaded
         * @throws InterruptedException   if the wait is interrupted
         */
        public static boolean run(String in, String out)
                throws IOException, ClassNotFoundException, InterruptedException {

            Configuration conf = new Configuration();

            // Job.getInstance replaces the deprecated new Job(conf, name) constructor.
            Job job = Job.getInstance(conf, "WordCount");
            job.setJarByClass(WordCount.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            // Summing is associative and commutative, so the reducer can double as a
            // combiner to cut shuffle volume (safe now that reduce() sums values).
            job.setCombinerClass(Reduce.class);

            // Map output types.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Reduce output types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // Input and output paths.
            FileInputFormat.addInputPath(job, new Path(in));
            FileOutputFormat.setOutputPath(job, new Path(out));

            return job.waitForCompletion(true);
        }

    }

    二,排序 并求出频率最高的前K个词

    package TopK;

    import java.io.IOException;
    import java.util.Comparator;
    import java.util.Map.Entry;
    import java.util.Set;
    import java.util.StringTokenizer;
    import java.util.TreeMap;
    import java.util.regex.Pattern;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

    /**
     * Step 2 of the Top-K pipeline: sort words by frequency and emit the K most
     * frequent ones to a side output via MultipleOutputs.
     *
     * @author zx
     */
    public class Sort {

        /**
         * Parses each "word count" line of the word-count output and emits
         * (count, word) so the shuffle sorts by frequency.
         *
         * @author zx
         */
        public static class Map extends Mapper<Object, Text, IntWritable, Text> {

            // Compiled once; Pattern.matches(...) in the loop would recompile the
            // regex for every token.
            private static final Pattern DIGITS = Pattern.compile("\\d+");

            // Output key: the word's frequency.
            private final IntWritable outKey = new IntWritable();
            private final Text outValue = new Text();

            @Override
            protected void map(Object key, Text value, Context context)
                    throws IOException, InterruptedException {

                // Each line is expected to hold exactly one word and one count.
                // NOTE(review): a word consisting only of digits would be
                // misclassified as the count here — confirm acceptable for the input.
                StringTokenizer st = new StringTokenizer(value.toString());
                while (st.hasMoreTokens()) {
                    String element = st.nextToken();
                    if (DIGITS.matcher(element).matches()) {
                        outKey.set(Integer.parseInt(element));
                    } else {
                        // FIX: the original listing was missing the '}' before this
                        // else and did not compile.
                        outValue.set(element);
                    }
                }

                context.write(outKey, outValue);
            }

        }

        /**
         * Writes the full frequency-sorted word list and keeps a running top-K,
         * emitted to the "topKMOS" named output in cleanup().
         *
         * @author zx
         */
        public static class Reduce extends
                Reducer<IntWritable, Text, Text, IntWritable> {

            // Number of highest-frequency words to keep.
            private static final int K = 10;

            // TreeMap keeps its keys sorted; the reversed comparator puts the
            // largest count first so lastKey() is always the smallest kept count.
            // FIX: instance field instead of static — static reducer state leaks
            // between task attempts when the JVM is reused.
            private final TreeMap<MyInt, String> topWords =
                    new TreeMap<MyInt, String>(new Comparator<MyInt>() {
                        /**
                         * Natural order is ascending; reverse it so iteration goes
                         * from the highest count down.
                         */
                        @Override
                        public int compare(MyInt o1, MyInt o2) {
                            return o2.compareTo(o1);
                        }
                    });

            /*
             * Keys arrive sorted by frequency, which produces the fully sorted
             * (word, count) output for free.
             */
            @Override
            protected void reduce(IntWritable key, Iterable<Text> values,
                    Context context) throws IOException, InterruptedException {
                for (Text text : values) {
                    context.write(text, key);
                    // NOTE(review): words with equal counts map to "equal" MyInt keys
                    // and overwrite each other, so only one word per distinct count
                    // survives into the top-K — confirm this is intended.
                    topWords.put(new MyInt(key.get()), text.toString());

                    // Descending order: the last entry is the smallest kept count.
                    if (topWords.size() > K) {
                        topWords.remove(topWords.lastKey());
                    }
                }
            }

            @Override
            protected void cleanup(Context context)
                    throws IOException, InterruptedException {
                // Side-output directory configured by run() under key "topKout".
                String path = context.getConfiguration().get("topKout");
                MultipleOutputs<Text, IntWritable> mos =
                        new MultipleOutputs<Text, IntWritable>(context);
                try {
                    Set<Entry<MyInt, String>> entries = topWords.entrySet();
                    for (Entry<MyInt, String> entry : entries) {
                        // FIX: the original listing dropped the comma after "topKMOS"
                        // and did not compile.
                        mos.write("topKMOS", new Text(entry.getValue()),
                                new IntWritable(entry.getKey().getValue()), path);
                    }
                } finally {
                    // Always release the side output, even if a write fails.
                    mos.close();
                }
            }

        }

        /**
         * Configures and runs the sort job.
         *
         * @param in      word-count output directory (input of this job)
         * @param out     directory for the fully sorted output
         * @param topKout directory for the top-K side output
         * @throws IOException            on HDFS/job-submission failure
         * @throws ClassNotFoundException if a job class cannot be loaded
         * @throws InterruptedException   if the wait is interrupted
         */
        public static void run(String in, String out, String topKout)
                throws IOException, ClassNotFoundException, InterruptedException {

            Path outPath = new Path(out);

            Configuration conf = new Configuration();

            // Directory the reducer's cleanup() writes the top-K words to.
            conf.set("topKout", topKout);

            // Job.getInstance replaces the deprecated new Job(conf, name) constructor.
            Job job = Job.getInstance(conf, "Sort");
            job.setJarByClass(Sort.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);

            // Map output types.
            job.setMapOutputKeyClass(IntWritable.class);
            job.setMapOutputValueClass(Text.class);

            // Reduce output types.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // Named output used for the top-K list.
            // FIX: the value class must match what the reducer writes —
            // IntWritable, not Text.
            MultipleOutputs.addNamedOutput(job, "topKMOS", TextOutputFormat.class,
                    Text.class, IntWritable.class);

            // Input and output paths.
            FileInputFormat.addInputPath(job, new Path(in));
            FileOutputFormat.setOutputPath(job, outPath);
            job.waitForCompletion(true);

        }

    }

    自己封装的Int

    package TopK;

    /**
     * Mutable int wrapper used as a TreeMap key in {@link Sort.Reduce} so that
     * frequencies can be ordered by a custom Comparator.
     *
     * @author zx
     */
    public class MyInt implements Comparable<MyInt> {

        private Integer value;

        /**
         * @param value the wrapped frequency (must not be null)
         */
        public MyInt(Integer value) {
            this.value = value;
        }

        public int getValue() {
            return value;
        }

        public void setValue(int value) {
            this.value = value;
        }

        @Override
        public int compareTo(MyInt o) {
            return value.compareTo(o.getValue());
        }

        // FIX: equals/hashCode added so compareTo is consistent with equals, as the
        // Comparable contract recommends (the original relied on identity equality).
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (!(obj instanceof MyInt)) {
                return false;
            }
            return value.intValue() == ((MyInt) obj).getValue();
        }

        @Override
        public int hashCode() {
            return value.hashCode();
        }

        @Override
        public String toString() {
            return value.toString();
        }

    }

    运行入口

    package TopK;

    import java.io.IOException;

    /**
     * Driver: runs the word-count job, then — if it succeeds — the sort/top-K job.
     *
     * @author zx
     */
    public class TopK {

        public static void main(String[] args)
                throws ClassNotFoundException, IOException, InterruptedException {

            // Text to count and sort.
            String in = "hdfs://localhost:9000/input/MaDing.text";

            // Word-count output directory.
            // FIX: the original listing garbled this identifier with an embedded
            // HTML anchor tag ("String <a ...>word</a>Cout") and did not compile.
            // The HDFS path itself is kept byte-for-byte.
            String wordCount = "hdfs://localhost:9000/out/wordCout";

            // Output of the frequency sort over the counts.
            String sort = "hdfs://localhost:9000/out/sort";

            // Top-K side output.
            String topK = "hdfs://localhost:9000/out/topK";

            // Start sorting only after the counting job completes successfully.
            if (WordCount.run(in, wordCount)) {
                Sort.run(wordCount, sort, topK);
            }

        }

    }
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值