##运行wordcount单词频率统计程序,并指定输入输出路径。
[hadoop@hadoop-master hadoop-1.2.1]$ hadoop jar hadoop-examples-1.2.1.jar wordcount /wc/mininput /wc/minoutput
##告知输入路径下有几个文件需要处理
16/03/14 05:08:59 INFO input.FileInputFormat: Total input paths to process : 2
##加载本地的Hadoop库文件,默认情况下,在hadoop1.x中存放在$HADOOP_HOME/c++/Linux-amd64-64/lib(针对64bitLinux操作系统)
16/03/14 05:08:59 INFO util.NativeCodeLoader: Loaded the native-hadoop library
##告知本地的snappy压缩算法的库未加载,默认情况下,Linux是没有相应的库的
16/03/14 05:08:59 WARN snappy.LoadSnappy: Snappy native library not loaded
##运行Job的相关进度信息
###加载本地的snappy压缩算法的库,默认情况下,Linux是没有相应的库的,当没有配置时不显示
###运行Job的ID
16/03/14 05:08:59 INFO mapred.JobClient: Running job: job_201603140438_0001
###Job运行时,Map task 和 Reduce task的运行进度
16/03/14 05:09:00 INFO mapred.JobClient: map 0% reduce 0%
16/03/14 05:09:12 INFO mapred.JobClient: map 100% reduce 0%
16/03/14 05:09:21 INFO mapred.JobClient: map 100% reduce 33%
16/03/14 05:09:23 INFO mapred.JobClient: map 100% reduce 100%
###Job运行完成
16/03/14 05:09:24 INFO mapred.JobClient: Job complete: job_201603140438_0001
##显示整个Job运行过程中,各类计数器Counter的值
##总共有29种计数器Counter
16/03/14 05:09:24 INFO mapred.JobClient: Counters: 29
16/03/14 05:09:24 INFO mapred.JobClient: Map-Reduce Framework ##计数器个数:16
16/03/14 05:09:24 INFO mapred.JobClient: Spilled Records=66
16/03/14 05:09:24 INFO mapred.JobClient: Map output materialized bytes=440
16/03/14 05:09:24 INFO mapred.JobClient: Reduce input records=33
16/03/14 05:09:24 INFO mapred.JobClient: Virtual memory (bytes) snapshot=5798617088
16/03/14 05:09:24 INFO mapred.JobClient: Map input records=18
16/03/14 05:09:24 INFO mapred.JobClient: SPLIT_RAW_BYTES=244
16/03/14 05:09:24 INFO mapred.JobClient: Map output bytes=395
16/03/14 05:09:24 INFO mapred.JobClient: Reduce shuffle bytes=440
16/03/14 05:09:24 INFO mapred.JobClient: Physical memory (bytes) snapshot=411492352
16/03/14 05:09:24 INFO mapred.JobClient: Reduce input groups=33
16/03/14 05:09:24 INFO mapred.JobClient: Combine output records=33
16/03/14 05:09:24 INFO mapred.JobClient: Reduce output records=33
16/03/14 05:09:24 INFO mapred.JobClient: Map output records=37
16/03/14 05:09:24 INFO mapred.JobClient: Combine input records=37
16/03/14 05:09:24 INFO mapred.JobClient: CPU time spent (ms)=2100
16/03/14 05:09:24 INFO mapred.JobClient: Total committed heap usage (bytes)=337780736
16/03/14 05:09:24 INFO mapred.JobClient: File Input Format Counters ##计数器个数:1
16/03/14 05:09:24 INFO mapred.JobClient: Bytes Read=261
16/03/14 05:09:24 INFO mapred.JobClient: FileSystemCounters ##计数器个数:4
16/03/14 05:09:24 INFO mapred.JobClient: HDFS_BYTES_READ=505
16/03/14 05:09:24 INFO mapred.JobClient: FILE_BYTES_WRITTEN=164155
16/03/14 05:09:24 INFO mapred.JobClient: FILE_BYTES_READ=434
16/03/14 05:09:24 INFO mapred.JobClient: HDFS_BYTES_WRITTEN=296
16/03/14 05:09:24 INFO mapred.JobClient: Job Counters ##计数器个数:7
16/03/14 05:09:24 INFO mapred.JobClient: Launched map tasks=2
16/03/14 05:09:24 INFO mapred.JobClient: Launched reduce tasks=1
16/03/14 05:09:24 INFO mapred.JobClient: SLOTS_MILLIS_REDUCES=10635
16/03/14 05:09:24 INFO mapred.JobClient: Total time spent by all reduces waiting after reserving slots (ms)=0
16/03/14 05:09:24 INFO mapred.JobClient: SLOTS_MILLIS_MAPS=18759
16/03/14 05:09:24 INFO mapred.JobClient: Total time spent by all maps waiting after reserving slots (ms)=0
16/03/14 05:09:24 INFO mapred.JobClient: Data-local map tasks=2
16/03/14 05:09:24 INFO mapred.JobClient: File Output Format Counters ##计数器个数:1
16/03/14 05:09:24 INFO mapred.JobClient: Bytes Written=296
package org.dragon.hadoop.mapreduce.counter;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Word-count MapReduce job demonstrating user-defined counters.
 *
 * <p>Both the mapper and the reducer increment custom counters in the
 * group {@code "DIYCOUTER_MAP_REDUCE"} (name kept as-is — it is a
 * runtime-visible counter identifier, so the spelling is preserved for
 * compatibility with any tooling that reads it) to track the number of
 * input/output key-value pairs they process.
 *
 * @author ZhuXY
 * @time 2016-3-14 上午10:48:29
 */
public class DIYCounter {

    /**
     * Mapper: splits each input line into whitespace-delimited tokens and
     * emits {@code (token, 1)} for every token. The map method is invoked
     * once per input line.
     */
    static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused output key; avoids allocating a new Text per token.
        private Text word = new Text();
        // Constant value 1 emitted for every token occurrence.
        private final static IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Custom counter: number of (offset, line) pairs the mapper saw.
            context.getCounter("DIYCOUTER_MAP_REDUCE", "MAP_INPUT_KEYVALUES")
                    .increment(1L);

            // Current input line as a String.
            String lineValue = value.toString();

            // Split the line on whitespace.
            StringTokenizer stringTokenizer = new StringTokenizer(lineValue);

            // Emit (token, 1) for every token; repeated tokens on one line
            // produce multiple pairs with the same key.
            while (stringTokenizer.hasMoreTokens()) {
                word.set(stringTokenizer.nextToken());
                context.write(word, one);
            }
        }
    }

    /**
     * Reducer: sums the occurrence counts for each word and writes
     * {@code (word, totalCount)}.
     */
    static class MyReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {

        // Reused output value; avoids allocating a new IntWritable per key.
        private IntWritable resultIntWritable = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {

            // Custom counter: number of distinct keys the reducer received.
            context.getCounter("DIYCOUTER_MAP_REDUCE", "REDUCE_INPUT_KEYVALUES")
                    .increment(1L);

            // Accumulate the total occurrence count for this word.
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }

            resultIntWritable.set(sum);

            // Custom counter: number of (word, count) pairs written out.
            context.getCounter("DIYCOUTER_MAP_REDUCE", "REDUCE_OUTPUT_KEYVALUES")
                    .increment(1L);
            context.write(key, resultIntWritable);
        }
    }

    /**
     * Job driver. Accepts {@code <in> <out>} paths on the command line;
     * when no arguments are supplied, falls back to the hard-coded HDFS
     * paths used during development.
     *
     * @param args input path and output path (optional; defaults applied
     *             when empty)
     * @throws Exception if job configuration or submission fails
     */
    public static void main(String[] args) throws Exception {

        // Load Hadoop configuration (core-site.xml etc. from the classpath).
        Configuration configuration = new Configuration();

        // BUGFIX: previously the hard-coded paths unconditionally overwrote
        // any command-line arguments, defeating GenericOptionsParser and the
        // usage check below. They are now only a fallback for zero-arg runs.
        if (args.length == 0) {
            args = new String[] {
                    "hdfs://hadoop-master.dragon.org:9000/wc/mininput",
                    "hdfs://hadoop-master.dragon.org:9000/wc/wcoutput" };
        }

        // Strip generic Hadoop options (-D, -fs, ...) and validate the rest,
        // printing a shell-style usage hint on bad input.
        String[] otherArgs = new GenericOptionsParser(configuration, args)
                .getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        // Create the job with its configuration and display name.
        Job job = new Job(configuration, "wc");

        // 1. Jar containing the job classes (located via this class).
        job.setJarByClass(DIYCounter.class);

        // 2. Mapper and Reducer implementations.
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // 3. Input and output directories (output must not exist yet).
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        // 4. Types of the final output key/value pairs.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 5. Submit and block until completion, echoing progress to stdout.
        boolean isSuccess = job.waitForCompletion(true);

        // 6. Exit status: 0 on success, 1 on failure.
        System.exit(isSuccess ? 0 : 1);
    }
}