package com.mapReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import java.io.IOException;
/**
 * Introductory MapReduce example: word count.
 *
 * <p>Reads every text file under the HDFS directory {@code /input}, splits each line into
 * whitespace-separated words, and writes one {@code word<TAB>count} line per distinct word
 * under {@code /output}.
 */
public class WordCountMR {

    /**
     * Entry point of the MapReduce program: configures the job, submits it and waits for it
     * to finish. The JVM exits with status 0 when the job succeeds and 1 otherwise.
     *
     * @param args command-line arguments (currently unused)
     * @throws IOException            if HDFS cannot be reached or the job cannot be submitted
     * @throws ClassNotFoundException if a class configured on the job cannot be loaded
     * @throws InterruptedException   if the thread is interrupted while waiting for the job
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // HDFS-related settings.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadMaster01:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        // The Job object ties together every component this MapReduce program needs.
        Job job = Job.getInstance(conf);

        // Locate the jar to ship by the class it contains.
        job.setJarByClass(WordCountMR.class);

        // Mapper and reducer implementations.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // The mapper's input key/value types are dictated by the InputFormat; only its output
        // types are declared here. When they equal the reducer's output types (as they do in
        // this job) these two calls are optional.
        job.setMapOutputKeyClass(Text.class);           // Text        ~ String
        job.setMapOutputValueClass(IntWritable.class);  // IntWritable ~ Integer

        // The reducer's input types are the mapper's output types, so only the reducer's
        // output types need to be declared.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input/output format components. These are the framework defaults and are set
        // explicitly only for illustration.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Partitioner that routes map output to reduce tasks; HashPartitioner is the default.
        job.setPartitionerClass(HashPartitioner.class);

        // Number of reduce tasks (and therefore of output part files).
        job.setNumReduceTasks(3);

        // Input and output locations of the job.
        Path inputPath = new Path("/input");
        Path outputPath = new Path("/output");

        // Delete a pre-existing output directory so the job can be re-run. Convenient, but
        // use with care: the delete is recursive and unconditional.
        // NOTE(review): the FileSystem is intentionally not closed here; FileSystem.get is
        // understood to return a shared, cached instance - confirm before adding a close().
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit the job and block until it completes; 'true' prints progress to the console.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }

    /**
     * Mapper that emits {@code <word, 1>} for every word of every input line.
     *
     * <p>Type parameters of {@code Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>}:
     * <ul>
     *   <li>KEYIN: with the default {@code TextInputFormat} the key is the byte offset at
     *       which the line starts, hence {@code LongWritable}.</li>
     *   <li>VALUEIN: with the default {@code TextInputFormat} the value is the content of
     *       one line, hence {@code Text}.</li>
     *   <li>KEYOUT: chosen by the business logic; here it is the word, hence {@code Text}.</li>
     *   <li>VALUEOUT: chosen by the business logic; here it is a count, hence
     *       {@code IntWritable}.</li>
     * </ul>
     *
     * <p>Hadoop uses its own serialization framework instead of the JDK types, so any data
     * that is written to disk or sent over the network must use a Writable type:
     * {@code Long -> LongWritable}, {@code String -> Text}, {@code Integer -> IntWritable},
     * {@code null -> NullWritable}.
     */
    static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        /** Constant count emitted with every word. */
        private static final IntWritable ONE = new IntWritable(1);

        /** Reused output key; avoids allocating a new Text per emitted word. */
        private final Text word = new Text();

        /**
         * Splits one line into words and emits {@code <word, 1>} for each of them.
         *
         * @param key     byte offset of the line within the input file (unused)
         * @param value   one line of text read by the input format
         * @param context sink for the emitted key/value pairs
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split on runs of whitespace. Splitting on "" would yield single characters
            // and turn the job into a character count.
            for (String token : value.toString().split("\\s+")) {
                // A blank line or leading whitespace produces one empty token; skip it.
                if (token.isEmpty()) {
                    continue;
                }
                word.set(token);
                context.write(word, ONE);
            }
        }
    }

    /**
     * Reducer that sums the counts emitted for each word.
     *
     * <p>Each reduce task receives the part of the map output assigned to it by the
     * partitioner (for {@code HashPartitioner}:
     * {@code (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks}), so the reducer's input
     * types must equal the mapper's output types. The framework groups the received pairs by
     * key and calls {@code reduce} once per group, passing the key and an iterable over all
     * values of that group.
     */
    static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        /** Reused output value; avoids allocating a new IntWritable per key. */
        private final IntWritable total = new IntWritable();

        /**
         * Sums all counts of one word and emits {@code <word, sum>}.
         *
         * @param key     the word, as emitted by the mappers
         * @param values  every count emitted for this word
         * @param context sink for the emitted key/value pair
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable count : values) {
                sum += count.get();
            }
            total.set(sum);
            context.write(key, total);
        }
    }
}
程序执行结果:
D:\developTool\jdk\jdk8\bin\java "-javaagent:D:\developTool\idea2017\IntelliJ IDEA 2017.3\lib\idea_rt.jar=53284:D:\developTool\idea2017\IntelliJ IDEA 2017.3\bin" -Dfile.encoding=UTF-8 -classpath F:\downloads\demo-provider\hadoop_example\target\classes;D:\developTool\jdk\jdk8\jre\lib\charsets.jar;D:\developTool\jdk\jdk8\jre\lib\deploy.jar;D:\developTool\jdk\jdk8\jre\lib\ext\access-bridge-64.jar;D:\developTool\jdk\jdk8\jre\lib\ext\cldrdata.jar;D:\developTool\jdk\jdk8\jre\lib\ext\dnsns.jar;D:\developTool\jdk\jdk8\jre\lib\ext\jaccess.jar;D:\developTool\jdk\jdk8\jre\lib\ext\jfxrt.jar;D:\developTool\jdk\jdk8\jre\lib\ext\localedata.jar;D:\developTool\jdk\jdk8\jre\lib\ext\nashorn.jar;D:\developTool\jdk\jdk8\jre\lib\ext\sunec.jar;D:\developTool\jdk\jdk8\jre\lib\ext\sunjce_provider.jar;D:\developTool\jdk\jdk8\jre\lib\ext\sunmscapi.jar;D:\developTool\jdk\jdk8\jre\lib\ext\sunpkcs11.jar;D:\developTool\jdk\jdk8\jre\lib\ext\zipfs.jar;D:\developTool\jdk\jdk8\jre\lib\javaws.jar;D:\developTool\jdk\jdk8\jre\lib\jce.jar;D:\developTool\jdk\jdk8\jre\lib\jfr.jar;D:\developTool\jdk\jdk8\jre\lib\jfxswt.jar;D:\developTool\jdk\jdk8\jre\lib\jsse.jar;D:\developTool\jdk\jdk8\jre\lib\management-agent.jar;D:\developTool\jdk\jdk8\jre\lib\plugin.jar;D:\developTool\jdk\jdk8\jre\lib\resources.jar;D:\developTool\jdk\jdk8\jre\lib\rt.jar;D:\developTool\hadoop-2.9.1\share\hadoop\httpfs;D:\developTool\hadoop-2.9.1\share\hadoop\common\hadoop-nfs-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\hadoop-common-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\hadoop-common-2.9.1-tests.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-nfs-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-rbf-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-2.9.1-tests.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-client-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoo
p-hdfs-rbf-2.9.1-tests.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-client-2.9.1-tests.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-native-client-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\hdfs\hadoop-hdfs-native-client-2.9.1-tests.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-examples-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-hs-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-app-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-core-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-common-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-shuffle-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-jobclient-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-hs-plugins-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\mapreduce\hadoop-mapreduce-client-jobclient-2.9.1-tests.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-ui-2.9.1.war;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-api-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-client-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-common-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-registry-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-tests-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-common-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-router-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-web-proxy-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-nodemanager-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-resourcemanager-2.9.1.jar;D:\developToo
l\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-sharedcachemanager-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-applications-distributedshell-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-timeline-pluginstorage-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-server-applicationhistoryservice-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\yarn\hadoop-yarn-applications-unmanaged-am-launcher-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\woodstox-core-5.0.3.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\slf4j-api-1.7.25.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\slf4j-log4j12-1.7.25.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\log4j-1.2.17.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\stax2-api-3.1.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\activation-1.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\apacheds-i18n-2.0.0-M15.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\apacheds-kerberos-codec-2.0.0-M15.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\api-asn1-api-1.0.0-M20.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\api-util-1.0.0-M20.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\asm-3.2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\avro-1.7.7.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-beanutils-1.7.0.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-beanutils-core-1.8.0.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-cli-1.2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-codec-1.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-collections-3.2.2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-compress-1.4.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-configuration-1.6.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-digester-1.8
.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-io-2.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-lang-2.6.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-lang3-3.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-logging-1.1.3.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-math3-3.1.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\commons-net-3.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\curator-client-2.7.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\curator-framework-2.7.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\curator-recipes-2.7.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\gson-2.2.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\guava-11.0.2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\hadoop-annotations-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\hadoop-auth-2.9.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\hamcrest-core-1.3.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\htrace-core-3.0.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\htrace-core4-4.1.0-incubating.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\httpclient-4.5.2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\httpcore-4.4.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jackson-core-asl-1.9.13.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jackson-jaxrs-1.9.13.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jackson-mapper-asl-1.9.13.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jackson-xc-1.9.13.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\java-xmlbuilder-0.4.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jaxb-api-2.2.2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jaxb-impl-2.2.3-1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jcip-annotations-1.0-1.jar;D:\developTool\hadoop-2.9.1\share\
hadoop\common\lib\jersey-core-1.9.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jersey-json-1.9.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jersey-server-1.9.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jets3t-0.9.0.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jettison-1.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jetty-6.1.26.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jetty-sslengine-6.1.26.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jetty-util-6.1.26.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jsch-0.1.54.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\json-smart-1.3.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jsp-api-2.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\jsr305-3.0.0.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\junit-4.11.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\mockito-all-1.8.5.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\netty-3.6.2.Final.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\nimbus-jose-jwt-4.41.1.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\paranamer-2.3.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\protobuf-java-2.5.0.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\servlet-api-2.5.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\snappy-java-1.0.5.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\stax-api-1.0-2.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\xmlenc-0.52.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\xz-1.0.jar;D:\developTool\hadoop-2.9.1\share\hadoop\common\lib\zookeeper-3.4.6.jar com.mapReduce.WordCountMR
20/07/13 17:06:47 INFO Configuration.deprecation: session.id is deprecated. Instead, use dfs.metrics.session-id
20/07/13 17:06:47 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId=
20/07/13 17:06:57 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
20/07/13 17:06:58 WARN mapreduce.JobResourceUploader: No job jar file set. User classes may not be found. See Job or Job#setJar(String).
20/07/13 17:07:00 INFO input.FileInputFormat: Total input files to process : 3
20/07/13 17:07:01 INFO mapreduce.JobSubmitter: number of splits:3
20/07/13 17:07:04 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_local508204586_0001
20/07/13 17:07:05 INFO mapreduce.Job: The url to track the job: http://localhost:8080/
20/07/13 17:07:05 INFO mapreduce.Job: Running job: job_local508204586_0001
20/07/13 17:07:05 INFO mapred.LocalJobRunner: OutputCommitter set in config null
20/07/13 17:07:05 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:05 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:05 INFO mapred.LocalJobRunner: OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
20/07/13 17:07:06 INFO mapreduce.Job: Job job_local508204586_0001 running in uber mode : false
20/07/13 17:07:06 INFO mapreduce.Job: map 0% reduce 0%
20/07/13 17:07:07 INFO mapred.LocalJobRunner: Waiting for map tasks
20/07/13 17:07:07 INFO mapred.LocalJobRunner: Starting task: attempt_local508204586_0001_m_000000_0
20/07/13 17:07:07 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:07 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:07 INFO util.ProcfsBasedProcessTree: ProcfsBasedProcessTree currently is supported only on Linux.
20/07/13 17:07:07 INFO mapred.Task: Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@114770b0
20/07/13 17:07:07 INFO mapred.MapTask: Processing split: hdfs://hadMaster01:9000/input/wc.txt:0+83
20/07/13 17:07:07 INFO mapred.MapTask: (EQUATOR) 0 kvi 26214396(104857584)
20/07/13 17:07:07 INFO mapred.MapTask: mapreduce.task.io.sort.mb: 100
20/07/13 17:07:07 INFO mapred.MapTask: soft limit at 83886080
20/07/13 17:07:07 INFO mapred.MapTask: bufstart = 0; bufvoid = 104857600
20/07/13 17:07:07 INFO mapred.MapTask: kvstart = 26214396; length = 6553600
20/07/13 17:07:07 INFO mapred.MapTask: Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
20/07/13 17:07:18 INFO mapred.LocalJobRunner:
20/07/13 17:07:18 INFO mapred.MapTask: Starting flush of map output
20/07/13 17:07:18 INFO mapred.MapTask: Spilling map output
20/07/13 17:07:18 INFO mapred.MapTask: bufstart = 0; bufend = 450; bufvoid = 104857600
20/07/13 17:07:18 INFO mapred.MapTask: kvstart = 26214396(104857584); kvend = 26214100(104856400); length = 297/6553600
20/07/13 17:07:19 INFO mapred.MapTask: Finished spill 0
20/07/13 17:07:19 INFO mapred.Task: Task:attempt_local508204586_0001_m_000000_0 is done. And is in the process of committing
20/07/13 17:07:19 INFO mapred.LocalJobRunner: map
20/07/13 17:07:19 INFO mapred.Task: Task 'attempt_local508204586_0001_m_000000_0' done.
20/07/13 17:07:19 INFO mapred.LocalJobRunner: Finishing task: attempt_local508204586_0001_m_000000_0
20/07/13 17:07:19 INFO mapred.LocalJobRunner: Starting task: attempt_local508204586_0001_m_000001_0
20/07/13 17:07:19 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:19 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:19 INFO util.ProcfsBasedProcessTree: ProcfsBasedProcessTree currently is supported only on Linux.
20/07/13 17:07:19 INFO mapred.Task: Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@656d8781
20/07/13 17:07:19 INFO mapred.MapTask: Processing split: hdfs://hadMaster01:9000/input/hello.txt:0+14
20/07/13 17:07:19 INFO mapred.MapTask: (EQUATOR) 0 kvi 26214396(104857584)
20/07/13 17:07:19 INFO mapred.MapTask: mapreduce.task.io.sort.mb: 100
20/07/13 17:07:19 INFO mapred.MapTask: soft limit at 83886080
20/07/13 17:07:19 INFO mapred.MapTask: bufstart = 0; bufvoid = 104857600
20/07/13 17:07:19 INFO mapred.MapTask: kvstart = 26214396; length = 6553600
20/07/13 17:07:19 INFO mapred.MapTask: Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
20/07/13 17:07:19 INFO mapred.LocalJobRunner:
20/07/13 17:07:19 INFO mapred.MapTask: Starting flush of map output
20/07/13 17:07:19 INFO mapred.MapTask: Spilling map output
20/07/13 17:07:19 INFO mapred.MapTask: bufstart = 0; bufend = 82; bufvoid = 104857600
20/07/13 17:07:19 INFO mapred.MapTask: kvstart = 26214396(104857584); kvend = 26214352(104857408); length = 45/6553600
20/07/13 17:07:19 INFO mapred.MapTask: Finished spill 0
20/07/13 17:07:19 INFO mapred.Task: Task:attempt_local508204586_0001_m_000001_0 is done. And is in the process of committing
20/07/13 17:07:19 INFO mapred.LocalJobRunner: map
20/07/13 17:07:19 INFO mapred.Task: Task 'attempt_local508204586_0001_m_000001_0' done.
20/07/13 17:07:19 INFO mapred.LocalJobRunner: Finishing task: attempt_local508204586_0001_m_000001_0
20/07/13 17:07:19 INFO mapred.LocalJobRunner: Starting task: attempt_local508204586_0001_m_000002_0
20/07/13 17:07:19 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:19 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:19 INFO util.ProcfsBasedProcessTree: ProcfsBasedProcessTree currently is supported only on Linux.
20/07/13 17:07:19 INFO mapred.Task: Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@75c726f0
20/07/13 17:07:19 INFO mapred.MapTask: Processing split: hdfs://hadMaster01:9000/input/hw.txt:0+14
20/07/13 17:07:19 INFO mapred.MapTask: (EQUATOR) 0 kvi 26214396(104857584)
20/07/13 17:07:19 INFO mapred.MapTask: mapreduce.task.io.sort.mb: 100
20/07/13 17:07:19 INFO mapred.MapTask: soft limit at 83886080
20/07/13 17:07:19 INFO mapred.MapTask: bufstart = 0; bufvoid = 104857600
20/07/13 17:07:19 INFO mapred.MapTask: kvstart = 26214396; length = 6553600
20/07/13 17:07:19 INFO mapred.MapTask: Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer
20/07/13 17:07:19 INFO mapred.LocalJobRunner:
20/07/13 17:07:19 INFO mapred.MapTask: Starting flush of map output
20/07/13 17:07:19 INFO mapred.MapTask: Spilling map output
20/07/13 17:07:19 INFO mapred.MapTask: bufstart = 0; bufend = 82; bufvoid = 104857600
20/07/13 17:07:19 INFO mapred.MapTask: kvstart = 26214396(104857584); kvend = 26214352(104857408); length = 45/6553600
20/07/13 17:07:19 INFO mapreduce.Job: map 67% reduce 0%
20/07/13 17:07:19 INFO mapred.MapTask: Finished spill 0
20/07/13 17:07:20 INFO mapred.Task: Task:attempt_local508204586_0001_m_000002_0 is done. And is in the process of committing
20/07/13 17:07:20 INFO mapred.LocalJobRunner: map
20/07/13 17:07:20 INFO mapred.Task: Task 'attempt_local508204586_0001_m_000002_0' done.
20/07/13 17:07:20 INFO mapred.LocalJobRunner: Finishing task: attempt_local508204586_0001_m_000002_0
20/07/13 17:07:20 INFO mapred.LocalJobRunner: map task executor complete.
20/07/13 17:07:20 INFO mapred.LocalJobRunner: Waiting for reduce tasks
20/07/13 17:07:20 INFO mapred.LocalJobRunner: Starting task: attempt_local508204586_0001_r_000000_0
20/07/13 17:07:20 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:20 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:20 INFO util.ProcfsBasedProcessTree: ProcfsBasedProcessTree currently is supported only on Linux.
20/07/13 17:07:20 INFO mapred.Task: Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@3b089a89
20/07/13 17:07:20 INFO mapred.ReduceTask: Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@1f1b7038
20/07/13 17:07:20 INFO reduce.MergeManagerImpl: MergerManager: memoryLimit=1311663744, maxSingleShuffleLimit=327915936, mergeThreshold=865698112, ioSortFactor=10, memToMemMergeOutputsThreshold=10
20/07/13 17:07:20 INFO reduce.EventFetcher: attempt_local508204586_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events
20/07/13 17:07:20 INFO reduce.LocalFetcher: localfetcher#1 about to shuffle output of map attempt_local508204586_0001_m_000000_0 decomp: 218 len: 222 to MEMORY
20/07/13 17:07:20 INFO reduce.InMemoryMapOutput: Read 218 bytes from map-output for attempt_local508204586_0001_m_000000_0
20/07/13 17:07:20 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 218, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->218
20/07/13 17:07:20 INFO mapreduce.Job: map 100% reduce 0%
20/07/13 17:07:21 INFO reduce.LocalFetcher: localfetcher#1 about to shuffle output of map attempt_local508204586_0001_m_000002_0 decomp: 10 len: 14 to MEMORY
20/07/13 17:07:21 INFO reduce.InMemoryMapOutput: Read 10 bytes from map-output for attempt_local508204586_0001_m_000002_0
20/07/13 17:07:21 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 10, inMemoryMapOutputs.size() -> 2, commitMemory -> 218, usedMemory ->228
20/07/13 17:07:21 INFO reduce.LocalFetcher: localfetcher#1 about to shuffle output of map attempt_local508204586_0001_m_000001_0 decomp: 10 len: 14 to MEMORY
20/07/13 17:07:21 INFO reduce.InMemoryMapOutput: Read 10 bytes from map-output for attempt_local508204586_0001_m_000001_0
20/07/13 17:07:21 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 10, inMemoryMapOutputs.size() -> 3, commitMemory -> 228, usedMemory ->238
20/07/13 17:07:21 INFO reduce.EventFetcher: EventFetcher is interrupted.. Returning
20/07/13 17:07:21 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:21 INFO reduce.MergeManagerImpl: finalMerge called with 3 in-memory map-outputs and 0 on-disk map-outputs
20/07/13 17:07:21 INFO mapred.Merger: Merging 3 sorted segments
20/07/13 17:07:21 INFO mapred.Merger: Down to the last merge-pass, with 3 segments left of total size: 226 bytes
20/07/13 17:07:21 INFO reduce.MergeManagerImpl: Merged 3 segments, 238 bytes to disk to satisfy reduce memory limit
20/07/13 17:07:21 INFO reduce.MergeManagerImpl: Merging 1 files, 238 bytes from disk
20/07/13 17:07:21 INFO reduce.MergeManagerImpl: Merging 0 segments, 0 bytes from memory into reduce
20/07/13 17:07:21 INFO mapred.Merger: Merging 1 sorted segments
20/07/13 17:07:21 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 230 bytes
20/07/13 17:07:21 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:22 INFO Configuration.deprecation: mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords
20/07/13 17:07:32 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:07:32 INFO mapreduce.Job: map 100% reduce 33%
20/07/13 17:07:39 INFO mapred.Task: Task:attempt_local508204586_0001_r_000000_0 is done. And is in the process of committing
20/07/13 17:07:39 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:07:39 INFO mapred.Task: Task attempt_local508204586_0001_r_000000_0 is allowed to commit now
20/07/13 17:07:39 INFO output.FileOutputCommitter: Saved output of task 'attempt_local508204586_0001_r_000000_0' to hdfs://hadMaster01:9000/output/_temporary/0/task_local508204586_0001_r_000000
20/07/13 17:07:39 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:07:39 INFO mapred.Task: Task 'attempt_local508204586_0001_r_000000_0' done.
20/07/13 17:07:39 INFO mapred.LocalJobRunner: Finishing task: attempt_local508204586_0001_r_000000_0
20/07/13 17:07:39 INFO mapred.LocalJobRunner: Starting task: attempt_local508204586_0001_r_000001_0
20/07/13 17:07:39 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:39 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:39 INFO util.ProcfsBasedProcessTree: ProcfsBasedProcessTree currently is supported only on Linux.
20/07/13 17:07:40 INFO mapred.Task: Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@4c441e36
20/07/13 17:07:40 INFO mapred.ReduceTask: Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@1c438e75
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: MergerManager: memoryLimit=1311663744, maxSingleShuffleLimit=327915936, mergeThreshold=865698112, ioSortFactor=10, memToMemMergeOutputsThreshold=10
20/07/13 17:07:40 INFO reduce.EventFetcher: attempt_local508204586_0001_r_000001_0 Thread started: EventFetcher for fetching Map Completion Events
20/07/13 17:07:40 INFO reduce.LocalFetcher: localfetcher#2 about to shuffle output of map attempt_local508204586_0001_m_000000_0 decomp: 250 len: 254 to MEMORY
20/07/13 17:07:40 INFO reduce.InMemoryMapOutput: Read 250 bytes from map-output for attempt_local508204586_0001_m_000000_0
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 250, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->250
20/07/13 17:07:40 INFO reduce.LocalFetcher: localfetcher#2 about to shuffle output of map attempt_local508204586_0001_m_000002_0 decomp: 36 len: 40 to MEMORY
20/07/13 17:07:40 INFO reduce.InMemoryMapOutput: Read 36 bytes from map-output for attempt_local508204586_0001_m_000002_0
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 36, inMemoryMapOutputs.size() -> 2, commitMemory -> 250, usedMemory ->286
20/07/13 17:07:40 INFO reduce.LocalFetcher: localfetcher#2 about to shuffle output of map attempt_local508204586_0001_m_000001_0 decomp: 36 len: 40 to MEMORY
20/07/13 17:07:40 INFO reduce.InMemoryMapOutput: Read 36 bytes from map-output for attempt_local508204586_0001_m_000001_0
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 36, inMemoryMapOutputs.size() -> 3, commitMemory -> 286, usedMemory ->322
20/07/13 17:07:40 INFO reduce.EventFetcher: EventFetcher is interrupted.. Returning
20/07/13 17:07:40 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: finalMerge called with 3 in-memory map-outputs and 0 on-disk map-outputs
20/07/13 17:07:40 INFO mapred.Merger: Merging 3 sorted segments
20/07/13 17:07:40 INFO mapred.Merger: Down to the last merge-pass, with 3 segments left of total size: 310 bytes
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: Merged 3 segments, 322 bytes to disk to satisfy reduce memory limit
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: Merging 1 files, 322 bytes from disk
20/07/13 17:07:40 INFO reduce.MergeManagerImpl: Merging 0 segments, 0 bytes from memory into reduce
20/07/13 17:07:40 INFO mapred.Merger: Merging 1 sorted segments
20/07/13 17:07:40 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 314 bytes
20/07/13 17:07:40 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:49 INFO mapred.Task: Task:attempt_local508204586_0001_r_000001_0 is done. And is in the process of committing
20/07/13 17:07:49 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:49 INFO mapred.Task: Task attempt_local508204586_0001_r_000001_0 is allowed to commit now
20/07/13 17:07:49 INFO output.FileOutputCommitter: Saved output of task 'attempt_local508204586_0001_r_000001_0' to hdfs://hadMaster01:9000/output/_temporary/0/task_local508204586_0001_r_000001
20/07/13 17:07:49 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:07:49 INFO mapred.Task: Task 'attempt_local508204586_0001_r_000001_0' done.
20/07/13 17:07:49 INFO mapred.LocalJobRunner: Finishing task: attempt_local508204586_0001_r_000001_0
20/07/13 17:07:49 INFO mapred.LocalJobRunner: Starting task: attempt_local508204586_0001_r_000002_0
20/07/13 17:07:49 INFO output.FileOutputCommitter: File Output Committer Algorithm version is 1
20/07/13 17:07:49 INFO output.FileOutputCommitter: FileOutputCommitter skip cleanup _temporary folders under output directory:false, ignore cleanup failures: false
20/07/13 17:07:49 INFO util.ProcfsBasedProcessTree: ProcfsBasedProcessTree currently is supported only on Linux.
20/07/13 17:07:50 INFO mapred.Task: Using ResourceCalculatorProcessTree : org.apache.hadoop.yarn.util.WindowsBasedProcessTree@2db8228f
20/07/13 17:07:50 INFO mapred.ReduceTask: Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@2d88316b
20/07/13 17:07:50 INFO reduce.MergeManagerImpl: MergerManager: memoryLimit=1311663744, maxSingleShuffleLimit=327915936, mergeThreshold=865698112, ioSortFactor=10, memToMemMergeOutputsThreshold=10
20/07/13 17:07:50 INFO reduce.EventFetcher: attempt_local508204586_0001_r_000002_0 Thread started: EventFetcher for fetching Map Completion Events
20/07/13 17:07:50 INFO reduce.LocalFetcher: localfetcher#3 about to shuffle output of map attempt_local508204586_0001_m_000000_0 decomp: 138 len: 142 to MEMORY
20/07/13 17:07:50 INFO reduce.InMemoryMapOutput: Read 138 bytes from map-output for attempt_local508204586_0001_m_000000_0
20/07/13 17:07:50 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 138, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->138
20/07/13 17:07:50 INFO reduce.LocalFetcher: localfetcher#3 about to shuffle output of map attempt_local508204586_0001_m_000002_0 decomp: 66 len: 70 to MEMORY
20/07/13 17:07:50 INFO reduce.InMemoryMapOutput: Read 66 bytes from map-output for attempt_local508204586_0001_m_000002_0
20/07/13 17:07:50 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 66, inMemoryMapOutputs.size() -> 2, commitMemory -> 138, usedMemory ->204
20/07/13 17:07:50 INFO mapreduce.Job: map 100% reduce 100%
20/07/13 17:07:50 INFO reduce.LocalFetcher: localfetcher#3 about to shuffle output of map attempt_local508204586_0001_m_000001_0 decomp: 66 len: 70 to MEMORY
20/07/13 17:07:50 INFO reduce.InMemoryMapOutput: Read 66 bytes from map-output for attempt_local508204586_0001_m_000001_0
20/07/13 17:07:50 INFO reduce.MergeManagerImpl: closeInMemoryFile -> map-output of size: 66, inMemoryMapOutputs.size() -> 3, commitMemory -> 204, usedMemory ->270
20/07/13 17:07:50 INFO reduce.EventFetcher: EventFetcher is interrupted.. Returning
20/07/13 17:07:50 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:50 INFO reduce.MergeManagerImpl: finalMerge called with 3 in-memory map-outputs and 0 on-disk map-outputs
20/07/13 17:07:51 INFO mapred.Merger: Merging 3 sorted segments
20/07/13 17:07:51 INFO mapred.Merger: Down to the last merge-pass, with 3 segments left of total size: 258 bytes
20/07/13 17:07:51 INFO reduce.MergeManagerImpl: Merged 3 segments, 270 bytes to disk to satisfy reduce memory limit
20/07/13 17:07:51 INFO reduce.MergeManagerImpl: Merging 1 files, 270 bytes from disk
20/07/13 17:07:51 INFO reduce.MergeManagerImpl: Merging 0 segments, 0 bytes from memory into reduce
20/07/13 17:07:51 INFO mapred.Merger: Merging 1 sorted segments
20/07/13 17:07:51 INFO mapred.Merger: Down to the last merge-pass, with 1 segments left of total size: 262 bytes
20/07/13 17:07:51 INFO mapred.LocalJobRunner: 3 / 3 copied.
20/07/13 17:07:51 INFO mapreduce.Job: map 100% reduce 67%
20/07/13 17:08:01 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:08:02 INFO mapreduce.Job: map 100% reduce 100%
20/07/13 17:08:10 INFO mapred.Task: Task:attempt_local508204586_0001_r_000002_0 is done. And is in the process of committing
20/07/13 17:08:10 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:08:10 INFO mapred.Task: Task attempt_local508204586_0001_r_000002_0 is allowed to commit now
20/07/13 17:08:10 INFO output.FileOutputCommitter: Saved output of task 'attempt_local508204586_0001_r_000002_0' to hdfs://hadMaster01:9000/output/_temporary/0/task_local508204586_0001_r_000002
20/07/13 17:08:10 INFO mapred.LocalJobRunner: reduce > reduce
20/07/13 17:08:10 INFO mapred.Task: Task 'attempt_local508204586_0001_r_000002_0' done.
20/07/13 17:08:10 INFO mapred.LocalJobRunner: Finishing task: attempt_local508204586_0001_r_000002_0
20/07/13 17:08:10 INFO mapred.LocalJobRunner: reduce task executor complete.
20/07/13 17:08:11 INFO mapreduce.Job: Job job_local508204586_0001 completed successfully
20/07/13 17:08:12 INFO mapreduce.Job: Counters: 35
File System Counters
FILE: Number of bytes read=12661
FILE: Number of bytes written=2974574
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=624
HDFS: Number of bytes written=197
HDFS: Number of read operations=72
HDFS: Number of large read operations=0
HDFS: Number of write operations=24
Map-Reduce Framework
Map input records=7
Map output records=99
Map output bytes=614
Map output materialized bytes=866
Input split bytes=306
Combine input records=0
Combine output records=0
Reduce input groups=24
Reduce shuffle bytes=866
Reduce input records=99
Reduce output records=24
Spilled Records=198
Shuffled Maps =9
Failed Shuffles=0
Merged Map outputs=9
GC time elapsed (ms)=12
Total committed heap usage (bytes)=2236612608
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=111
File Output Format Counters
Bytes Written=103
Process finished with exit code 0
总结:
1)、该程序有一个main()方法,用来启动任务。其中job对象存储了该程序运行的必要信息,例如通过以下语句指定Mapper类和Reducer类:
job.setMapperClass(WordCountMapper.class); job.setReducerClass(WordCountReducer.class);
2)、MapReduce程序的业务编码分为两个部分。一部分配置程序的运行信息,一部分编写该MapReduce程序的业务逻辑。
该程序的业务逻辑分为map阶段和reduce阶段,两个阶段的代码分别写在继承了Mapper类和Reducer类的自定义类中。
MapReduce程序的编写规范如下:
1)、用户编写的程序分为三个部分:Mapper、Reducer、Driver(提交运行MR程序的客户端)
2)、Mapper的输入和输出数据都是KV对的形式(KV的类型可以自定义)
3)、Reducer的输入数据类型对应Mapper的输出数据类型,也是KV对的形式
4)、Mapper中的业务逻辑写在map()方法中,Reducer的业务逻辑写在reduce()方法中
5)、map()方法(mapTask进程) 对每一个<k,v>调用一次
6)、reduceTask进程对每一组key相同的<k,v>调用一次reduce()方法
7)、用户自定义的Mapper和Reducer都要继承各自的父类
8)、整个程序需要一个Driver来进行提交,提交的是一个描述了各种必要信息的job对象。