分别建立三个文件:
file1.txt
file2.txt
file3.txt
文件内容分别是:
MapReduce is simple
和
MapReduce is powerful is simple
和
Hello MapReduce bye MapReduce
1 import java.io.IOException; 2 import java.util.StringTokenizer; 3 import org.apache.hadoop.conf.Configuration; 4 import org.apache.hadoop.fs.Path; 5 import org.apache.hadoop.io.Text; 6 import org.apache.hadoop.mapreduce.Job; 7 import org.apache.hadoop.mapreduce.Mapper; 8 import org.apache.hadoop.mapreduce.Reducer; 9 import org.apache.hadoop.mapreduce.lib.input.FileSplit; 10 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 11 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 12 13 public class InvertedIndex{ 14 private static class Map extends Mapper<Object, Text, Text, Text>{ 15 private Text keyInfo =new Text(); //存单词与uri 16 private Text valueInfo =new Text(); //存词频 17 private FileSplit split; //存储split对象 18 @Override 19 protected void map(Object key, Text value, Mapper<Object, Text, Text, Text>.Context context) 20 throws IOException, InterruptedException { 21 // TODO Auto-generated method stub 22 split=(FileSplit) context.getInputSplit(); 23 StringTokenizer itr=new StringTokenizer(value.toString()); 24 while (itr.hasMoreTokens()) { 25 int splitIndex=split.getPath().toString().indexOf("file"); 26 keyInfo.set(itr.nextToken()+":"+split.getPath().toString().substring(splitIndex)); 27 valueInfo.set("1"); 28 context.write(keyInfo, valueInfo); 29 } 30 } 31 } 32 33 public static class Combine extends Reducer<Text, Text, Text, Text>{ //相同的key,value累加 34 private Text info=new Text(); 35 @Override 36 protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) 37 throws IOException, InterruptedException { 38 // TODO Auto-generated method stub 39 int sum=0; 40 for(Text value:values){ 41 sum+=Integer.parseInt(value.toString()); 42 } 43 int splitIndex=key.toString().indexOf(":"); 44 info.set(key.toString().substring(splitIndex+1)+":"+sum); //与下一行顺序不能调换 45 key.set(key.toString().substring(0, splitIndex)); 46 context.write(key, info); 47 } 48 } 49 public static class Reduce extends Reducer<Text, Text, Text, 
Text>{ 50 private Text result=new Text(); 51 @Override 52 protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, Text>.Context context) 53 throws IOException, InterruptedException { 54 // TODO Auto-generated method stub 55 String file=new String(); 56 for(Text value:values){ 57 file+=value.toString()+";"; 58 } 59 result.set(file); 60 context.write(key, result); 61 } 62 } 63 public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 64 Configuration conf=new Configuration(); 65 @SuppressWarnings("deprecation") 66 Job job=new Job(conf,"invertedIndex"); 67 job.setJarByClass(InvertedIndex.class); 68 job.setMapperClass(Map.class); 69 job.setCombinerClass(Combine.class); 70 job.setReducerClass(Reduce.class); 71 job.setMapOutputKeyClass(Text.class); 72 job.setMapOutputValueClass(Text.class); 73 job.setOutputKeyClass(Text.class); 74 job.setOutputValueClass(Text.class); 75 FileInputFormat.setInputPaths(job, new Path(args[0])); 76 FileOutputFormat.setOutputPath(job, new Path(args[1])); 77 System.exit(job.waitForCompletion(true)?0:1); 78 } 79 }
1 2017-03-15 22:16:27,071 WARN [org.apache.hadoop.util.NativeCodeLoader] - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable 2 2017-03-15 22:16:27,748 INFO [org.apache.hadoop.conf.Configuration.deprecation] - session.id is deprecated. Instead, use dfs.metrics.session-id 3 2017-03-15 22:16:27,749 INFO [org.apache.hadoop.metrics.jvm.JvmMetrics] - Initializing JVM Metrics with processName=JobTracker, sessionId= 4 2017-03-15 22:16:28,058 WARN [org.apache.hadoop.mapreduce.JobResourceUploader] - Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this. 5 2017-03-15 22:16:28,061 WARN [org.apache.hadoop.mapreduce.JobResourceUploader] - No job jar file set. User classes may not be found. See Job or Job#setJar(String). 6 2017-03-15 22:16:28,124 INFO [org.apache.hadoop.mapreduce.lib.input.FileInputFormat] - Total input paths to process : 3 7 2017-03-15 22:16:28,171 INFO [org.apache.hadoop.mapreduce.JobSubmitter] - number of splits:3 8 2017-03-15 22:16:28,289 INFO [org.apache.hadoop.mapreduce.JobSubmitter] - Submitting tokens for job: job_local1466554694_0001 9 2017-03-15 22:16:28,463 INFO [org.apache.hadoop.mapreduce.Job] - The url to track the job: http://localhost:8080/ 10 2017-03-15 22:16:28,468 INFO [org.apache.hadoop.mapreduce.Job] - Running job: job_local1466554694_0001 11 2017-03-15 22:16:28,473 INFO [org.apache.hadoop.mapred.LocalJobRunner] - OutputCommitter set in config null 12 2017-03-15 22:16:28,479 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1 13 2017-03-15 22:16:28,482 INFO [org.apache.hadoop.mapred.LocalJobRunner] - OutputCommitter is org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 14 2017-03-15 22:16:28,571 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Waiting for map tasks 15 2017-03-15 22:16:28,571 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 
Starting task: attempt_local1466554694_0001_m_000000_0 16 2017-03-15 22:16:28,609 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1 17 2017-03-15 22:16:28,621 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ] 18 2017-03-15 22:16:28,624 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://localhost:9000/user/hadoop/input/file2.txt:0+32 19 2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584) 20 2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100 21 2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080 22 2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600 23 2017-03-15 22:16:28,679 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600 24 2017-03-15 22:16:28,683 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer 25 2017-03-15 22:16:28,754 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 26 2017-03-15 22:16:28,756 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output 27 2017-03-15 22:16:28,757 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output 28 2017-03-15 22:16:28,757 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 92; bufvoid = 104857600 29 2017-03-15 22:16:28,757 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396(104857584); kvend = 26214380(104857520); length = 17/6553600 30 2017-03-15 22:16:28,770 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0 31 2017-03-15 22:16:28,774 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_m_000000_0 is done. 
And is in the process of committing 32 2017-03-15 22:16:28,785 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map 33 2017-03-15 22:16:28,785 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_m_000000_0' done. 34 2017-03-15 22:16:28,785 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_m_000000_0 35 2017-03-15 22:16:28,786 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_m_000001_0 36 2017-03-15 22:16:28,791 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1 37 2017-03-15 22:16:28,792 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ] 38 2017-03-15 22:16:28,793 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://localhost:9000/user/hadoop/input/file3.txt:0+30 39 2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584) 40 2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100 41 2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080 42 2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600 43 2017-03-15 22:16:28,823 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600 44 2017-03-15 22:16:28,824 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer 45 2017-03-15 22:16:28,831 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 46 2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output 47 2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output 48 2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 78; bufvoid = 104857600 49 2017-03-15 22:16:28,832 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 
26214396(104857584); kvend = 26214384(104857536); length = 13/6553600 50 2017-03-15 22:16:28,834 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0 51 2017-03-15 22:16:28,835 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_m_000001_0 is done. And is in the process of committing 52 2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map 53 2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_m_000001_0' done. 54 2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_m_000001_0 55 2017-03-15 22:16:28,839 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_m_000002_0 56 2017-03-15 22:16:28,842 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1 57 2017-03-15 22:16:28,843 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ] 58 2017-03-15 22:16:28,844 INFO [org.apache.hadoop.mapred.MapTask] - Processing split: hdfs://localhost:9000/user/hadoop/input/file1.txt:0+20 59 2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - (EQUATOR) 0 kvi 26214396(104857584) 60 2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - mapreduce.task.io.sort.mb: 100 61 2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - soft limit at 83886080 62 2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufvoid = 104857600 63 2017-03-15 22:16:28,888 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396; length = 6553600 64 2017-03-15 22:16:28,889 INFO [org.apache.hadoop.mapred.MapTask] - Map output collector class = org.apache.hadoop.mapred.MapTask$MapOutputBuffer 65 2017-03-15 22:16:28,893 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 66 2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - Starting flush of map output 67 
2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - Spilling map output 68 2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - bufstart = 0; bufend = 56; bufvoid = 104857600 69 2017-03-15 22:16:28,894 INFO [org.apache.hadoop.mapred.MapTask] - kvstart = 26214396(104857584); kvend = 26214388(104857552); length = 9/6553600 70 2017-03-15 22:16:28,895 INFO [org.apache.hadoop.mapred.MapTask] - Finished spill 0 71 2017-03-15 22:16:28,896 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_m_000002_0 is done. And is in the process of committing 72 2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map 73 2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_m_000002_0' done. 74 2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_m_000002_0 75 2017-03-15 22:16:28,899 INFO [org.apache.hadoop.mapred.LocalJobRunner] - map task executor complete. 
76 2017-03-15 22:16:28,901 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Waiting for reduce tasks 77 2017-03-15 22:16:28,902 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Starting task: attempt_local1466554694_0001_r_000000_0 78 2017-03-15 22:16:28,914 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - File Output Committer Algorithm version is 1 79 2017-03-15 22:16:28,915 INFO [org.apache.hadoop.mapred.Task] - Using ResourceCalculatorProcessTree : [ ] 80 2017-03-15 22:16:28,919 INFO [org.apache.hadoop.mapred.ReduceTask] - Using ShuffleConsumerPlugin: org.apache.hadoop.mapreduce.task.reduce.Shuffle@4fb755c9 81 2017-03-15 22:16:28,932 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - MergerManager: memoryLimit=1945842432, maxSingleShuffleLimit=486460608, mergeThreshold=1284256000, ioSortFactor=10, memToMemMergeOutputsThreshold=10 82 2017-03-15 22:16:28,934 INFO [org.apache.hadoop.mapreduce.task.reduce.EventFetcher] - attempt_local1466554694_0001_r_000000_0 Thread started: EventFetcher for fetching Map Completion Events 83 2017-03-15 22:16:28,980 INFO [org.apache.hadoop.mapreduce.task.reduce.LocalFetcher] - localfetcher#1 about to shuffle output of map attempt_local1466554694_0001_m_000002_0 decomp: 64 len: 68 to MEMORY 84 2017-03-15 22:16:28,984 INFO [org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput] - Read 64 bytes from map-output for attempt_local1466554694_0001_m_000002_0 85 2017-03-15 22:16:28,986 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - closeInMemoryFile -> map-output of size: 64, inMemoryMapOutputs.size() -> 1, commitMemory -> 0, usedMemory ->64 86 2017-03-15 22:16:28,991 INFO [org.apache.hadoop.mapreduce.task.reduce.LocalFetcher] - localfetcher#1 about to shuffle output of map attempt_local1466554694_0001_m_000001_0 decomp: 64 len: 68 to MEMORY 87 2017-03-15 22:16:28,992 INFO [org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput] - Read 64 bytes from map-output for 
attempt_local1466554694_0001_m_000001_0 88 2017-03-15 22:16:28,992 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - closeInMemoryFile -> map-output of size: 64, inMemoryMapOutputs.size() -> 2, commitMemory -> 64, usedMemory ->128 89 2017-03-15 22:16:28,993 INFO [org.apache.hadoop.mapreduce.task.reduce.LocalFetcher] - localfetcher#1 about to shuffle output of map attempt_local1466554694_0001_m_000000_0 decomp: 87 len: 91 to MEMORY 90 2017-03-15 22:16:28,994 INFO [org.apache.hadoop.mapreduce.task.reduce.InMemoryMapOutput] - Read 87 bytes from map-output for attempt_local1466554694_0001_m_000000_0 91 2017-03-15 22:16:28,994 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - closeInMemoryFile -> map-output of size: 87, inMemoryMapOutputs.size() -> 3, commitMemory -> 128, usedMemory ->215 92 2017-03-15 22:16:28,994 INFO [org.apache.hadoop.mapreduce.task.reduce.EventFetcher] - EventFetcher is interrupted.. Returning 93 2017-03-15 22:16:28,995 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 3 / 3 copied. 
94 2017-03-15 22:16:28,995 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - finalMerge called with 3 in-memory map-outputs and 0 on-disk map-outputs 95 2017-03-15 22:16:29,004 INFO [org.apache.hadoop.mapred.Merger] - Merging 3 sorted segments 96 2017-03-15 22:16:29,005 INFO [org.apache.hadoop.mapred.Merger] - Down to the last merge-pass, with 3 segments left of total size: 183 bytes 97 2017-03-15 22:16:29,006 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - Merged 3 segments, 215 bytes to disk to satisfy reduce memory limit 98 2017-03-15 22:16:29,006 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - Merging 1 files, 215 bytes from disk 99 2017-03-15 22:16:29,007 INFO [org.apache.hadoop.mapreduce.task.reduce.MergeManagerImpl] - Merging 0 segments, 0 bytes from memory into reduce 100 2017-03-15 22:16:29,007 INFO [org.apache.hadoop.mapred.Merger] - Merging 1 sorted segments 101 2017-03-15 22:16:29,008 INFO [org.apache.hadoop.mapred.Merger] - Down to the last merge-pass, with 1 segments left of total size: 203 bytes 102 2017-03-15 22:16:29,008 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 3 / 3 copied. 103 2017-03-15 22:16:29,049 INFO [org.apache.hadoop.conf.Configuration.deprecation] - mapred.skip.on is deprecated. Instead, use mapreduce.job.skiprecords 104 2017-03-15 22:16:29,186 INFO [org.apache.hadoop.mapred.Task] - Task:attempt_local1466554694_0001_r_000000_0 is done. And is in the process of committing 105 2017-03-15 22:16:29,190 INFO [org.apache.hadoop.mapred.LocalJobRunner] - 3 / 3 copied. 
106 2017-03-15 22:16:29,190 INFO [org.apache.hadoop.mapred.Task] - Task attempt_local1466554694_0001_r_000000_0 is allowed to commit now 107 2017-03-15 22:16:29,209 INFO [org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter] - Saved output of task 'attempt_local1466554694_0001_r_000000_0' to hdfs://localhost:9000/user/hadoop/output/_temporary/0/task_local1466554694_0001_r_000000 108 2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce > reduce 109 2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.Task] - Task 'attempt_local1466554694_0001_r_000000_0' done. 110 2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.LocalJobRunner] - Finishing task: attempt_local1466554694_0001_r_000000_0 111 2017-03-15 22:16:29,210 INFO [org.apache.hadoop.mapred.LocalJobRunner] - reduce task executor complete. 112 2017-03-15 22:16:29,473 INFO [org.apache.hadoop.mapreduce.Job] - Job job_local1466554694_0001 running in uber mode : false 113 2017-03-15 22:16:29,474 INFO [org.apache.hadoop.mapreduce.Job] - map 100% reduce 100% 114 2017-03-15 22:16:29,475 INFO [org.apache.hadoop.mapreduce.Job] - Job job_local1466554694_0001 completed successfully 115 2017-03-15 22:16:29,487 INFO [org.apache.hadoop.mapreduce.Job] - Counters: 35 116 File System Counters 117 FILE: Number of bytes read=4131 118 FILE: Number of bytes written=1128147 119 FILE: Number of read operations=0 120 FILE: Number of large read operations=0 121 FILE: Number of write operations=0 122 HDFS: Number of bytes read=258 123 HDFS: Number of bytes written=165 124 HDFS: Number of read operations=33 125 HDFS: Number of large read operations=0 126 HDFS: Number of write operations=6 127 Map-Reduce Framework 128 Map input records=3 129 Map output records=12 130 Map output bytes=226 131 Map output materialized bytes=227 132 Input split bytes=342 133 Combine input records=12 134 Combine output records=10 135 Reduce input groups=6 136 Reduce shuffle bytes=227 137 Reduce input records=10 138 Reduce 
output records=6 139 Spilled Records=20 140 Shuffled Maps =3 141 Failed Shuffles=0 142 Merged Map outputs=3 143 GC time elapsed (ms)=0 144 Total committed heap usage (bytes)=1592262656 145 Shuffle Errors 146 BAD_ID=0 147 CONNECTION=0 148 IO_ERROR=0 149 WRONG_LENGTH=0 150 WRONG_MAP=0 151 WRONG_REDUCE=0 152 File Input Format Counters 153 Bytes Read=82 154 File Output Format Counters 155 Bytes Written=165
结果:
1 Hello file3.txt:1; 2 MapReduce file3.txt:2;file1.txt:1;file2.txt:1; 3 bye file3.txt:1; 4 is file1.txt:1;file2.txt:2; 5 powerful file2.txt:1; 6 simple file2.txt:1;file1.txt:1;