package com.mengyao.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * KeyValueTextInputFormat splits each line at its first tab character: the text
 * before the tab becomes the key and the text after it becomes the value, so the
 * Mapper's input types (k1, v1) must be Text, Text. The separator can be changed
 * with the mapreduce.input.keyvaluelinerecordreader.key.value.separator property;
 * the default is "\t". The official API description follows:
 *
 * An {@link InputFormat} for plain text files. Files are broken into lines.
 * Either line feed or carriage-return are used to signal end of line.
 * Each line is divided into key and value parts by a separator byte. If no
 * such a byte exists, the key will be the entire line and value will be empty.
 * The separator byte can be specified in config file under the attribute name
 * mapreduce.input.keyvaluelinerecordreader.key.value.separator. The default
 * is the tab character ('\t').
 * public class KeyValueTextInputFormat extends FileInputFormat<Text, Text>
 *
 * The use case here is to read a book outline and extract all first-level
 * entries. Chapter lines have a tab right after the chapter label, while
 * section lines begin with a leading tab, so only chapter lines yield a
 * non-empty key. The job reads the HDFS input file /mapreduces/bookOutline.txt,
 * whose content is:
 * Chapter 1  Introduction to the PostgreSQL server
 *     1.1  Why program in the server
 *     1.2  About this book's code examples
 *     1.3  Beyond simple functions
 *     1.4  Managing related data with triggers
 *     1.5  Auditing changes
 *     1.6  Data cleaning
 *     1.7  Custom sort orders
 *     1.8  Programming best practices
 *     1.8.1  KISS (keep it simple, stupid)
 *     1.8.2  DRY (don't repeat yourself)
 *     1.8.3  YAGNI (you ain't gonna need it)
 *     1.8.4  SOA (service-oriented architecture)
 *     1.8.5  Type extensibility
 *     1.9  On caching
 *     1.10  Summing up: why program in the server
 *     1.10.1  Performance
 *     1.10.2  Ease of maintenance
 *     1.10.3  Simple ways to stay secure
 *     1.11  Summary
 * Chapter 2  The server programming environment
 *     2.1  Cost of acquisition
 *     2.2  Availability of developers
 *     2.3  Licensing
 *     2.4  Predictability
 *     2.5  Community
 *     2.6  Procedural languages
 *     2.6.1  Platform compatibility
 *     2.6.2  Application design
 *     2.6.3  More basics
 *     2.7  Summary
 * Chapter 3  Your first PL/pgSQL function
 *     3.1  Why PL/pgSQL
 *     3.2  The structure of a PL/pgSQL function
 * ...
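 *
 * If the outline used a different separator, the driver could override it on the
 * job Configuration before the Job is created. The colon below is purely an
 * illustrative value (this job keeps the default tab):
 * <pre>
 *     // Hypothetical override; property name taken from the API description above.
 *     conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ":");
 * </pre>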
 *
 * Output is written to the HDFS file /mapreduces/keyvaluetextinputformat/part-r-00000:
 * Chapter 1
 * Chapter 2
 * Chapter 3
 *
 * @author mengyao
 *
 */
public class KeyValueTextInputFormatApp extends Configured implements Tool {

    static class KeyValueTextInputFormatMapper extends Mapper<Text, Text, Text, NullWritable> {

        private NullWritable outputValue;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            this.outputValue = NullWritable.get();
        }

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            // A non-empty key means there was text before the line's first tab,
            // i.e. a chapter (first-level) entry; section lines start with a tab
            // and therefore arrive with an empty key.
            if (!key.toString().isEmpty()) {
                context.write(key, this.outputValue);
            }
        }
    }

    static class KeyValueTextInputFormatReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

        private NullWritable outputValue;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            this.outputValue = NullWritable.get();
        }

        @Override
        protected void reduce(Text key, Iterable<NullWritable> value, Context context)
                throws IOException, InterruptedException {
            // The shuffle groups identical keys, so each chapter label is emitted once.
            context.write(key, outputValue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf(), KeyValueTextInputFormatApp.class.getSimpleName());
        job.setJarByClass(KeyValueTextInputFormatApp.class);

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(KeyValueTextInputFormatMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(KeyValueTextInputFormatReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static int createJob(String[] args) {
        Configuration conf = new Configuration();
        conf.set("dfs.datanode.socket.write.timeout", "7200000");
        conf.set("mapreduce.input.fileinputformat.split.minsize", "268435456");
        conf.set("mapreduce.input.fileinputformat.split.maxsize", "536870912");
        conf.set("mapreduce.job.jvm.numtasks", "-1");
        conf.set("mapreduce.map.speculative", "false");
        conf.set("mapreduce.reduce.speculative", "false");
        conf.set("mapreduce.map.maxattempts", "4");
        conf.set("mapreduce.reduce.maxattempts", "4");
        conf.set("mapreduce.map.skip.maxrecords", "0");
        int status = 0;

        try {
            status = ToolRunner.run(conf, new KeyValueTextInputFormatApp(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }

        return status;
    }

    public static void main(String[] args) {
        if (args.length == 0) {
            // Fall back to the sample paths used throughout this example.
            args = new String[]{"/mapreduces/bookOutline.txt", "/mapreduces/keyvaluetextinputformat"};
        }
        if (args.length != 2) {
            System.err.println("Usage: " + KeyValueTextInputFormatApp.class.getName()
                    + " <INPUT_PATH> <OUTPUT_PATH>");
            System.exit(1);
        }
        System.exit(createJob(args));
    }

}
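// A minimal sketch of submitting this job from the command line, assuming the
// class has been packaged into a jar (the jar name below is a placeholder, not
// something defined in this repo); the paths are the sample defaults from main():
//
//   hadoop jar mapreduce-examples.jar \
//       com.mengyao.hadoop.mapreduce.KeyValueTextInputFormatApp \
//       /mapreduces/bookOutline.txt /mapreduces/keyvaluetextinputformat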