上一篇中,演示了如何使用
CombineFileInputFormat 来优化当有多个输入小文件时,减少起动的map task个数。
在自定义的MyCombineFileInputFormat中的MyRecordReader是简单代理了LineRecordReader。
其实我们还可以在这个地方做更多的事情。
本次实验是使用自定义的RecordReader从split中自定义 key value。
自定义MyKey
Mapper程序代码
可以看到使用MyRecordReader返回自定义Key后,Map 函数得到了很大的简化。
为了方便以后查看,把主程序代码也贴上来。
而且使用自定义的MyRecordReader的好处不限于此,稍后一篇讨论使用TotalOrderPartitioner时会发现,对于词频统计,使用TotalOrderPartitioner 自定义RecordReader是必要的。
在自定义的MyCombineFileInputFormat中的MyRecordReader是简单代理了LineRecordReader。
其实我们还可以在这个地方做更多的事情。
本次实验是使用自定义的RecordReader从split中自定义 key value。
自定义MyKey
自定义的key 需要实现WritableComparable 接口。
MyKey 代码如下:
package wordcount;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Custom map-input key wrapping a single character (one "word" of the
 * letter-frequency count). Implements {@link WritableComparable} so Hadoop
 * can serialize it across the shuffle and sort it.
 *
 * <p>Fix vs. the original: {@code equals}/{@code hashCode} were missing.
 * Hadoop's default HashPartitioner routes keys by {@code hashCode()}, and
 * the WritableComparable contract expects {@code compareTo} to be consistent
 * with {@code equals}, so all three are now defined together.
 */
public class MyKey implements WritableComparable<MyKey> {

    /** The single character this key represents. */
    private char c;

    /** Serializes the key (2-byte char) for the shuffle. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeChar(c);
    }

    /** Deserializes the key; inverse of {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        c = in.readChar();
    }

    /** Natural ordering by character code; consistent with equals. */
    @Override
    public int compareTo(MyKey key) {
        return Character.compare(c, key.c);
    }

    @Override
    public boolean equals(Object other) {
        return other instanceof MyKey && c == ((MyKey) other).c;
    }

    /** Must agree with equals: used by HashPartitioner to pick a reducer. */
    @Override
    public int hashCode() {
        return c;
    }

    @Override
    public String toString() {
        return String.valueOf(c);
    }

    public char getC() {
        return c;
    }

    public void setC(char c) {
        this.c = c;
    }
}
自定义CombinedFilesInputFormat 自定义RecordReader
MyCombinedFilesInputFormat 及自定义 RecordReader 代码如下:
package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.log4j.Logger;

/**
 * CombineFileInputFormat that packs many small files into few splits and
 * emits one record per English letter found in the input: key = the letter
 * (wrapped in {@link MyKey}), value = a constant count of 1.
 *
 * <p>Fix vs. the original: the class was declared as
 * {@code CombineFileInputFormat<LongWritable, Text>} although the record
 * reader actually produces {@code <MyKey, IntWritable>}; the generic
 * parameters now match what the reader really emits.
 */
public class MyCombinedFilesInputFormat extends CombineFileInputFormat<MyKey, IntWritable> {

    @SuppressWarnings({ "unchecked", "rawtypes" })
    @Override
    public RecordReader<MyKey, IntWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // CombineFileRecordReader creates one MyCombinedFilesRecordReader per
        // underlying file chunk of the combined split (passing its index).
        return new CombineFileRecordReader((CombineFileSplit) split, context, MyCombinedFilesRecordReader.class);
    }

    public static class MyCombinedFilesRecordReader extends RecordReader<MyKey, IntWritable> {
        /** Index of the file chunk (inside the combined split) this reader owns. */
        private final int index;
        /** Delegate that performs the actual line-by-line reading. */
        private final LineRecordReader reader;

        /** Current line text, iterated character by character; null before first read. */
        private String tValue;
        /** Position of the current character within {@link #tValue}. */
        private int pos = 0;

        /** Reused key instance (Hadoop permits reusing key objects between records). */
        private final MyKey key = new MyKey();
        /** Reused value instance; every emitted letter counts as 1. */
        private final IntWritable one = new IntWritable(1);

        Logger log = Logger.getLogger(MyCombinedFilesRecordReader.class);

        public MyCombinedFilesRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) {
            this.index = index;
            reader = new LineRecordReader();
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            // Narrow the combined split down to the single file chunk this
            // reader is responsible for, then delegate to LineRecordReader.
            CombineFileSplit cfsplit = (CombineFileSplit) split;
            FileSplit fileSplit = new FileSplit(cfsplit.getPath(index),
                    cfsplit.getOffset(index),
                    cfsplit.getLength(index),
                    cfsplit.getLocations());
            reader.initialize(fileSplit, context);
        }

        /**
         * Advances to the next English letter in the input.
         *
         * <p>Fixes vs. the original:
         * <ul>
         *   <li>the range test {@code c <= 'z' && c >= 'A'} wrongly accepted
         *       the punctuation between 'Z' and 'a' ({@code [ \ ] ^ _ `});</li>
         *   <li>the first character of a freshly read line was returned
         *       without being letter-checked;</li>
         *   <li>an empty input line made {@code getCurrentKey()} throw
         *       {@code StringIndexOutOfBoundsException};</li>
         *   <li>the skip over non-letters is now a loop instead of
         *       recursion, so long non-letter runs cannot overflow the stack.</li>
         * </ul>
         */
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            while (true) {
                if (tValue == null || pos >= tValue.length() - 1) {
                    if (!reader.nextKeyValue()) {
                        return false;            // input exhausted
                    }
                    pos = 0;
                    tValue = reader.getCurrentValue().toString();
                    if (tValue.isEmpty()) {
                        continue;                // skip empty lines entirely
                    }
                } else {
                    pos++;
                }
                if (isEnglishLetter(tValue.charAt(pos))) {
                    return true;
                }
            }
        }

        /** True only for 'A'-'Z' and 'a'-'z'. */
        private static boolean isEnglishLetter(char c) {
            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        }

        @Override
        public MyKey getCurrentKey() throws IOException, InterruptedException {
            key.setC(tValue.charAt(pos));
            return key;
        }

        @Override
        public IntWritable getCurrentValue() throws IOException, InterruptedException {
            return one;    // reuse the constant instead of allocating per record
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return reader.getProgress();
        }

        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}
可以看到使用MyRecordReader返回自定义Key后,Map 函数得到了很大的简化。
Mapper 代码如下:
- public static class MyWordCountMapper extends
- Mapper<MyKey, NullWritable, Text, IntWritable> {
- Text mKey = new Text();
- IntWritable mValue = new IntWritable(1);
- @Override
- protected void map(MyKey key, NullWritable value, Context context)
- throws IOException, InterruptedException {
- mKey.set(String.valueOf(key.getC()));
- context.write(mKey, mValue);
- }
- }
主程序代码如下:
package wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;

/**
 * Driver for the letter-frequency job: reads many small files through
 * MyCombinedFilesInputFormat, counts English letters, writes "letter\tcount".
 */
public class MyWordCountJob extends Configured implements Tool {
    Logger log = Logger.getLogger(MyWordCountJob.class);

    /**
     * Fix vs. the original: input value type now matches the
     * {@code IntWritable} emitted by MyCombinedFilesRecordReader (it was
     * {@code NullWritable}, which would throw {@code ClassCastException}
     * at runtime).
     */
    public static class MyWordCountMapper extends
            Mapper<MyKey, IntWritable, Text, IntWritable> {
        /** Reused output key holding the letter as text. */
        private final Text mKey = new Text();

        @Override
        protected void map(MyKey key, IntWritable value, Context context)
                throws IOException, InterruptedException {
            mKey.set(String.valueOf(key.getC()));
            context.write(mKey, value);   // forward the reader's count of 1
        }
    }

    /** Sums per-letter counts; associative and commutative, so it also serves as combiner. */
    public static class MyWordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        // Reused output value (the unused `rkey` field was removed).
        private final IntWritable rvalue = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int n = 0;
            for (IntWritable value : values) {
                n += value.get();
            }
            rvalue.set(n);
            context.write(key, rvalue);
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        // Validate the parameters and tell the user what went wrong
        // (the original returned -1 silently, which is hard to diagnose).
        if (args.length != 2) {
            System.err.println("Usage: MyWordCountJob <input path> <output path>");
            return -1;
        }

        Job job = Job.getInstance(getConf(), "MyWordCountJob");
        job.setJarByClass(MyWordCountJob.class);

        Path inPath = new Path(args[0]);
        Path outPath = new Path(args[1]);

        // Remove a stale output directory so the job can be rerun.
        outPath.getFileSystem(getConf()).delete(outPath, true);
        TextInputFormat.setInputPaths(job, inPath);
        TextOutputFormat.setOutputPath(job, outPath);

        job.setMapperClass(MyWordCountJob.MyWordCountMapper.class);
        job.setReducerClass(MyWordCountJob.MyWordCountReducer.class);
        // The sum reducer is safe to run as a combiner and cuts shuffle traffic.
        job.setCombinerClass(MyWordCountJob.MyWordCountReducer.class);

        job.setInputFormatClass(MyCombinedFilesInputFormat.class);
        // Pack small files together into splits of at most 64 MB.
        MyCombinedFilesInputFormat.setMaxInputSplitSize(job, 1024 * 1024 * 64);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) {
        // Fix vs. the original: `result` was initialized to 0, so an
        // exception from ToolRunner made the process exit with SUCCESS.
        // Default to failure and only let ToolRunner's result override it.
        int result = 1;
        try {
            result = ToolRunner.run(new Configuration(), new MyWordCountJob(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
        System.exit(result);
    }
}
而且使用自定义的MyRecordReader的好处不限于此,稍后一篇讨论使用TotalOrderPartitioner时会发现,对于词频统计,使用TotalOrderPartitioner 自定义RecordReader是必要的。
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/30066956/viewspace-2109264/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/30066956/viewspace-2109264/