NLineInputFormat继承自FileInputFormat,它实现了按行数而不是按文件大小来切分文件的方法。
它重写了FileInputFormat中的getSplits()和createRecordReader()方法。由于原有的NLineInputFormat是在旧的mapreduce框架下写的,这里给出新框架下的NLineInputFormat实现,代码如下:
package com.yuankang.hadoop;
import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.lib.input.LineRecordReader; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.util.LineReader; import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class NLineInputFormat extends FileInputFormat<LongWritable, Text>{ private int N = 1;
@Override public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { return new LineRecordReader(); }
/** * Logically splits the set of input files for the job, splits N lines * of the input as one split. * * @see org.apache.hadoop.mapred.FileInputFormat#getSplits(JobConf, int) */ public List<InputSplit> getSplits(JobContext job ) throws IOException{ List<InputSplit> splits = new ArrayList<InputSplit>(); for (FileStatus file : listStatus(job)) { Path path = file.getPath(); FileSystem fs = path.getFileSystem(job.getConfiguration());
LineReader lr = null; try { FSDataInputStream in = fs.open(path); Configuration conf = job.getConfiguration(); lr = new LineReader(in, conf); N = conf.getInt("mapred.line.input.format.linespermap", 1); Text line = new Text(); int numLines = 0; long begin = 0; long length = 0; int num = -1; while ((num = lr.readLine(line)) > 0) { numLines++; length += num; if (numLines == N) { splits.add(new FileSplit(path, begin, length, new String[]{})); begin += length; length = 0; numLines = 0; } } if (numLines != 0) { splits.add(new FileSplit(path, begin, length, new String[]{})); }
} finally { if (lr != null) { lr.close(); } } } System.out.println("Total # of splits: " + splits.size()); return splits; }
} |