This post deals with combining compressed small files. If the input files are not compressed, the code still works; in that case each chunk of a combined split is simply one small file.
There is not much more to say, so here are the three complete Java files. Only the map and reduce code and the main (driver) code are missing, and those were already provided in the previous posts; a minimal driver sketch is also included at the end of this post.
CompressedCombineFileInputFormat.java
package compressedCombineFile;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;

/**
 * Packs many small (possibly compressed) files into CombineFileSplits
 * and hands each chunk of a split to its own record reader.
 */
public class CompressedCombineFileInputFormat
        extends CombineFileInputFormat<CompressedCombineFileWritable, Text> {

    public CompressedCombineFileInputFormat() {
        super();
    }

    @Override
    public RecordReader<CompressedCombineFileWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader delegates every chunk of the combined split
        // to a fresh CompressedCombineFileRecordReader instance.
        return new CombineFileRecordReader<CompressedCombineFileWritable, Text>(
                (CombineFileSplit) split, context,
                CompressedCombineFileRecordReader.class);
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Compressed files cannot be split mid-stream, so never split a file.
        return false;
    }
}
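One practical note: with no upper bound configured, CombineFileInputFormat can collapse all the small files on a node into one oversized split. A minimal sketch of capping the combined split size inside the driver's main() method, assuming Hadoop 2.x (the property name and the 128 MB figure are my assumptions, not from the original post):

// Sketch: cap combined splits at roughly 128 MB (assumed value) before
// submitting the job. CombineFileInputFormat reads this property in
// getSplits(); older releases use "mapred.max.split.size" instead.
Configuration conf = new Configuration();
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);
Job job = Job.getInstance(conf, "combine-small-files");
job.setInputFormatClass(CompressedCombineFileInputFormat.class);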
CompressedCombineFileRecordReader.java: sets up how records are read from each chunk
package compressedCombineFile;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.util.LineReader;
/**
 * A RecordReader is responsible for extracting records from one chunk
 * (one underlying file) of the CombineFileSplit.
 */
public class CompressedCombineFileRecordReader
        extends RecordReader<CompressedCombineFileWritable, Text> {

    private long startOffset;   // where this chunk begins in the (decompressed) file
    private long end;           // position just past the last byte of the chunk
    private long pos;           // current read position
    private FileSystem fs;
    private Path path;          // original, possibly compressed, file
    private Path dPath;         // decompressed file that is actually read
    private CompressedCombineFileWritable key = new CompressedCombineFileWritable();
    private Text value;
    private long rlength;       // length of the decompressed file
    private FSDataInputStream fileIn;
    private LineReader reader;
    /**
     * CombineFileRecordReader creates one instance per chunk, passing the
     * index of the file inside the CombineFileSplit.
     */
    public CompressedCombineFileRecordReader(CombineFileSplit split,
            TaskAttemptContext context, Integer index) throws IOException {
        Configuration currentConf = context.getConfiguration();
        this.path = split.getPath(index);
        boolean isCompressed = findCodec(currentConf, path);
        if (isCompressed) {
            // Decompresses the whole file; sets dPath and rlength as side effects.
            codecWiseDecompress(currentConf);
        }
        fs = this.path.getFileSystem(currentConf);
        this.startOffset = split.getOffset(index);
        if (isCompressed) {
            this.end = startOffset + rlength;
        } else {
            this.end = startOffset + split.getLength(index);
            dPath = path;
        }
        boolean skipFirstLine = false;
        fileIn = fs.open(dPath);
        if (isCompressed) {
            // Clean up the temporary decompressed copy when the task JVM exits.
            fs.deleteOnExit(dPath);
        }
        if (startOffset != 0) {
            // Mid-file chunk: the previous chunk's reader already consumed the
            // line spanning the boundary, so skip the partial first line.
            skipFirstLine = true;
            --startOffset;
            fileIn.seek(startOffset);
        }
        reader = new LineReader(fileIn);
        if (skipFirstLine) {
            startOffset += reader.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, end - startOffset));
        }
        this.pos = startOffset;
    }
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // All setup happens in the constructor, which CombineFileRecordReader
        // calls once per chunk, so there is nothing to do here.
    }

    @Override
    public void close() throws IOException {
        // Closing the LineReader also closes the underlying input stream.
        if (reader != null) {
            reader.close();
        }
    }

    @Override
    public float getProgress() throws IOException {
        if (startOffset == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
    }
    @Override
    public boolean nextKeyValue() throws IOException {
        if (key == null || key.fileName == null) {
            key = new CompressedCombineFileWritable();
            key.fileName = dPath.getName();
        }
        key.offset = pos;
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        if (pos < end) {
            newSize = reader.readLine(value);
            pos += newSize;
        }
        if (newSize == 0) {
            // End of this chunk: signal that no more records remain.
            key = null;
            value = null;
            return false;
        }
        return true;
    }

    @Override
    public CompressedCombineFileWritable getCurrentKey()
            throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }
    /**
     * Decompresses this.path next to the original file using the codec
     * inferred from its extension; sets dPath and rlength as side effects.
     */
    private void codecWiseDecompress(Configuration conf) throws IOException {
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(path);
        if (codec == null) {
            // No registered codec for this suffix: fail the task cleanly.
            throw new IOException("No codec found for " + path);
        }
        // Strip the codec's extension, e.g. part-00000.gz -> part-00000.
        String outputUri = CompressionCodecFactory.removeSuffix(
                path.toString(), codec.getDefaultExtension());
        dPath = new Path(outputUri);
        InputStream in = null;
        OutputStream out = null;
        fs = this.path.getFileSystem(conf);
        try {
            in = codec.createInputStream(fs.open(path));
            out = fs.create(dPath);
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
            rlength = fs.getFileStatus(dPath).getLen();
        }
    }

    private boolean findCodec(Configuration conf, Path p) {
        // A codec registered for the path's suffix means the file is compressed.
        return new CompressionCodecFactory(conf).getCodec(p) != null;
    }
}
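With this reader in place, a mapper receives the decompressed file name plus byte offset as its key and one line of text as its value. A minimal sketch of such a mapper, counting lines per source file (the LineCountMapper class is hypothetical, not part of the original series):

package compressedCombineFile;

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical example: emits (file name, 1) for every input line.
public class LineCountMapper
        extends Mapper<CompressedCombineFileWritable, Text, Text, LongWritable> {

    private static final LongWritable ONE = new LongWritable(1);
    private final Text fileName = new Text();

    @Override
    protected void map(CompressedCombineFileWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key.fileName names the small file; key.offset is this line's byte offset.
        fileName.set(key.fileName);
        context.write(fileName, ONE);
    }
}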
CompressedCombineFileWritable.java
package compressedCombineFile;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * This record keeps (fileName, offset) pairs.
 */
@SuppressWarnings("rawtypes")
public class CompressedCombineFileWritable implements WritableComparable {

    public long offset;
    public String fileName;

    public CompressedCombineFileWritable() {
        super();
    }

    public CompressedCombineFileWritable(long offset, String fileName) {
        super();
        this.offset = offset;
        this.fileName = fileName;
    }

    public void readFields(DataInput in) throws IOException {
        this.offset = in.readLong();
        this.fileName = Text.readString(in);
    }

    public void write(DataOutput out) throws IOException {
        out.writeLong(offset);
        Text.writeString(out, fileName);
    }

    public int compareTo(Object o) {
        CompressedCombineFileWritable that = (CompressedCombineFileWritable) o;
        // Order by file name first, then by offset within the file.
        int f = this.fileName.compareTo(that.fileName);
        if (f == 0) {
            return (int) Math.signum((double) (this.offset - that.offset));
        }
        return f;
    }

    @Override
    public boolean equals(Object obj) {
        if (obj instanceof CompressedCombineFileWritable) {
            return this.compareTo(obj) == 0;
        }
        return false;
    }

    @Override
    public int hashCode() {
        final int hashPrime = 47;
        int hash = 13;
        hash = hashPrime * hash + (this.fileName != null ? this.fileName.hashCode() : 0);
        hash = hashPrime * hash + (int) (this.offset ^ (this.offset >>> 16));
        return hash;
    }

    @Override
    public String toString() {
        return this.fileName + "-" + this.offset;
    }
}
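Finally, the driver sketch promised at the top. It assumes Hadoop 2.x and wires in the hypothetical LineCountMapper from the earlier sketch together with the stock LongSumReducer; the class name, job name, split-size value, and argument handling are all placeholders, not the original series' driver:

package compressedCombineFile;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer;

// Hypothetical driver: directory of small files in, line counts per file out.
public class CompressedCombineFileDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap combined split size (see the note after the InputFormat above).
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);

        Job job = Job.getInstance(conf, "compressed-combine-small-files");
        job.setJarByClass(CompressedCombineFileDriver.class);
        job.setInputFormatClass(CompressedCombineFileInputFormat.class);

        job.setMapperClass(LineCountMapper.class);
        job.setReducerClass(LongSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));   // input dir of small files
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}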