Hadoop使用CombineFileInputFormat处理大量小文件接口实现(Hadoop-1.0.4)

Configuration设置切片(split)大小为64M(注意这里设置的是输入切片大小,并非HDFS块大小)

Configuration conf = new Configuration();
conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERNODE, 64 * 1024 * 1024);
conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERRACK, 64 * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 64 * 1024 * 1024);

CombineFileInputFormat具体实现:(内部的RecordReader仿照TextInputFormat的LineRecordReader)

自定义KEY:InputSplitFile类,两个成员:offset和filename。作为map的输入KEY。

package com.****.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.util.LineReader;

/**
 * An input format that packs many small input files into a few large
 * {@code CombineFileSplit}s, so each map task processes several files.
 * Keys are {@link InputSplitFile} (source file name + byte offset of the
 * line); values are the text of one line.
 */
public class MyCombineFileInputFormat
		extends CombineFileInputFormat<InputSplitFile, Text> {

	/**
	 * Creates a reader that iterates over every chunk of the combined split,
	 * instantiating one {@link MyCombineFileRecordReader} per chunk.
	 */
	@Override
	public RecordReader<InputSplitFile, Text> createRecordReader(
			InputSplit split, TaskAttemptContext context) throws IOException {
		CombineFileSplit combineSplit = (CombineFileSplit) split;
		return new CombineFileRecordReader<InputSplitFile, Text>(combineSplit,
				context, MyCombineFileRecordReader.class);
	}
}

/**
 * Reads one file chunk of a {@code CombineFileSplit} as (offset, line) records,
 * modeled on {@code TextInputFormat}'s {@code LineRecordReader}.
 * {@code CombineFileRecordReader} constructs one instance per chunk, passing the
 * chunk's index within the combined split; all setup happens in the constructor,
 * which is why {@link #initialize} is empty.
 */
class MyCombineFileRecordReader extends RecordReader<InputSplitFile, Text> {
	private static final Log LOG = LogFactory
			.getLog(MyCombineFileRecordReader.class);

	private CompressionCodecFactory compressionCodecs = null;
	private long start; // first byte of this chunk (nudged past a partial line)
	private long pos; // current byte position within the file
	private long end; // one past the last byte of this chunk
	private Path path; // the file this chunk belongs to
	private LineReader in;
	private int maxLineLength; // lines longer than this are skipped, not returned
	private InputSplitFile key = null;
	private Text value = null;

	/**
	 * Opens chunk {@code index} of {@code split} and positions the stream at the
	 * first full line.
	 *
	 * @param split   the combined split holding paths/offsets/lengths per chunk
	 * @param context task context, supplies the job configuration
	 * @param index   which chunk of the split this reader handles
	 * @throws IOException if the file cannot be opened or seeked
	 */
	public MyCombineFileRecordReader(CombineFileSplit split,
			TaskAttemptContext context, Integer index) throws IOException {
		Configuration job = context.getConfiguration();
		this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
				Integer.MAX_VALUE);
		this.path = split.getPath(index);
		this.start = split.getOffset(index);
		this.end = start + split.getLength(index);
		compressionCodecs = new CompressionCodecFactory(job);
		final CompressionCodec codec = compressionCodecs.getCodec(this.path);
		boolean skipFirstLine = false;

		FileSystem fs = path.getFileSystem(job);
		FSDataInputStream fileIn = fs.open(split.getPath(index));
		if (codec != null) {
			// Compressed input cannot be seeked mid-stream: read it end to end.
			in = new LineReader(codec.createInputStream(fileIn), job);
			end = Long.MAX_VALUE;
		} else {
			if (start != 0) {
				// A chunk that starts mid-file may begin mid-line; back up one
				// byte and discard the partial line (the previous chunk's
				// reader consumes that line in full).
				skipFirstLine = true;
				--start;
				fileIn.seek(start);
			}
			in = new LineReader(fileIn, job);
		}
		if (skipFirstLine) // skip first line and re-establish "startOffset".
		{
			start += in.readLine(new Text(), 0,
					(int) Math.min((long) Integer.MAX_VALUE, end - start));
		}
		this.pos = start;
	}

	/** No-op: all initialization is done in the constructor (see class note). */
	@Override
	public void initialize(InputSplit genericSplit, TaskAttemptContext context)
			throws IOException, InterruptedException {
	}

	/**
	 * Advances to the next line. The key carries the file name plus the byte
	 * offset where the line starts; the value is the line text. Lines longer
	 * than {@code maxLineLength} are logged and skipped. Returns {@code false}
	 * (and nulls key/value) at end of chunk.
	 */
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if (key == null) {
			key = new InputSplitFile();
			// NOTE(review): uses the bare file name, not the full path — files
			// with identical names in different directories are ambiguous.
			key.setFileName(path.getName());
		}
		key.setOffset(pos);
		if (value == null) {
			value = new Text();
		}
		int newSize = 0;
		while (pos < end) {
			newSize = in.readLine(value, maxLineLength,
					Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),
							maxLineLength));
			if (newSize == 0) {
				// End of stream.
				break;
			}
			pos += newSize;
			if (newSize < maxLineLength) {
				// Got a complete line within the length limit.
				break;
			}

			// line too long. try again
			LOG.info("Skipped line of size " + newSize + " at pos "
					+ (pos - newSize));
		}
		if (newSize == 0) {
			key = null;
			value = null;
			return false;
		} else {
			return true;
		}
	}

	@Override
	public InputSplitFile getCurrentKey() throws IOException,
			InterruptedException {
		return key;
	}

	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		return value;
	}

	/** Fraction of this chunk consumed; 0 for an empty chunk. */
	@Override
	public float getProgress() throws IOException, InterruptedException {
		if (start == end) {
			return 0.0f;
		} else {
			return Math.min(1.0f, (pos - start) / (float) (end - start));
		}
	}

	/** Closes the line reader (which closes the underlying file stream). */
	@Override
	public void close() throws IOException {
		if (in != null)
			in.close();
	}

}

/**
 * Map input key pairing a source file name with the byte offset of a line
 * within that file. Serialized as a long offset followed by the file name;
 * ordered by file name first, then by offset.
 */
class InputSplitFile implements WritableComparable<InputSplitFile> {

	private long offset; // byte offset of the line within the file
	private String fileName; // bare file name of the source file

	public long getOffset() {
		return offset;
	}

	public void setOffset(long offset) {
		this.offset = offset;
	}

	public String getFileName() {
		return fileName;
	}

	public void setFileName(String fileName) {
		this.fileName = fileName;
	}

	public void readFields(DataInput in) throws IOException {
		this.offset = in.readLong();
		this.fileName = Text.readString(in);
	}

	public void write(DataOutput out) throws IOException {
		out.writeLong(offset);
		Text.writeString(out, fileName);
	}

	public int compareTo(InputSplitFile that) {
		int byName = this.fileName.compareTo(that.fileName);
		if (byName != 0) {
			return byName;
		}
		// Compare offsets directly: the old Math.signum((double)(a - b)) could
		// overflow the long subtraction and loses precision beyond 2^53.
		return this.offset < that.offset
				? -1
				: (this.offset == that.offset ? 0 : 1);
	}

	// Overrides Object.equals (the old equals(InputSplitFile) merely
	// overloaded it, so hash-based collections never called it).
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof InputSplitFile)) {
			return false;
		}
		InputSplitFile that = (InputSplitFile) obj;
		return this.offset == that.offset
				&& (this.fileName == null
						? that.fileName == null
						: this.fileName.equals(that.fileName));
	}

	// Consistent with equals(); the old body ("assert false; return 42")
	// threw under -ea and defeated hashing entirely.
	@Override
	public int hashCode() {
		int result = fileName == null ? 0 : fileName.hashCode();
		return 31 * result + (int) (offset ^ (offset >>> 32));
	}
}





  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值