Hadoop使用CombineFileInputFormat处理大量小文件接口实现(Hadoop-1.0.4)

Configuration设置切片(split)大小为64M(注意这里设置的是输入切片大小,并非HDFS块大小)

Configuration conf = new Configuration();
conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERNODE, 64 * 1024 * 1024);
conf.setLong(MyCombineFileInputFormat.SPLIT_MINSIZE_PERRACK, 64 * 1024 * 1024);
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 64 * 1024 * 1024);

CombineFileInputFormat具体实现:(内部的RecordReader仿照TextInputFormat的LineRecordReader)

自定义KEY:InputSplitFile类,两个成员:offset和filename。作为map的输入KEY。

package com.****.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.util.LineReader;

/**
 * An input format that packs many small input files into a few large
 * {@code CombineFileSplit}s, so each map task processes several files.
 * Keys are {@link InputSplitFile} (source file name + byte offset of the
 * line); values are the text of one line.
 */
public class MyCombineFileInputFormat
		extends CombineFileInputFormat<InputSplitFile, Text> {

	/**
	 * Creates a reader that iterates over every chunk of the combined split,
	 * instantiating one {@link MyCombineFileRecordReader} per chunk.
	 */
	@Override
	public RecordReader<InputSplitFile, Text> createRecordReader(
			InputSplit split, TaskAttemptContext context) throws IOException {
		CombineFileSplit combineSplit = (CombineFileSplit) split;
		return new CombineFileRecordReader<InputSplitFile, Text>(combineSplit,
				context, MyCombineFileRecordReader.class);
	}
}

/**
 * Reads one file chunk of a {@code CombineFileSplit} as (offset, line) records,
 * modeled on {@code TextInputFormat}'s {@code LineRecordReader}.
 * {@code CombineFileRecordReader} constructs one instance per chunk, passing the
 * chunk's index within the combined split; all setup happens in the constructor,
 * which is why {@link #initialize} is empty.
 */
class MyCombineFileRecordReader extends RecordReader<InputSplitFile, Text> {
	private static final Log LOG = LogFactory
			.getLog(MyCombineFileRecordReader.class);

	private CompressionCodecFactory compressionCodecs = null;
	private long start; // first byte of this chunk (nudged past a partial line)
	private long pos; // current byte position within the file
	private long end; // one past the last byte of this chunk
	private Path path; // the file this chunk belongs to
	private LineReader in;
	private int maxLineLength; // lines longer than this are skipped, not returned
	private InputSplitFile key = null;
	private Text value = null;

	/**
	 * Opens chunk {@code index} of {@code split} and positions the stream at the
	 * first full line.
	 *
	 * @param split   the combined split holding paths/offsets/lengths per chunk
	 * @param context task context, supplies the job configuration
	 * @param index   which chunk of the split this reader handles
	 * @throws IOException if the file cannot be opened or seeked
	 */
	public MyCombineFileRecordReader(CombineFileSplit split,
			TaskAttemptContext context, Integer index) throws IOException {
		Configuration job = context.getConfiguration();
		this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
				Integer.MAX_VALUE);
		this.path = split.getPath(index);
		this.start = split.getOffset(index);
		this.end = start + split.getLength(index);
		compressionCodecs = new CompressionCodecFactory(job);
		final CompressionCodec codec = compressionCodecs.getCodec(this.path);
		boolean skipFirstLine = false;

		FileSystem fs = path.getFileSystem(job);
		FSDataInputStream fileIn = fs.open(split.getPath(index));
		if (codec != null) {
			// Compressed input cannot be seeked mid-stream: read it end to end.
			in = new LineReader(codec.createInputStream(fileIn), job);
			end = Long.MAX_VALUE;
		} else {
			if (start != 0) {
				// A chunk that starts mid-file may begin mid-line; back up one
				// byte and discard the partial line (the previous chunk's
				// reader consumes that line in full).
				skipFirstLine = true;
				--start;
				fileIn.seek(start);
			}
			in = new LineReader(fileIn, job);
		}
		if (skipFirstLine) // skip first line and re-establish "startOffset".
		{
			start += in.readLine(new Text(), 0,
					(int) Math.min((long) Integer.MAX_VALUE, end - start));
		}
		this.pos = start;
	}

	/** No-op: all initialization is done in the constructor (see class note). */
	@Override
	public void initialize(InputSplit genericSplit, TaskAttemptContext context)
			throws IOException, InterruptedException {
	}

	/**
	 * Advances to the next line. The key carries the file name plus the byte
	 * offset where the line starts; the value is the line text. Lines longer
	 * than {@code maxLineLength} are logged and skipped. Returns {@code false}
	 * (and nulls key/value) at end of chunk.
	 */
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		if (key == null) {
			key = new InputSplitFile();
			// NOTE(review): uses the bare file name, not the full path — files
			// with identical names in different directories are ambiguous.
			key.setFileName(path.getName());
		}
		key.setOffset(pos);
		if (value == null) {
			value = new Text();
		}
		int newSize = 0;
		while (pos < end) {
			newSize = in.readLine(value, maxLineLength,
					Math.max((int) Math.min(Integer.MAX_VALUE, end - pos),
							maxLineLength));
			if (newSize == 0) {
				// End of stream.
				break;
			}
			pos += newSize;
			if (newSize < maxLineLength) {
				// Got a complete line within the length limit.
				break;
			}

			// line too long. try again
			LOG.info("Skipped line of size " + newSize + " at pos "
					+ (pos - newSize));
		}
		if (newSize == 0) {
			key = null;
			value = null;
			return false;
		} else {
			return true;
		}
	}

	@Override
	public InputSplitFile getCurrentKey() throws IOException,
			InterruptedException {
		return key;
	}

	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		return value;
	}

	/** Fraction of this chunk consumed; 0 for an empty chunk. */
	@Override
	public float getProgress() throws IOException, InterruptedException {
		if (start == end) {
			return 0.0f;
		} else {
			return Math.min(1.0f, (pos - start) / (float) (end - start));
		}
	}

	/** Closes the line reader (which closes the underlying file stream). */
	@Override
	public void close() throws IOException {
		if (in != null)
			in.close();
	}

}

/**
 * Map input key pairing a source file name with the byte offset of a line
 * within that file. Serialized as a long offset followed by the file name;
 * ordered by file name first, then by offset.
 */
class InputSplitFile implements WritableComparable<InputSplitFile> {

	private long offset; // byte offset of the line within the file
	private String fileName; // bare file name of the source file

	public long getOffset() {
		return offset;
	}

	public void setOffset(long offset) {
		this.offset = offset;
	}

	public String getFileName() {
		return fileName;
	}

	public void setFileName(String fileName) {
		this.fileName = fileName;
	}

	public void readFields(DataInput in) throws IOException {
		this.offset = in.readLong();
		this.fileName = Text.readString(in);
	}

	public void write(DataOutput out) throws IOException {
		out.writeLong(offset);
		Text.writeString(out, fileName);
	}

	public int compareTo(InputSplitFile that) {
		int byName = this.fileName.compareTo(that.fileName);
		if (byName != 0) {
			return byName;
		}
		// Compare offsets directly: the old Math.signum((double)(a - b)) could
		// overflow the long subtraction and loses precision beyond 2^53.
		return this.offset < that.offset
				? -1
				: (this.offset == that.offset ? 0 : 1);
	}

	// Overrides Object.equals (the old equals(InputSplitFile) merely
	// overloaded it, so hash-based collections never called it).
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof InputSplitFile)) {
			return false;
		}
		InputSplitFile that = (InputSplitFile) obj;
		return this.offset == that.offset
				&& (this.fileName == null
						? that.fileName == null
						: this.fileName.equals(that.fileName));
	}

	// Consistent with equals(); the old body ("assert false; return 42")
	// threw under -ea and defeated hashing entirely.
	@Override
	public int hashCode() {
		int result = fileName == null ? 0 : fileName.hashCode();
		return 31 * result + (int) (offset ^ (offset >>> 32));
	}
}





  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值