Hive 自定义 InputFormat,用于解析 Hadoop MR 产生的 SequenceFile 文件
注意: Hive 使用的是 MRV1(org.apache.hadoop.mapred)API
1.定义InputFormat
package cn.gitv.bi.log.analysis.io;
import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
/**
 * MRv1 {@link FileInputFormat} that lets Hive read SequenceFiles produced by
 * Hadoop MapReduce jobs. Any directory in the input listing is assumed to be
 * a MapFile and is replaced by its embedded data file. Records are read with
 * a {@link KVTextReader}, which folds key and value into one BytesWritable.
 */
public class SequenceFileKeyInputFormat<K, V> extends FileInputFormat<K, V> {

    public SequenceFileKeyInputFormat() {
        // Never create splits smaller than one sync interval, so every split
        // boundary can be aligned to a SequenceFile sync point.
        setMinSplitSize(SequenceFile.SYNC_INTERVAL);
    }

    /**
     * Lists the job's input files, substituting the {@code data} file for any
     * MapFile directory encountered.
     *
     * @param job job configuration holding the input paths
     * @return the input files, with MapFile directories resolved to data files
     * @throws IOException if the file system cannot be queried
     */
    @SuppressWarnings("deprecation")
    @Override
    protected FileStatus[] listStatus(JobConf job) throws IOException {
        FileStatus[] files = super.listStatus(job);
        for (int i = 0; i < files.length; i++) {
            FileStatus file = files[i];
            if (file.isDir()) { // a directory here is treated as a MapFile
                Path dataFile = new Path(file.getPath(), MapFile.DATA_FILE_NAME);
                FileSystem fs = file.getPath().getFileSystem(job);
                // use the data file inside the MapFile directory
                files[i] = fs.getFileStatus(dataFile);
            }
        }
        return files;
    }

    /**
     * Creates the record reader for one split.
     *
     * @throws IOException if the underlying SequenceFile cannot be opened
     */
    @Override
    @SuppressWarnings("unchecked")
    public RecordReader<K, V> getRecordReader(InputSplit split, JobConf job,
            Reporter reporter) throws IOException {
        reporter.setStatus(split.toString());
        // NOTE(review): KVTextReader is hard-coded, so K/V are effectively
        // Text/BytesWritable; other key/value types need a different reader.
        return (RecordReader<K, V>) new KVTextReader(job, (FileSplit) split);
    }
}
2.定义RecordReader
package cn.gitv.bi.log.analysis.io;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.util.ReflectionUtils;
public abstract class SequenceFileKeyRecordReader<K, V> implements
RecordReader<K, BytesWritable> {
private SequenceFile.Reader in;
private long start;
private long end;
private boolean more = true;
protected Configuration conf;
@SuppressWarnings("deprecation")
public SequenceFileKeyRecordReader(Configuration conf, FileSplit split)
throws IOException {
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
this.in = new SequenceFile.Reader(fs, path, conf);
this.end = split.getStart() + split.getLength();
this.conf = conf;
if (split.getStart() > in.getPosition())
in.sync(split.getStart()); // sync to start
this.start = in.getPosition();
more = start < end;
}
@SuppressWarnings("rawtypes")
public Class getKeyClass() {
return in.getKeyClass();
}
@SuppressWarnings("rawtypes")
public Class getValueClass() {
return in.getValueClass();
}
@SuppressWarnings("unchecked")
public K createKey() {
return (K) ReflectionUtils.newInstance(getKeyClass(), conf);
}
public BytesWritable createValue() {
return new BytesWritable();
}
public float getProgress() throws IOException {
if (end == start) {
return 0.0f;
} else {
return Math.min(1.0f, (in.getPosition() - start)
/ (float) (end - start));
}
}
public synchronized long getPos() throws IOException {
return in.getPosition();
}
protected synchronized void seek(long pos) throws IOException {
in.seek(pos);
}
public synchronized void close() throws IOException {
in.close();
}
public boolean next(K key, BytesWritable value) throws IOException {
if (!more)
return false;
long pos = in.getPosition();
@SuppressWarnings("unchecked")
V trueValue = (V) ReflectionUtils.newInstance(in.getValueClass(), conf);
boolean remaining = in.next((Writable) key, (Writable) trueValue);
if (remaining)
combineKeyValue(key, trueValue, value);
if (pos >= end && in.syncSeen()) {
more = false;
} else {
more = remaining;
}
return more;
}
protected abstract void combineKeyValue(K key, V trueValue,
BytesWritable newValue);
protected void combineKeyValue(Text key, LongWritable trueValue,
BytesWritable newValue) {
// TODO I think we need to use straight bytes--I'm not sure this works?
/*
* StringBuilder builder = new StringBuilder(); builder.append(key);
* builder.append('\001'); builder.append(trueValue.get());
* newValue.set(new BytesWritable(builder.toString().getBytes()) );
*/
}
}
3.定义RecordReader的读取规则,此类继承上述定义的RecordReader
package cn.gitv.bi.log.analysis.io;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
/**
 * Concrete reader for SequenceFiles whose key and value are both {@link Text}.
 * Each record is emitted as the bytes of {@code key + '\001' + value}, where
 * '\001' (^A) is Hive's default field delimiter.
 */
public class KVTextReader extends SequenceFileKeyRecordReader<Text, Text> {

    public KVTextReader(Configuration conf, FileSplit split) throws IOException {
        super(conf, split);
    }

    /**
     * Joins key and value with Hive's default column separator and stores the
     * UTF-8 bytes into {@code newValue}.
     */
    @Override
    protected void combineKeyValue(Text key, Text trueValue,
            BytesWritable newValue) {
        StringBuilder builder = new StringBuilder();
        builder.append(key);
        builder.append('\001'); // Hive default column delimiter (^A)
        builder.append(trueValue);
        // Encode explicitly as UTF-8 (Hadoop Text is UTF-8) instead of the
        // platform default charset, and set the bytes directly without an
        // intermediate BytesWritable allocation.
        byte[] bytes = builder.toString().getBytes(StandardCharsets.UTF_8);
        newValue.set(bytes, 0, bytes.length);
    }

    @Override
    public BytesWritable createValue() {
        return new BytesWritable();
    }
}
注:上述代码只能解析 Hadoop SequenceFile 中 Key 为 Text 类型、Value 为 Text 类型的 SequenceFile 文件。如果要解析其他类型,请自行修改上述第三个代码类,根据情况改为自己需要的 KV 类型。