Compressing the data Flink writes to HDFS into gzip format

The idea is to plug a custom Writer into the HDFS file sink so that the part file's FSDataOutputStream is wrapped in a Hadoop CompressionOutputStream (a GzipCodec here); every part file then lands on HDFS already gzip-compressed. The classes below implement that writer.

BaseRow.class

import java.io.Serializable;

/**
 * Holds the partition directory and the data to be written out.
 */
public class BaseRow implements Serializable {
    /**
     * Partition directory
     */
    private String partPath;
    /**
     * Output data (one record per line)
     */
    private String result;

    public BaseRow() {
    }

    public BaseRow(String partPath, String result) {
        this.partPath = partPath;
        this.result = result;
    }

    public String getPartPath() {
        return partPath;
    }

    public void setPartPath(String partPath) {
        this.partPath = partPath;
    }

    public String getResult() {
        return result;
    }

    public void setResult(String result) {
        this.result = result;
    }

    @Override
    public String toString() {
        return "BaseRow{" +
                "partPath='" + partPath + '\'' +
                ", result='" + result + '\'' +
                '}';
    }
}
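The partPath field is not used by the writer itself; it is meant for the component that chooses the output directory per record. Below is a minimal sketch of such a bucketer for a BucketingSink from Flink's legacy filesystem connector. The class name PartPathBucketer and the fallback to the base path are assumptions for illustration, not part of the original post:

```java
import org.apache.flink.streaming.connectors.fs.Clock;
import org.apache.flink.streaming.connectors.fs.bucketing.Bucketer;
import org.apache.hadoop.fs.Path;

public class PartPathBucketer implements Bucketer<BaseRow> {
    private static final long serialVersionUID = 1L;

    @Override
    public Path getBucketPath(Clock clock, Path basePath, BaseRow element) {
        // No partition directory set: write directly under the sink's base path
        if (element.getPartPath() == null) {
            return basePath;
        }
        // Otherwise nest the partition directory (e.g. "dt=2020-01-01") under the base path
        return new Path(basePath, element.getPartPath());
    }
}
```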

CompressionOutputStreamWrapper.class

import org.apache.hadoop.io.compress.CompressionOutputStream;

import java.io.Serializable;

/**
 * Pairs a {@link CompressionOutputStream} with the number of uncompressed bytes written so far,
 * because {@code CompressionOutputStream} does not expose a position of its own.
 */
public class CompressionOutputStreamWrapper implements Serializable {
    private CompressionOutputStream compressionOutputStream;
    private long pos;

    public CompressionOutputStreamWrapper() {
    }

    public CompressionOutputStreamWrapper(CompressionOutputStream compressionOutputStream, long pos) {
        this.compressionOutputStream = compressionOutputStream;
        this.pos = pos;
    }

    public CompressionOutputStream getCompressionOutputStream() {
        return compressionOutputStream;
    }

    public void setCompressionOutputStream(CompressionOutputStream compressionOutputStream) {
        this.compressionOutputStream = compressionOutputStream;
    }

    public long getPos() {
        return pos;
    }

    public void setPos(long pos) {
        this.pos = pos;
    }

    @Override
    public String toString() {
        return "CompressionOutputStreamWrapper{" +
                "compressionOutputStream=" + compressionOutputStream +
                ", pos=" + pos +
                '}';
    }
}

MyStreamWriterBase.class

import org.apache.flink.streaming.connectors.fs.Writer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.IOException;

public abstract class MyStreamWriterBase<T> implements Writer<T> {

    private static final long serialVersionUID = 2L;

    /**
     * The {@code FSDataOutputStream} for the current part file.
     */
    private transient FSDataOutputStream outStream;
    /** Compressed view over {@code outStream}; all records are written through this stream. */
    private transient CompressionOutputStream compressionOutputStream;

    private transient CompressionOutputStreamWrapper compressionOutputStreamWrapper;

    private boolean syncOnFlush;

    /** Fully qualified class name of the Hadoop compression codec, e.g. org.apache.hadoop.io.compress.GzipCodec. */
    private String compressionCodec;

    public MyStreamWriterBase() {
    }

    public MyStreamWriterBase(String compressionCodec) {
        this.compressionCodec = compressionCodec;
    }

    protected MyStreamWriterBase(MyStreamWriterBase<T> other) {
        this.syncOnFlush = other.syncOnFlush;
        this.compressionCodec = other.compressionCodec;
    }

    /**
     * Controls whether to sync {@link FSDataOutputStream} on flush.
     */
    public void setSyncOnFlush(boolean syncOnFlush) {
        this.syncOnFlush = syncOnFlush;
    }

    /**
     * Opens the part file and wraps its output stream with the configured compression codec.
     */
    @Override
    public void open(FileSystem fs, Path path) throws IOException {

        if (outStream != null) {
            throw new IllegalStateException("Writer has already been opened");
        }

        outStream = fs.create(path, false);

        Class<?> codecClass;
        try {
            codecClass = Class.forName(compressionCodec);
        } catch (ClassNotFoundException e) {
            // Fail the open instead of swallowing the error and hitting an NPE in ReflectionUtils below
            throw new IOException("Compression codec class not found: " + compressionCodec, e);
        }

        Configuration conf = fs.getConf();

        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        compressionOutputStream = codec.createOutputStream(outStream);

        compressionOutputStreamWrapper = new CompressionOutputStreamWrapper();
        compressionOutputStreamWrapper.setCompressionOutputStream(compressionOutputStream);
        compressionOutputStreamWrapper.setPos(0);
    }

    @Override
    public long flush() throws IOException {
        if (outStream == null) {
            throw new IllegalStateException("Writer is not open");
        }
        // Always flush the compression buffer, then sync or flush the underlying HDFS stream
        // (mirrors Flink's StreamWriterBase, where syncOnFlush chooses hsync over hflush).
        compressionOutputStream.flush();
        if (syncOnFlush) {
            outStream.hsync();
        } else {
            outStream.hflush();
        }

        return compressionOutputStreamWrapper.getPos();
    }

    @Override
    public long getPos() throws IOException {
        if (outStream == null) {
            throw new IllegalStateException("Writer is not open");
        }

        return compressionOutputStreamWrapper.getPos();
    }

    @Override
    public void close() throws IOException {
        if (compressionOutputStream != null) {
            flush();
            compressionOutputStream.close();
            compressionOutputStream = null;
        }

        if (outStream != null) {
            outStream.close();
            outStream = null;
        }
    }

    public boolean isSyncOnFlush() {
        return syncOnFlush;
    }


    protected CompressionOutputStream getCompressionStream() {
        if (compressionOutputStream == null) {
            throw new IllegalStateException("Output stream has not been opened");
        }
        return compressionOutputStream;
    }

    public CompressionOutputStreamWrapper getCompressionOutputStreamWrapper() {
        return compressionOutputStreamWrapper;
    }

    public void setCompressionOutputStreamWrapper(CompressionOutputStreamWrapper compressionOutputStreamWrapper) {
        this.compressionOutputStreamWrapper = compressionOutputStreamWrapper;
    }
}

MyStringWriter.class

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionOutputStream;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

public class MyStringWriter<T> extends MyStreamWriterBase<T> {
    private static final long serialVersionUID = 1L;

    private String charsetName;

    private transient Charset charset;

    /**
     * Creates a new {@code MyStringWriter} that uses the {@code "UTF-8"} charset to convert
     * strings to bytes. Note that this constructor does not set a compression codec; use
     * {@link #MyStringWriter(String, String)} when compressed output is required.
     */
    public MyStringWriter() {
        this("UTF-8");
    }

    public MyStringWriter(String compressionCodec, String charsetName) {
        super(compressionCodec);
        if(StringUtils.isBlank(charsetName)) {
            this.charsetName = "UTF-8";
        } else {
            this.charsetName = charsetName;
        }
    }

    /**
     * Creates a new {@code MyStringWriter} that uses the given charset to convert
     * strings to bytes. Like the no-arg constructor, it does not set a compression codec.
     *
     * @param charsetName Name of the charset to be used, must be valid input for {@code Charset.forName(charsetName)}
     */
    public MyStringWriter(String charsetName) {
        this.charsetName = charsetName;
    }

    protected MyStringWriter(MyStringWriter<T> other) {
        super(other);
        this.charsetName = other.charsetName;
    }

    @Override
    public void open(FileSystem fs, Path path) throws IOException {
        super.open(fs, path);

        try {
            this.charset = Charset.forName(charsetName);
        } catch (IllegalCharsetNameException e) {
            throw new IOException("The charset " + charsetName + " is not valid.", e);
        } catch (UnsupportedCharsetException e) {
            throw new IOException("The charset " + charsetName + " is not supported.", e);
        }
    }

    @Override
    public void write(T element) throws IOException {
        BaseRow baseRow = (BaseRow) element;

        CompressionOutputStreamWrapper compressionOutputStreamWrapper = getCompressionOutputStreamWrapper();

        CompressionOutputStream outputStream = compressionOutputStreamWrapper.getCompressionOutputStream();
        byte[] bytes = baseRow.getResult().getBytes(charset);
        outputStream.write(bytes);
        outputStream.write('\n');
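        // Track the position in uncompressed bytes; the sink reads it back through getPos()/flush()
        // to decide when to roll over to a new part file.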
        long pos = compressionOutputStreamWrapper.getPos();
        pos += bytes.length + 1;
        compressionOutputStreamWrapper.setPos(pos);
    }

    @Override
    public MyStringWriter<T> duplicate() {
        return new MyStringWriter<>(this);
    }

    String getCharsetName() {
        return charsetName;
    }
}
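Because the codec is passed in as a plain class name, any Hadoop CompressionCodec on the classpath can be used; gzip is simply the codec this post targets. For example (both codec classes ship with standard Hadoop):

```java
// Gzip output (the case this post is about)
MyStringWriter<BaseRow> gzipWriter = new MyStringWriter<>("org.apache.hadoop.io.compress.GzipCodec", "UTF-8");

// Bzip2 output, if splittable files matter more than write speed
MyStringWriter<BaseRow> bzip2Writer = new MyStringWriter<>("org.apache.hadoop.io.compress.BZip2Codec", "UTF-8");
```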

Usage

DataStream<BaseRow> dataStream = rowDataStream
        .map((MapFunction<Row, BaseRow>) row -> {
            try {
                return new BaseRow(null, (String) row.getField(0));
            } catch (Exception ex) {
                return null;
            }
        })
        // Drop the nulls produced by the catch branch so the writer never sees them
        .filter(value -> value != null);

sink.setWriter(new MyStringWriter<>("org.apache.hadoop.io.compress.GzipCodec", null));
dataStream.addSink(sink);
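For context, here is a fuller sketch of how the pieces fit together with the legacy BucketingSink (from flink-connector-filesystem, which the Writer interface above belongs to). The HDFS path, partition value, batch size, and job name are placeholders, and PartPathBucketer is the bucketer sketched after BaseRow above, not something from the original post:

```java
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.types.Row;

public class GzipToHdfsJob {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Stand-in source; in the real job the Row stream comes from upstream operators
        DataStream<Row> rowDataStream = env.fromElements(Row.of("first line"), Row.of("second line"));

        DataStream<BaseRow> dataStream = rowDataStream
                .map((MapFunction<Row, BaseRow>) row -> new BaseRow("dt=2020-01-01", (String) row.getField(0)));

        // Placeholder HDFS path; part files written under it will be gzip-compressed text
        BucketingSink<BaseRow> sink = new BucketingSink<>("hdfs://namenode:8020/tmp/flink-gzip-out");
        sink.setBucketer(new PartPathBucketer());   // the bucketer sketched after BaseRow above
        sink.setWriter(new MyStringWriter<>("org.apache.hadoop.io.compress.GzipCodec", "UTF-8"));
        sink.setBatchSize(128L * 1024 * 1024);      // roll part files at roughly 128 MB of uncompressed data

        dataStream.addSink(sink);
        env.execute("write gzip to hdfs");
    }
}
```

Each part file produced this way is a plain gzip stream, so it can be inspected with `hdfs dfs -text` or copied locally and decompressed with `gunzip`.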