CompressionCodec有两个方法可以用于轻松地压缩或解压缩数据。要想对正在被写入一个输出流的数据进行压缩,我们可以使用createOutputStream(OutputStream out)方法创建一个CompressionOutputStream,将数据以压缩格式写入底层的流。相反,要想对从输入流读取而来的数据进行解压缩,则调用createInputStream(InputStream in)方法,获得一个CompressionInputStream,从而从底层的流读取未压缩的数据。
测试一下如下压缩方式:
DEFLATE | org.apache.hadoop.io.compress.DefaultCodec |
gzip | org.apache.hadoop.io.compress.GzipCodec |
bzip2 | org.apache.hadoop.io.compress.BZip2Codec |
package com.itstar.mr.wc0908.compress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.*;
/**
* @author 李庆
* 2019-11-13
*/
public class CompressText {

    /** Buffer size, in bytes, used when copying between streams (5 MiB). */
    private static final int BUFFER_SIZE = 5 * 1024 * 1024;

    /**
     * Compresses a local file with the Hadoop codec named by {@code method}.
     *
     * <p>The codec's default extension (as reported by
     * {@link CompressionCodec#getDefaultExtension()}) is appended to {@code outputfile}
     * to form the name of the file that is written.
     *
     * @param inputfile  path of the local file to compress
     * @param method     fully-qualified class name of the {@link CompressionCodec} to use
     * @param outputfile path of the compressed file, without the codec extension
     * @throws IOException            if reading, writing or closing any stream fails
     * @throws ClassNotFoundException if {@code method} does not name a loadable class
     * @throws ClassCastException     if {@code method} names a class that is not a
     *                                {@link CompressionCodec}
     */
    public static void compress(String inputfile, String method, String outputfile)
            throws IOException, ClassNotFoundException {
        // Resolve and instantiate the codec before opening any file, so that a bad
        // class name cannot leave a file handle open.
        Class<? extends CompressionCodec> codecClass =
                Class.forName(method).asSubclass(CompressionCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, new Configuration());
        File target = new File(outputfile + codec.getDefaultExtension());

        // try-with-resources closes every stream on both success and failure, and
        // propagates errors raised while closing the compressed stream instead of
        // swallowing them.
        try (InputStream in = new FileInputStream(new File(inputfile));
             OutputStream raw = new FileOutputStream(target);
             CompressionOutputStream out = codec.createOutputStream(raw)) {
            IOUtils.copyBytes(in, out, BUFFER_SIZE);
        }
    }

    /**
     * Decompresses a local file, choosing the codec from the input file's name via
     * {@link CompressionCodecFactory#getCodec(Path)}.
     *
     * @param inputfile  path of the compressed input file
     * @param outputfile path of the decompressed file, without extension
     * @param decode     extension (without the leading dot) appended to {@code outputfile}
     * @throws IOException if no codec matches {@code inputfile}, or if reading, writing
     *                     or closing any stream fails
     */
    public static void discompression(String inputfile, String outputfile, String decode)
            throws IOException {
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        CompressionCodec codec = factory.getCodec(new Path(inputfile));
        if (codec == null) {
            // Fail with a descriptive error rather than a NullPointerException below.
            throw new IOException("No compression codec found for file: " + inputfile);
        }

        try (InputStream raw = new FileInputStream(new File(inputfile));
             CompressionInputStream in = codec.createInputStream(raw);
             OutputStream out = new FileOutputStream(new File(outputfile + "." + decode))) {
            IOUtils.copyBytes(in, out, BUFFER_SIZE);
        }
    }

    /**
     * Demo entry point: decompresses a hard-coded sample file.
     *
     * <p>To compress instead, call
     * {@code compress(inputPath, "org.apache.hadoop.io.compress.GzipCodec", outputPathWithoutExtension)}.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException {
        discompression("D:\\大数据\\08-19压缩\\phone_OUT1.bz2", "D:\\大数据\\08-19压缩\\phone111", "txt");
    }
}
下面是运行的结果:
bzip2压缩率高,但压缩时间久