Hadoop uses compression in two places: data stored on HDFS can be compressed to save storage space, and data exchanged between cluster nodes can be compressed to make better use of network bandwidth. In Java, all input and output is performed through streams: an object that reads a byte sequence is an input stream, and an object that writes a byte sequence is an output stream. Files, network connections, and in-memory buffers can all act as stream sources or sinks.
1. File-to-file compression
package com.kevin.hadoop;

import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CprsF2F {
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: CprsF2F <codec_class> <src> <dst>");
            System.exit(2);
        }
        // The codec class is not known until runtime, so load it by name
        Class<?> codecClass = Class.forName(args[0]);
        Configuration conf = new Configuration();
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        InputStream in = null;
        OutputStream out = null;
        FileSystem fs = FileSystem.get(URI.create(args[1]), conf);
        try {
            // Read the source file and write it through the codec's compressing stream
            in = fs.open(new Path(args[1]));
            out = codec.createOutputStream(fs.create(new Path(args[2])));
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}
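For example, assuming the class is packaged into a jar named cprs.jar (a hypothetical name) and /user/kevin/in.txt already exists on HDFS, the following command gzip-compresses it:

hadoop jar cprs.jar com.kevin.hadoop.CprsF2F org.apache.hadoop.io.compress.GzipCodec /user/kevin/in.txt /user/kevin/in.txt.gz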
The two key classes here are ReflectionUtils and CompressionCodec: ReflectionUtils.newInstance instantiates the codec class named on the command line, and the resulting CompressionCodec wraps the raw HDFS output stream in a compressing stream via createOutputStream.
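When the file name already carries a conventional extension, the codec does not have to be passed in at all; Hadoop's CompressionCodecFactory maps extensions such as .gz or .bz2 to their codecs. A minimal sketch (the class name and sample path are illustrative):

package com.kevin.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CodecLookup {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Looks up the codec registered for the path's extension, or null if none matches
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(new Path("logs/sample.txt.gz"));
        System.out.println(codec == null ? "no codec found" : codec.getClass().getName());
    }
}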
2. Compressing standard input to a file

package com.kevin.hadoop;

import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class CprsIn2F {
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: CprsIn2F <codec_class> <dst>");
            System.exit(2);
        }
        Class<?> codecClass = Class.forName(args[0]);
        Configuration conf = new Configuration();
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        OutputStream out = null;
        FileSystem fs = FileSystem.get(URI.create(args[1]), conf);
        try {
            // Copy stdin through the compressing stream; the final 'false' tells
            // copyBytes not to close the streams itself (the finally block does that)
            out = codec.createOutputStream(fs.create(new Path(args[1])));
            IOUtils.copyBytes(System.in, out, 4096, false);
        } finally {
            IOUtils.closeStream(out);
        }
    }
}
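To try it, pipe some text through standard input, again assuming the hypothetical jar name from above:

echo "hello hadoop" | hadoop jar cprs.jar com.kevin.hadoop.CprsIn2F org.apache.hadoop.io.compress.GzipCodec /user/kevin/stdin.gz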
3. File-to-file decompression

package com.kevin.hadoop;

import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class DcprsF2F {
    public static void main(String[] args) throws Exception {
        if (args.length != 3) {
            System.err.println("Usage: DcprsF2F <codec_class> <src> <dst>");
            System.exit(2);
        }
        Class<?> codecClass = Class.forName(args[0]);
        Configuration conf = new Configuration();
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        InputStream in = null;
        OutputStream out = null;
        FileSystem fs = FileSystem.get(URI.create(args[1]), conf);
        try {
            // Wrap the compressed source in a decompressing stream, then copy
            // the decompressed bytes to the destination file
            in = codec.createInputStream(fs.open(new Path(args[1])), codec.createDecompressor());
            out = fs.create(new Path(args[2]));
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(in);
            IOUtils.closeStream(out);
        }
    }
}
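This reverses step 1; assuming the same hypothetical jar and the .gz file produced earlier:

hadoop jar cprs.jar com.kevin.hadoop.DcprsF2F org.apache.hadoop.io.compress.GzipCodec /user/kevin/in.txt.gz /user/kevin/in_restored.txt

Note that the codec class must match the format the file was actually compressed with; this program does not infer it from the file extension.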