@羲凡——只为了更好的活着
Java 压缩HDFS上已有文件
hdfs上的文件太大,占用太多磁盘,为了节省磁盘,将历史的一些文件压缩。下面直接上代码。
package aaronJava;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Date;
public class CodecAaron {
    /**
     * Compresses an existing HDFS file with the given Hadoop codec.
     * Supported codec class names include BZip2Codec, GzipCodec, Lz4Codec, SnappyCodec
     * (pass the fully-qualified name, e.g. "org.apache.hadoop.io.compress.BZip2Codec").
     *
     * @param inpath         HDFS path of the file to compress
     * @param codecClassName fully-qualified codec class name
     * @param output         HDFS path of the compressed output file
     * @throws Exception if the codec class cannot be loaded or any HDFS I/O fails
     */
    public static void compress(String inpath, String codecClassName, String output) throws Exception {
        Class<?> codecClass = Class.forName(codecClassName);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        // try-with-resources: the original closed streams only on the happy path,
        // leaking both HDFS streams if copyBytes threw. Closing the
        // CompressionOutputStream also flushes/finishes the compressed data.
        try (FSDataInputStream in = fs.open(new Path(inpath));
             FSDataOutputStream fileOut = fs.create(new Path(output));
             CompressionOutputStream out = codec.createOutputStream(fileOut)) {
            IOUtils.copyBytes(in, out, conf);
        }
    }

    /**
     * Decompresses an HDFS file, inferring the codec from the file extension
     * (e.g. ".bz2" -> BZip2Codec). The output file is the input path with the
     * codec's default extension removed. Exits the JVM with status 1 if no
     * codec matches the extension (kept for CLI compatibility).
     *
     * @param uri HDFS URI of the compressed file
     * @throws IOException if any HDFS I/O fails
     */
    public static void uncompress(String uri) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.err.println("no codec found for " + uri);
            System.exit(1);
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        // try-with-resources replaces the manual try/finally close pair
        try (InputStream in = codec.createInputStream(fs.open(inputPath));
             OutputStream out = fs.create(new Path(outputUri))) {
            IOUtils.copyBytes(in, out, conf);
        }
    }

    /**
     * CLI entry point.
     * Usage: compress [filename] [compress type] [outpath]
     *    or: uncompress [filename]
     */
    public static void main(String[] args) throws Exception {
        // Guard first: the original indexed args[0] before any check and threw
        // ArrayIndexOutOfBoundsException instead of printing usage.
        if (args.length < 2) {
            printUsage();
            return;
        }
        System.out.println("=============[" + new Date() + "]" + args[0] + " begin !!!" + "=============");
        /*args = new String[]{
                "compress","hdfs://ns/wordcount/input/wc20180705",
                "BZip2Codec","hdfs://ns/wordcount/input/wc20180705.bz2"
        };*/
        if (args[0].equals("compress")) {
            if (args.length < 4) {
                printUsage();
                return;
            }
            compress(args[1], "org.apache.hadoop.io.compress." + args[2], args[3]);
        } else if (args[0].equals("uncompress")) {
            uncompress(args[1]);
        } else {
            printUsage();
            return;
        }
        System.out.println("=============[" + new Date() + "]" + args[0] + " over !!!" + "=============");
    }

    // One-line helper so the (typo-fixed) usage text lives in a single place.
    private static void printUsage() {
        System.err.println("Error!\n usage: yarn jar CodecAaron.jar aaronJava.CodecAaron" +
                " [compress] [filename] [compress type] [outpath]");
        System.err.println("\t or [uncompress] [filename] ");
    }
}
打包成jar包(CodecAaron.jar)
制作脚本 compress-bz2.sh
#!/bin/bash
# Compress every non-empty file under /Data/<subdir> on HDFS to bzip2,
# writing results under /DataCompress/<subdir>.
# Usage: bash compress-bz2.sh <subdir>   e.g. bash compress-bz2.sh test/20180305
input=$1
Folder=/Data/$input
ResFolder=/DataCompress/$input
# Emit "size,path" per file ($5 = size, $8 = path in `hdfs dfs -ls` output).
hdfs dfs -ls "$Folder" | awk '{print $5 "," $8}' > files0.txt
# read -r: do not let backslashes in paths be mangled
while read -r line
do
    line1=${line#*,}           # full HDFS path (strip leading "size,")
    filename=${line1##*/}      # basename of the file
    # -gt: numeric comparison; the original `>` compared strings
    # lexicographically, which is wrong for file sizes
    if [[ ${line%%,*} -gt 0 ]]
    then
        echo "$filename"
        resultName=$filename".bz2"
        yarn jar ./CodecAaron.jar aaronJava.CodecAaron compress "$line1" BZip2Codec "$ResFolder/$resultName"
        echo "-----------------$resultName compress over---------------------"
    fi
done < files0.txt
echo "================================"
echo "==========all over!!!==========="
echo "================================"
执行shell脚本
bash compress-bz2.sh test/20180305
====================================================================
@羲凡——只为了更好的活着
若对博客中有任何问题,欢迎留言交流