Compressing Existing Files on HDFS with Java

@羲凡 (just to live a better life)

Files on HDFS were getting too large and taking up too much disk space, so to save disk, some of the historical files are compressed. The code is given directly below.

package aaronJava;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Date;

public class CodecAaron {
    // Compress a file with one of: BZip2Codec, GzipCodec, Lz4Codec, SnappyCodec
    public static void compress(String inpath, String codecClassName, String output) throws Exception {
        Class<?> codecClass = Class.forName(codecClassName);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
        // Output path of the compressed file
        FSDataOutputStream outputStream = fs.create(new Path(output));
        // Input path of the file to be compressed
        FSDataInputStream in = fs.open(new Path(inpath));
        // Wrap the raw output stream in the codec's compression stream
        CompressionOutputStream out = codec.createOutputStream(outputStream);
        IOUtils.copyBytes(in, out, conf);
        IOUtils.closeStream(in);
        IOUtils.closeStream(out);
    }

    // Use the codec inferred from the file extension to decompress the file
    public static void uncompress(String uri) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), conf);

        Path inputPath = new Path(uri);
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(inputPath);
        if (codec == null) {
            System.out.println("no codec found for " + uri);
            System.exit(1);
        }
        String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
        InputStream in = null;
        OutputStream out = null;
        try {
            in = codec.createInputStream(fs.open(inputPath));
            out = fs.create(new Path(outputUri));
            IOUtils.copyBytes(in, out, conf);
        } finally {
            IOUtils.closeStream(out);
            IOUtils.closeStream(in);
        }
    }

    public static void main(String[] args) throws Exception {
        System.out.println("=============[" + new Date() + "]"+args[0]+" begin !!!"+"=============");
        /*args = new String[]{
                "compress","hdfs://ns/wordcount/input/wc20180705",
                "BZip2Codec","hdfs://ns/wordcount/input/wc20180705.bz2"
        };*/
        if (args[0].equals("compress")) {
            compress(args[1], "org.apache.hadoop.io.compress." + args[2], args[3]);
        } else if (args[0].equals("uncompress")) {
            uncompress(args[1]);
        } else {
            System.err.println("Error!\n usage: yarn jar CodecAaron.jar aaronJava.CodecAaron" +
                    " [compress] [filename] [compress type] [outpath]");
            System.err.println("\t or [uncompress] [filename]");
            return;
        }
        System.out.println("=============[" + new Date() + "]"+args[0]+" over !!!"+"=============");
    }
}
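
One note on codec choice: BZip2Codec and GzipCodec work with the stock Hadoop jars, while, depending on the Hadoop version, SnappyCodec and Lz4Codec may need the native Hadoop libraries. A quick way to check what the cluster has loaded:

hadoop checknative -a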

Package the class into a jar (CodecAaron.jar).
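
A single file can then be compressed or decompressed straight from the command line. A quick sketch reusing the sample paths from the commented-out args above (the hdfs://ns namespace comes from that sample; adjust to your cluster):

yarn jar ./CodecAaron.jar aaronJava.CodecAaron compress hdfs://ns/wordcount/input/wc20180705 BZip2Codec hdfs://ns/wordcount/input/wc20180705.bz2
yarn jar ./CodecAaron.jar aaronJava.CodecAaron uncompress hdfs://ns/wordcount/input/wc20180705.bz2
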
For batch compression of a whole folder, create the script compress-bz2.sh:

#!/bin/bash
# Compress every non-empty file under /Data/<input> into /DataCompress/<input> as .bz2
input=$1
Folder=/Data/$input
ResFolder=/DataCompress/$input
# Field 5 of `hdfs dfs -ls` is the file size, field 8 is the full path
hdfs dfs -ls $Folder | awk '{print $5 "," $8}' > files0.txt
cat  ./files0.txt | while read line
do
	line1=${line#*,}
	filename=${line1##*/}
	if [[ ${line%%,*} -gt 0 ]]	# size > 0 only: skips directories and the "Found N items" header line
	then 
		echo ${filename}
		resultName=${filename}".bz2"
		yarn jar ./CodecAaron.jar aaronJava.CodecAaron compress $line1 BZip2Codec $ResFolder/$resultName
		echo "-----------------"$resultName" compress over---------------------"
	fi
done
echo "================================"
echo "==========all over!!!==========="
echo "================================"

Run the shell script:

bash compress-bz2.sh test/20180305
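
To spot-check the result, list the output folder and compare sizes before and after compression; the paths below simply mirror the Folder and ResFolder variables from the script:

hdfs dfs -ls /DataCompress/test/20180305
hdfs dfs -du -h /Data/test/20180305 /DataCompress/test/20180305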

====================================================================

@羲凡 (just to live a better life)

If you have any questions about this post, feel free to leave a comment.
