1 Compressing and Decompressing Data Streams
CompressionCodec provides two methods that make it easy to compress or decompress data. To compress data being written to an output stream, call createOutputStream(OutputStream out) to obtain a CompressionOutputStream, which writes the data to the underlying stream in compressed form. Conversely, to decompress data coming from an input stream, call createInputStream(InputStream in) to obtain a CompressionInputStream, which lets you read uncompressed data from the underlying stream.
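As a minimal, self-contained sketch of these two calls (the class name RoundTrip and the in-memory byte-array streams are just for illustration), the following compresses a short string through GzipCodec and reads it back:

package com.da.compress;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class RoundTrip {
    public static void main(String[] args) throws Exception {
        CompressionCodec codec = ReflectionUtils.newInstance(GzipCodec.class, new Configuration());
        // Compress: wrap a byte-array sink with createOutputStream
        ByteArrayOutputStream sink = new ByteArrayOutputStream();
        CompressionOutputStream cos = codec.createOutputStream(sink);
        cos.write("hello compression".getBytes("UTF-8"));
        cos.finish(); // flush the compressed trailer without closing the underlying stream
        // Decompress: wrap the compressed bytes with createInputStream
        CompressionInputStream cis = codec.createInputStream(new ByteArrayInputStream(sink.toByteArray()));
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        IOUtils.copyBytes(cis, restored, 4096, true);
        System.out.println(restored.toString("UTF-8")); // prints: hello compression
    }
}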
We will test the following compression formats:
DEFLATE org.apache.hadoop.io.compress.DefaultCodec
gzip org.apache.hadoop.io.compress.GzipCodec
bzip2 org.apache.hadoop.io.compress.BZip2Codec
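Note that getDefaultExtension() returns the conventional file extension for each codec: .deflate for DefaultCodec, .gz for GzipCodec, and .bz2 for BZip2Codec. The compress() method below relies on this to name its output file, and CompressionCodecFactory uses the same extensions to pick a codec when decompressing.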
Test class:
package com.da.compress;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;

public class TestCompress {

    public static void main(String[] args) throws Exception {
        // Compression tests
        // compress("e:/hello.txt", "org.apache.hadoop.io.compress.BZip2Codec");
        // compress("e:/hello.txt", "org.apache.hadoop.io.compress.GzipCodec");
        // Decompression test
        decompress("e:/hello.txt.bz2");
    }

    // Compress a file with the codec named by its fully qualified class name
    private static void compress(String fileName, String method) throws Exception {
        // Open the input stream
        FileInputStream fis = new FileInputStream(new File(fileName));
        Class<?> codecClass = Class.forName(method);
        CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, new Configuration());
        // Open the output stream; append the codec's default extension (e.g. .bz2)
        FileOutputStream fos = new FileOutputStream(new File(fileName + codec.getDefaultExtension()));
        CompressionOutputStream cos = codec.createOutputStream(fos);
        // Copy the stream
        IOUtils.copyBytes(fis, cos, 1024 * 1024, false);
        // Release resources
        fis.close();
        cos.close();
        fos.close();
    }

    // Decompress a file whose codec is inferred from its extension
    private static void decompress(String fileName) throws Exception {
        // Check that the extension maps to a supported codec
        CompressionCodecFactory factory = new CompressionCodecFactory(new Configuration());
        CompressionCodec codec = factory.getCodec(new Path(fileName));
        if (codec == null) {
            System.out.println("Unsupported codec for file: " + fileName);
            return;
        }
        // Open the input stream
        CompressionInputStream cis = codec.createInputStream(new FileInputStream(new File(fileName)));
        // Open the output stream
        FileOutputStream fos = new FileOutputStream(new File(fileName + ".decode"));
        // Copy the stream
        IOUtils.copyBytes(cis, fos, 1024 * 1024, false);
        // Release resources
        cis.close();
        fos.close();
    }
}
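For a file e:/hello.txt, compress(..., "org.apache.hadoop.io.compress.BZip2Codec") writes e:/hello.txt.bz2 next to the original, and decompress("e:/hello.txt.bz2") then restores the content to e:/hello.txt.bz2.decode. Because CompressionCodecFactory chooses the codec from the file extension alone, a file with an unrecognized suffix falls into the codec == null branch.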
2 Compressing Map Output
Even if the input and output files of your MapReduce job are uncompressed, you can still compress the intermediate output of the map tasks. Because that output is written to disk and transferred over the network to the reduce nodes, compressing it can significantly improve performance. Enabling it takes only two configuration properties; the driver below shows how to set them:
1 Modify the driver (the codecs supported by the provided Hadoop build include BZip2Codec and DefaultCodec):
package com.da.compress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordcountDriver {

    public static void main(String[] args) throws Exception {
        args = new String[] { "e:/mrinput", "e:/mrout" }; // local test paths
        // 1 Create the job configuration
        Configuration configuration = new Configuration();
        // Enable compression of the map output
        configuration.setBoolean("mapreduce.map.output.compress", true);
        // Set the codec used for the map output
        configuration.setClass("mapreduce.map.output.compress.codec", BZip2Codec.class, CompressionCodec.class);
        Job job = Job.getInstance(configuration);
        // 2 Locate the jar via the driver class
        job.setJarByClass(WordcountDriver.class);
        // 3 Wire up the custom mapper and reducer
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7 Submit the job
        // job.submit();
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
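Hard-coding the two properties is convenient for this local test, but they can equally be supplied per run. If the driver were adapted to parse its arguments with ToolRunner/GenericOptionsParser, a launch along these lines (the jar name and paths are placeholders) would have the same effect:

hadoop jar wc.jar com.da.compress.WordcountDriver -Dmapreduce.map.output.compress=true -Dmapreduce.map.output.compress.codec=org.apache.hadoop.io.compress.BZip2Codec /mrinput /mrout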
2 The Mapper is unchanged:
package com.da.compress;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private Text k = new Text();
    private IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1 Convert the line to a String
        String line = value.toString();
        // 2 Split it into words
        String[] words = line.split(" ");
        // 3 Emit each word to the next stage
        for (String word : words) {
            k.set(word);
            context.write(k, v);
        }
    }
}
3 The Reducer is unchanged:
package com.da.compress;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // 1 Sum the counts for this word
        int sum = 0;
        for (IntWritable count : values) {
            sum += count.get();
        }
        // 2 Emit the total count
        v.set(sum);
        context.write(key, v);
    }
}
3 Compressing Reduce Output
Based on the wordcount example.
1 Modify the driver:
package com.da.compress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordcountDriver {

    public static void main(String[] args) throws Exception {
        args = new String[] { "e:/mrinput", "e:/mrout" }; // local test paths
        // 1 Create the job configuration
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // 2 Locate the jar via the driver class
        job.setJarByClass(WordcountDriver.class);
        // 3 Wire up the custom mapper and reducer
        job.setMapperClass(WordcountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4 Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5 Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6 Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Enable compression of the reduce output
        FileOutputFormat.setCompressOutput(job, true);
        // Set the compression codec
        FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        // FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        // FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        // 7 Submit the job
        // job.submit();
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
2 The Mapper and Reducer are unchanged.
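With BZip2Codec configured as above, the output in e:/mrout is written as part-r-00000.bz2 (assuming the default single reducer). It can be read back with the decompress() helper from section 1, or with hadoop fs -text, which decompresses files with recognized extensions automatically, e.g. hadoop fs -text file:///e:/mrout/part-r-00000.bz2.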