This post contains my notes on Chapter 4 of Hadoop: The Definitive Guide, mainly my implementations of the example programs in the chapter; some implementations have been modified.
1 Compression
1.1 Compressing standard input to standard output
Create a new class StreamCompressor:
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
public class StreamCompressor {
public static void main(String[] args) {
String codeClassName = args[0];
Class<?> codeClass = null;
try {
codeClass = Class.forName(codeClassName);
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codeClass, new Configuration());
CompressionOutputStream out = null;
try {
out = codec.createOutputStream(System.out);
// copy stdin into the compressed stream without closing it, then finish() to flush the compressed data
IOUtils.copyBytes(System.in, out, 4096, false);
out.finish();
} catch (IOException e) {
e.printStackTrace();
}
}
}
Run the following command to save the compressed standard output as a gzip file. Afterwards a test.gz appears in the directory; unpacking it yields a text file:
echo Test | hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.io.StreamCompressor org.apache.hadoop.io.compress.GzipCodec > test.gz
1.2 Decompressing a compressed file
Create a new class FileDecompressor:
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
public class FileDecompressor {
public static void main(String[] args) {
String uri = args[0];
Configuration conf = new Configuration();
FileSystem fs = null;
try {
fs = FileSystem.get(URI.create(uri), conf);
} catch (IOException e) {
e.printStackTrace();
}
Path path = new Path(uri);
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec = factory.getCodec(path);
if (codec == null) {
System.out.println("No suitable codec");
System.exit(1);
}
String outputUri = CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
InputStream in = null;
OutputStream out = null;
try {
in = codec.createInputStream(fs.open(path));
out = fs.create(new Path(outputUri));
IOUtils.copyBytes(in, out, conf);
} catch (IOException e) {
e.printStackTrace();
} finally {
IOUtils.closeStream(in);
IOUtils.closeStream(out);
}
}
}
Then upload a gzip file to the Hadoop cluster and run:
hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.io.FileDecompressor hdfs:/test/test.gz
And then it failed spectacularly. I had used Xshell to copy a gz file from my desktop to my server and then uploaded it to the Hadoop cluster. After hitting this error I tried to decompress the gz file on the server and got "not in gzip format". Some searching revealed that the rz command transfers in ASCII mode by default, which corrupts some compressed files; switching to binary transfer with rz -be fixes it (https://www.jianshu.com/p/489dfea6d652).
I tried that, but the transfer kept failing and the file would not upload at all. After being confused for quite a while I realized the old test.gz was still sitting there. Once I deleted the previous test.gz, re-uploaded it with rz -be, copied it to the Hadoop cluster, and ran the command again, the decompression succeeded.
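For reference, the working sequence was roughly the following (file names are assumed; the HDFS path matches the command above and /test is assumed to already exist):
rz -be
hadoop fs -put test.gz /test/test.gz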
1.3 Using the codec pool
Create a new class PooledStreamCompressor:
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.IOException;
public class PooledStreamCompressor {
public static void main(String[] args) {
String codecClassName = args[0];
Class<?> codecClass = null;
try {
codecClass = Class.forName(codecClassName);
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
Configuration conf = new Configuration();
CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
Compressor compressor = CodecPool.getCompressor(codec, conf);
try {
CompressionOutputStream out = codec.createOutputStream(System.out, compressor);
IOUtils.copyBytes(System.in, out, 4096, false);
out.finish();
} catch (IOException e) {
e.printStackTrace();
} finally {
// return the pooled compressor so it can be reused
CodecPool.returnCompressor(compressor);
}
}
}
The command:
echo hello | hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.io.PooledStreamCompressor org.apache.hadoop.io.compress.GzipCodec > hello.gz
1.4 Using compression in MapReduce
Compress input.txt with the StreamCompressor written above and upload it to the cluster:
type input.txt | hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.io.StreamCompressor org.apache.hadoop.io.compress.GzipCodec > input.gz
Create a new class MaxTemperatureWithCompression:
package com.tuan.hadoopLearn.mapreduce;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class MaxTemperatureWithCompression {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
if (args.length != 2) {
System.err.println("Usage: MaxTemperature <input path> <output path");
System.exit(1);
}
Job job = new Job();
job.setJarByClass(MaxTemperatureWithCompression.class);
job.setJobName("Max Temperature");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
FileOutputFormat.setCompressOutput(job, true);
FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
job.setMapperClass(MaxTemperatureMapper.class);
job.setReducerClass(MaxTemperatureReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
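The driver above references MaxTemperatureMapper and MaxTemperatureReducer from the chapter 2 example, which are not listed in this post. A minimal sketch of what they might look like, assuming NCDC-format weather records as in the book (this is my reconstruction, not the code actually used here):
// MaxTemperatureMapper.java
package com.tuan.hadoopLearn.mapreduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class MaxTemperatureMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final int MISSING = 9999;
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String year = line.substring(15, 19);
        int airTemperature;
        if (line.charAt(87) == '+') { // parseInt does not accept a leading plus sign
            airTemperature = Integer.parseInt(line.substring(88, 92));
        } else {
            airTemperature = Integer.parseInt(line.substring(87, 92));
        }
        String quality = line.substring(92, 93);
        if (airTemperature != MISSING && quality.matches("[01459]")) {
            context.write(new Text(year), new IntWritable(airTemperature));
        }
    }
}
// MaxTemperatureReducer.java
package com.tuan.hadoopLearn.mapreduce;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MaxTemperatureReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int maxValue = Integer.MIN_VALUE;
        for (IntWritable value : values) {
            maxValue = Math.max(maxValue, value.get());
        }
        context.write(key, new IntWritable(maxValue));
    }
}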
The command:
hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.mapreduce.MaxTemperatureWithCompression /mapreduce/input.gz /mapreduce/output.gz
The output directory then contains the gzip-compressed result files.
2 Serialization
2.1 Writable
A quick test class that serializes an IntWritable and deserializes it again:
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.junit.jupiter.api.Test;
import java.io.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class WritableTest{
private static IntWritable writable = new IntWritable(163);
private static byte[] serialize (Writable writable) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
DataOutputStream dataOut = new DataOutputStream(out);
writable.write(dataOut);
dataOut.close();
return out.toByteArray();
}
private static Writable deserialize(byte[] bytes) throws IOException {
ByteArrayInputStream in = new ByteArrayInputStream(bytes);
DataInputStream dataIn = new DataInputStream(in);
writable.readFields(dataIn);
dataIn.close();
return writable;
}
@Test
void serializeTest() throws IOException {
byte[] bytes = serialize(writable);
assertEquals(bytes.length, 4);
assertEquals(StringUtils.byteToHexString(bytes), "000000a3");
IntWritable deserializeWritable = (IntWritable) deserialize(bytes);
assertEquals(deserializeWritable.get(), 163);
}
}
2.2 Iterating over Text
Create a new class TextIterator:
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.io.ByteBufferPool;
import org.apache.hadoop.io.Text;
import java.nio.ByteBuffer;
public class TextIterator {
public static void main(String[] args) {
Text t = new Text("\u0041\u00DF\u6771\uD801\uDC00");
ByteBuffer buf = ByteBuffer.wrap(t.getBytes(), 0, t.getLength());
int cp;
while(buf.hasRemaining() && (cp = Text.bytesToCodePoint(buf)) != -1) {
System.out.println(Integer.toHexString(cp));
}
}
}
Running it prints the code points in hex:
41
df
6771
10400
2.3 Implementing a custom Writable
Not much to say here; I wrote a main method to exercise the custom Comparator.
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class TextPair implements WritableComparable<TextPair> {
private Text first;
private Text second;
public TextPair() {
set(new Text(), new Text());
}
public TextPair(String first, String second) {
set(new Text(first), new Text(second));
}
public TextPair(Text first, Text second) {
set(first, second);
}
public void set(Text first, Text second) {
this.first = first;
this.second = second;
}
public Text getFirst() {
return first;
}
public Text getSecond() {
return second;
}
@Override
public void write(DataOutput out) throws IOException {
first.write(out);
second.write(out);
}
@Override
public void readFields(DataInput in) throws IOException {
first.readFields(in);
second.readFields(in);
}
@Override
public int compareTo(TextPair tp) {
int cmp = first.compareTo(tp.first);
if (cmp != 0) {
return cmp;
}
return second.compareTo(tp.second);
}
@Override
public String toString() {
return first + "\t" + second;
}
@Override
public boolean equals(Object o) {
if (o instanceof TextPair) {
TextPair tp = (TextPair) o;
return first.equals(tp.first) && second.equals(tp.second);
}
return false;
}
@Override
public int hashCode() {
return first.hashCode() * 163 + second.hashCode();
}
public static class Comparator extends WritableComparator {
private static final Text.Comparator TEXT_COMPARATOR = new Text.Comparator();
Comparator() {
super(TextPair.class);
}
@Override
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
try {
// length of the first Text = size of its vint length prefix + the value of that prefix
int firstL1 = WritableUtils.decodeVIntSize(b1[s1]) + readVInt(b1, s1);
int firstL2 = WritableUtils.decodeVIntSize(b2[s2]) + readVInt(b2, s2);
int cmp = TEXT_COMPARATOR.compare(b1, s1, firstL1, b2, s2, firstL2);
if (cmp != 0) {
return cmp;
}
return TEXT_COMPARATOR.compare(b1, s1 + firstL1, l1 - firstL1, b2, s2 + firstL2, l2 - firstL2);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}
static {
// register the raw comparator when TextPair is loaded, so WritableComparator.get(TextPair.class) returns it
WritableComparator.define(TextPair.class, new Comparator());
}
public static void main(String[] args) {
TextPair tp1 = new TextPair("Nothing", "True");
TextPair tp2 = new TextPair("Everything", "Permitted");
WritableComparator comparator = WritableComparator.get(TextPair.class);
System.out.println(comparator.compare(tp1, tp2));
}
}
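The main method above only exercises compareTo. What MapReduce actually uses during the sort is the raw byte comparator, so here is a small extra demo class (my addition; the class name is made up) that serializes two TextPairs and compares the raw bytes directly. Both printed numbers should be positive, since "Nothing" sorts after "Everything":
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.io.Writable;
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class TextPairComparatorDemo {
    // serialize a Writable to a byte array, same idea as WritableTest.serialize above
    private static byte[] serialize(Writable writable) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(out);
        writable.write(dataOut);
        dataOut.close();
        return out.toByteArray();
    }
    public static void main(String[] args) throws IOException {
        TextPair tp1 = new TextPair("Nothing", "True");
        TextPair tp2 = new TextPair("Everything", "Permitted");
        // object-level comparison goes through TextPair.compareTo
        System.out.println(tp1.compareTo(tp2));
        // raw comparison works on the serialized bytes, without deserializing the keys
        byte[] b1 = serialize(tp1);
        byte[] b2 = serialize(tp2);
        TextPair.Comparator rawComparator = new TextPair.Comparator();
        System.out.println(rawComparator.compare(b1, 0, b1.length, b2, 0, b2.length));
    }
}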
3 Avro
3.1 Reading and writing Avro data
Create a file named StringPair.avsc under the Resources directory (I am using IntelliJ IDEA), then build a StringPair record, write it to an in-memory buffer, read it back, and print it to the console.
{
"type": "record",
"name": "StringPair",
"doc": "A pair of strings.",
"fields": [
{"name": "left", "type": "string"},
{"name": "right", "type": "string"}
]
}
package com.tuan.hadoopLearn.io;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.*;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.io.IOUtils;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
public class AvroString {
public void stringProcess() throws IOException {
Schema.Parser parser = new Schema.Parser();
Schema schema = parser.parse(getClass().getResourceAsStream("/StringPair.avsc"));
GenericRecord datum = new GenericData.Record(schema);
datum.put("left", new Utf8("Work in the dark"));
datum.put("right", new Utf8("To serve the light"));
ByteArrayOutputStream out = new ByteArrayOutputStream();
DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
writer.write(datum, encoder);
encoder.flush();
DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
Decoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
GenericRecord newDatum = reader.read(null, decoder);
System.out.println(newDatum);
}
public static void main(String[] args) {
try {
new AvroString().stringProcess();
} catch (IOException e) {
e.printStackTrace();
}
}
}
Running it prints the record as JSON, roughly:
{"left": "Work in the dark", "right": "To serve the light"}
3.2 Reading and writing Avro data files
Similarly, create a datum, write it to a file, then read it back and print it to the console.
package com.tuan.hadoopLearn.io;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.util.Utf8;
import java.io.File;
import java.io.IOException;
public class AvroFile {
public void fileProcess() throws IOException {
Schema.Parser parser = new Schema.Parser();
Schema schema = parser.parse(getClass().getResourceAsStream("/StringPair.avsc"));
GenericRecord datum = new GenericData.Record(schema);
datum.put("left", new Utf8("Work in the dark"));
datum.put("right", new Utf8("To serve the light"));
DatumWriter<GenericRecord> writer = new GenericDatumWriter<>();
DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<GenericRecord>(writer);
File file = new File("data.avro");
fileWriter.create(schema, file);
fileWriter.append(datum);
fileWriter.close();
DatumReader<GenericRecord> reader = new GenericDatumReader<>();
DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(file, reader);
GenericRecord newDatum = fileReader.next();
System.out.println(newDatum);
}
public static void main(String[] args) {
try {
new AvroFile().fileProcess();
} catch (IOException e) {
e.printStackTrace();
}
}
}
Running it prints the same record as in 3.1.
The generated Avro container file is data.avro in the working directory.
3.3 Max temperature in Avro format
This section uses Avro as the input and output format of a MapReduce job. I hit a lot of problems here and it took two days to get it running; the details are in another post of mine: https://mp.csdn.net/postedit/81184615
First create a TemperaturePair.avsc file. The file extension must be correct; my first build kept failing because the extension was wrong.
{
"type": "record",
"name": "TemperaturePair",
"doc": "A weather reading.",
"fields": [
{"name": "year", "type": "int"},
{"name": "temperature", "type": "int"}
]
}
Add the Avro plugin to pom.xml, with sourceDirectory pointing at the directory containing the .avsc files and outputDirectory pointing at the directory for the generated classes:
<plugin>
<groupId>org.apache.avro</groupId>
<artifactId>avro-maven-plugin</artifactId>
<version>1.7.7</version>
<executions>
<execution>
<phase>generate-sources</phase>
<goals>
<goal>schema</goal>
</goals>
<configuration>
<sourceDirectory>${project.basedir}/src/main/Resources/</sourceDirectory>
<outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
</configuration>
</execution>
</executions>
</plugin>
The final TemperaturePair.avsc adds a namespace so that the generated class lands in the com.tuan.hadoopLearn.avro package:
{
"namespace": "com.tuan.hadoopLearn.avro",
"type": "record",
"name": "TemperaturePair",
"doc": "A weather reading.",
"fields": [
{"name": "year", "type": "int"},
{"name": "temperature", "type": "int"}
]
}
After this is configured, run mvn compile and the TemperaturePair class is generated under outputDirectory.
Create a class CreateAvroInput to generate input.avro, the input file for the MapReduce job. You could also write a JSON file and convert it with avro-tools, but that felt like extra work; this is just for learning, so keep it simple. Once it is generated, upload input.avro to the cluster (see the command after the listing).
package com.tuan.hadoopLearn.io;
import com.tuan.hadoopLearn.avro.TemperaturePair;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.specific.SpecificDatumWriter;
import java.io.File;
import java.io.IOException;
public class CreateAvroInput {
public void create() throws IOException {
DatumWriter<TemperaturePair> TemperaturePairDatumWriter = new SpecificDatumWriter<TemperaturePair>(TemperaturePair.class);
DataFileWriter<TemperaturePair> dataFileWriter = new DataFileWriter<TemperaturePair>(TemperaturePairDatumWriter);
Schema.Parser parser = new Schema.Parser();
Schema schema = parser.parse(getClass().getResourceAsStream("/TemperaturePair.avsc"));
dataFileWriter.create(schema, new File("input.avro"));
dataFileWriter.append(new TemperaturePair(1993, 87));
dataFileWriter.append(new TemperaturePair(1993, 25));
dataFileWriter.append(new TemperaturePair(1992, 37));
dataFileWriter.append(new TemperaturePair(1995, 74));
dataFileWriter.append(new TemperaturePair(1992, 38));
dataFileWriter.append(new TemperaturePair(1993, 103));
dataFileWriter.close();
}
public static void main(String[] args) throws IOException {
new CreateAvroInput().create();
}
}
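With input.avro generated in the working directory, copy it into HDFS; the target path here is assumed to match the job command further below:
hadoop fs -put input.avro /mapreduce/input.avro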
Then the MapReduce driver program:
package com.tuan.hadoopLearn.mapreduce;
import java.io.IOException;
import com.tuan.hadoopLearn.avro.TemperaturePair;
import com.tuan.hadoopLearn.utils.JarUtils;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class AvroMaxTemperature extends Configured implements Tool {
public static class AvroMaxTemperatureMapper extends
Mapper<AvroKey<TemperaturePair>, NullWritable, IntWritable, IntWritable> {
@Override
public void map(AvroKey<TemperaturePair> key, NullWritable value, Context context)
throws IOException, InterruptedException {
Integer year = key.datum().getYear();
Integer temperature = key.datum().getTemperature();
context.write(new IntWritable(year), new IntWritable(temperature));
}
}
public static class AvroMaxTemperatureReducer extends
Reducer<IntWritable, IntWritable, AvroKey<Integer>, AvroValue<Integer>> {
@Override
public void reduce(IntWritable key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
Integer max = 0;
for (IntWritable value : values) {
max = Math.max(max, value.get());
}
context.write(new AvroKey<Integer>(key.get()), new AvroValue<Integer>(max));
}
}
public int run(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: MapReduceMaxTemperature <input path> <output path>");
return -1;
}
Configuration conf = getConf();
JarUtils.addTmpJar( "C:/Software/Hadoop-3.0.3/lib/avro/avro-mapred-1.7.7-hadoop2.jar", conf);
Job job = new Job(conf);
job.setJarByClass(AvroMaxTemperature.class);
job.setJobName("Avro Max Temperature");
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setInputFormatClass(AvroKeyInputFormat.class);
job.setMapperClass(AvroMaxTemperatureMapper.class);
AvroJob.setInputKeySchema(job, TemperaturePair.getClassSchema());
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
job.setReducerClass(AvroMaxTemperatureReducer.class);
AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.INT));
AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));
return (job.waitForCompletion(true) ? 0 : 1);
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new AvroMaxTemperature(), args);
System.exit(res);
}
}
The command:
hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.mapreduce.AvroMaxTemperature /mapreduce/input.avro /mapreduce/avroOutput
Pull the output down from HDFS and open it with avro-tools:
java -jar avro-tools-1.7.7.jar tojson avroOutput/part-r-00000.avro
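With the input generated above, the result should look roughly like this (the key/value field names come from AvroKeyValueOutputFormat's generic record schema; exact formatting may differ):
{"key": 1992, "value": 38}
{"key": 1993, "value": 103}
{"key": 1995, "value": 74}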
4 Reading and writing SequenceFiles
Create a class SequenceFileDemo that writes a SequenceFile and then reads it back, printing each record to the console:
package com.tuan.hadoopLearn.io;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import java.io.IOException;
import java.net.URI;
public class SequenceFileDemo {
private static final String[] DATA = {
"Jingle bells, jingle bells",
"Jingle all the way",
"Oh! what fun it is to ride",
"In a one-horse open sleigh"
};
public void write(String uri) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path path = new Path(uri);
IntWritable key = new IntWritable();
Text value = new Text();
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
for (int i = 0; i < 10; i ++) {
key.set(10 - i);
value.set(DATA[i % 4]);
writer.append(key, value);
}
IOUtils.closeStream(writer);
}
public void read(String uri) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(URI.create(uri), conf);
Path path = new Path(uri);
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
IntWritable key = new IntWritable();
Text value = new Text();
while (reader.next(key, value)) {
System.out.println(key + " : " + value);
}
}
public static void main(String[] args) throws IOException {
String uri = args[0];
SequenceFileDemo demo = new SequenceFileDemo();
demo.write(uri);
demo.read(uri);
}
}
The command:
hadoop jar hadoopLearn-0.0.1-SNAPSHOT.jar com.tuan.hadoopLearn.io.SequenceFileDemo hdfs:/SequenceFile/sequence
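The program writes ten records and then reads them back, so the console output should be the pairs in write order, roughly:
10 : Jingle bells, jingle bells
9 : Jingle all the way
8 : Oh! what fun it is to ride
7 : In a one-horse open sleigh
6 : Jingle bells, jingle bells
5 : Jingle all the way
4 : Oh! what fun it is to ride
3 : In a one-horse open sleigh
2 : Jingle bells, jingle bells
1 : Jingle all the way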