Purpose
- Custom file reading
When reading input, Hadoop uses the record reader LineRecordReader by default, which produces <byte offset of the line start, line content> pairs; each read passes one key and value to the developer's Mapper component. A custom record reader lets you define how the file is read, and therefore adjust the key and value handed to the Mapper. (For the sample data below, with single-byte newlines, the default keys would be the byte offsets 0, 12, 23, 36; the custom reader in this article produces line numbers 1-4 instead.)
- Custom file output
When results are written to a file, the FileOutputFormat subclass TextOutputFormat is used by default: it separates key and value with a Tab character and separates records with a newline. A custom output format lets you adjust how the results are written; a config-only alternative for just the separator is sketched right after this list.
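As an aside: if the only change needed were the key/value separator, TextOutputFormat itself can be reconfigured and no custom OutputFormat is required. A minimal sketch, assuming Hadoop 2.x or later (the property name differs in older releases); this would go in the driver before the Job is created. Note the record separator stays "\n" either way, so the full customization in this article still needs a custom OutputFormat.

Configuration conf = new Configuration();
// "$$" replaces the default Tab separator used by TextOutputFormat
conf.set("mapreduce.output.textoutputformat.separator", "$$");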
Requirement
Given the following data, the key and value passed to the Mapper must satisfy:
- the key is the line number of each line
- the value is the content of that line
When the result is written to a file, a custom format is used for the output.
hello world
hello dong
hello hadoop
hello world
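With the custom reader and writer implemented below, the result file is expected to look like this (line number, the $$ separator, then the line content):

1$$hello world
2$$hello dong
3$$hello hadoop
4$$hello world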
Code Implementation
package hadoop02;
import java.io.IOException;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
/**
* This component is the record reader; it decides how the file data is read.
* Hadoop's default record reader is LineRecordReader, which produces <byte offset of the line start, line content>.
* initialize is the component's setup method and is called only once; it initializes the file split, the file system object, and the line reader.
* nextKeyValue is called repeatedly until it returns false; it reads the file content through LineReader and populates the input key and value.
* getCurrentKey and getCurrentValue pass the key and value to the Mapper component; they are called once for each nextKeyValue call.
* close does the final resource cleanup.
* @author Administrator
*
*/
public class AuthRecordReader extends RecordReader<IntWritable, Text>{
private FileSplit fs;
private LineReader reader;
private IntWritable key;
private Text value;
// current line number
private int count;
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
// cast to the file split
fs = (FileSplit)split;
Path path = fs.getPath();
// get the configuration
Configuration conf = context.getConfiguration();
// get the file system object
FileSystem system = path.getFileSystem(conf);
// open an input stream for the file being processed
InputStream in = system.open(path);
// initialize the line reader, which processes the data one line at a time
reader = new LineReader(in);
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
key = new IntWritable();
value = new Text();
Text tmp = new Text();
// each call reads one line into tmp
// the return value is the number of bytes read for that line
int length = reader.readLine(tmp);
if(length == 0) {
// no more data to read; returning false stops further nextKeyValue calls
return false;
}else {
count++;
key.set(count); // the line number
value.set(tmp); // the line content
return true;
}
}
@Override
public IntWritable getCurrentKey() throws IOException, InterruptedException {
return key;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
@Override
public float getProgress() throws IOException, InterruptedException {
// progress reporting is omitted in this demo (see the sketch after this class)
return 0;
}
@Override
public void close() throws IOException {
if(reader != null) {
// close the underlying stream instead of just dropping the reference
reader.close();
reader = null;
}
}
}
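The stubbed getProgress above only affects progress display. A rough estimate is possible; the following is a minimal sketch of my own (not in the original), assuming a hypothetical long pos field that nextKeyValue increments by the bytes each readLine call returns (pos += length):

@Override
public float getProgress() throws IOException, InterruptedException {
// pos is a hypothetical field tracking bytes consumed so far;
// fs is the FileSplit captured in initialize()
return fs.getLength() == 0 ? 1.0f : Math.min(1.0f, pos / (float) fs.getLength());
}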
package hadoop02;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
/**
* Custom input format component; it defines the key and value types passed to the Mapper.
* It must extend the FileInputFormat abstract class.
* @author Administrator
*
*/
public class AuthInputFormat extends FileInputFormat<IntWritable, Text>{
/**
* The RecordReader decides how the file is processed and hands the input key and value to the Mapper component.
*/
@Override
public RecordReader<IntWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// return the custom RecordReader
return new AuthRecordReader();
}
}
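One caveat worth flagging here (my note, not in the original): AuthRecordReader always opens the file at the beginning and numbers lines from 1, so if a large file were split into multiple input splits, each reader would re-read from the start and the numbering would be wrong. A common guard is to make the file unsplittable so a single reader sees the whole file; a minimal sketch for AuthInputFormat, assuming org.apache.hadoop.mapreduce.JobContext and org.apache.hadoop.fs.Path are imported:

@Override
protected boolean isSplitable(JobContext context, Path filename) {
// one split per file: the custom reader neither seeks to the split
// start nor stops at the split end, so it must see the entire file
return false;
}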
package hadoop02;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
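/**
* Custom RecordWriter: writes each key/value pair as key$$value, one record per line.
*/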
public class AuthRecordWriter<K,V> extends RecordWriter<K, V>{
FSDataOutputStream out;
public AuthRecordWriter(FSDataOutputStream out) {
// keep a reference to the output stream
this.out = out;
}
@Override
public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
if(out != null) {
out.close();
}
}
@Override
public void write(K key, V value) throws IOException, InterruptedException {
out.write(key.toString().getBytes(StandardCharsets.UTF_8));
// separator between key and value
out.write("$$".getBytes(StandardCharsets.UTF_8));
out.write(value.toString().getBytes(StandardCharsets.UTF_8));
// record (line) separator
out.write("\r\n".getBytes(StandardCharsets.UTF_8));
}
}
package hadoop02;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* Custom output format component, used to change the format of the result file.
* Use generics here; do not hard-code the key and value types.
*/
public class AuthOutputFormat<K,V> extends FileOutputFormat<K, V>{
@Override
public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
// get the output file path via the helper provided by the parent class (the second argument is the file extension, empty here)
Path path = super.getDefaultWorkFile(job, "");
Configuration conf = job.getConfiguration();
FileSystem system = path.getFileSystem(conf);
// create the output stream for the result file; it must be an FSDataOutputStream, not a plain OutputStream
// the RecordWriter writes the results through this stream
FSDataOutputStream out = system.create(path);
return new AuthRecordWriter<K,V>(out);
}
}
package hadoop02;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
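/**
* Pass-through Mapper: the custom record reader already produces the desired
* <line number, line content> pairs, so map simply forwards them.
*/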
public class MapperDemo extends Mapper<IntWritable, Text, IntWritable, Text>{
@Override
protected void map(IntWritable key, Text value, Mapper<IntWritable, Text, IntWritable, Text>.Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
package hadoop02;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class DriverDemo {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf); // pass conf so the configuration is actually used
job.setJarByClass(DriverDemo.class);
job.setMapperClass(MapperDemo.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
// the default input format is TextInputFormat (a subclass of FileInputFormat), whose LineRecordReader produces (key = byte offset of the line start, value = line content)
job.setInputFormatClass(AuthInputFormat.class);
// set the custom output format component; the default is TextOutputFormat, a subclass of FileOutputFormat
// TextOutputFormat separates key and value with a Tab character and records with a newline
job.setOutputFormatClass(AuthOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.101.100:9000/input"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.101.100:9000/result"));
job.waitForCompletion(true);
}
}
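One possible refinement, my suggestion rather than part of the original code: since MapperDemo only forwards records and no Reducer is set, the job runs Hadoop's default identity reduce phase. Making the job explicitly map-only skips the shuffle entirely:

// add in main() before job.waitForCompletion(true)
job.setNumReduceTasks(0);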