Custom input format requirement
Read the contents of the three input files into a single SequenceFile. SequenceFile is a Hadoop-specific file format suited to key-value storage, and it stores data more compactly than plain text files. The default output format is TextOutputFormat (plain-text output); we switch it to SequenceFileOutputFormat. Each file in the input directory is read as one key-value pair: the file's entire content is wrapped as bytes for the value, and the file name serves as the key.
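To make the target format concrete, here is a minimal standalone sketch that writes one (Text, BytesWritable) record with Hadoop's plain SequenceFile API. The output path and the sample record are placeholder assumptions; the MapReduce job below produces this same kind of file automatically.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqFileWriteDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("demo.seq"); // placeholder output path
        SequenceFile.Writer writer = null;
        try {
            writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(path),
                    SequenceFile.Writer.keyClass(Text.class),
                    SequenceFile.Writer.valueClass(BytesWritable.class));
            // one record per input file: file name as key, raw content bytes as value
            byte[] content = "hello hadoop".getBytes();
            writer.append(new Text("file1.txt"), new BytesWritable(content));
        } finally {
            IOUtils.closeStream(writer);
        }
    }
}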
First, let's look at the underlying map flow.
The crucial call in the method below is nextKeyValue(): its implementation determines what is exposed as the current key and value, so that is the method we override.
MapTask's runNewMapper() method starts the actual map phase:
1. It instantiates the user-defined Mapper from the configured class name.
2. It calls that Mapper's run() method, which drives the user-defined map():
public void run(Context context) throws IOException, InterruptedException {
    // set up variables or parameters; called exactly once per map task
    setup(context);
    try {
        // loop over the input, calling the user-defined map() for each key-value pair
        while (context.nextKeyValue()) {
            map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        // cleanup, including discarding any leftover key-value state; also called once
        cleanup(context);
    }
}
Writing the custom InputFormat
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/*
 * 1. Change the split strategy: each file becomes exactly one split, by marking files as non-splittable.
 *
 * 2. Provide a RecordReader that uses the split's file name as the key and wraps the
 *    split's entire content in bytes as the value.
 */
public class MyInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new MyRecordReader();
    }

    // override isSplitable so that every file is read as a single split
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
Writing the custom RecordReader
This class overrides nextKeyValue() and initializes the values it needs in initialize().
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/*
 * A RecordReader reads data from the split currently being processed by the MapTask.
 *
 * Each XXXContext is a Job context; from it you can obtain the Job's Configuration object.
 */
public class MyRecordReader extends RecordReader<Text, BytesWritable> {

    private Text key;
    private BytesWritable value;
    private String filename;
    private int length;
    private FileSystem fs;
    private Path path;
    private FSDataInputStream is;
    private boolean flag = true;

    // Called automatically after MyRecordReader is created, before Mapper's run().
    // The whole file is one split, so the split length equals the file length.
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        FileSplit fileSplit = (FileSplit) split;
        filename = fileSplit.getPath().getName();
        length = (int) fileSplit.getLength();
        path = fileSplit.getPath();
        // get the current Job's Configuration object
        Configuration conf = context.getConfiguration();
        // get the file system the current Job uses
        fs = FileSystem.get(conf);
        is = fs.open(path);
    }

    // Reads one input key-value pair; returns true if one was read, false otherwise.
    // The file name is wrapped as the key, the file content as a BytesWritable value.
    // The second call to nextKeyValue() returns false.
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (flag) {
            // instantiate the objects lazily
            if (key == null) {
                key = new Text();
            }
            if (value == null) {
                value = new BytesWritable();
            }
            // wrap the file name in the key
            key.set(filename);
            // read the whole file content into the BytesWritable
            byte[] content = new byte[length];
            IOUtils.readFully(is, content, 0, length);
            value.set(content, 0, length);
            flag = false;
            return true;
        }
        return false;
    }

    // returns the key of the current key-value pair
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    // returns the value of the current key-value pair
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    // returns the progress through the split
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return flag ? 0f : 1f;
    }

    // called when the Mapper's input is closed; releases resources
    @Override
    public void close() throws IOException {
        if (is != null) {
            IOUtils.closeStream(is);
        }
        if (fs != null) {
            fs.close();
        }
    }
}
Mapper class
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// no map() override needed: the inherited map() forwards each (fileName, bytes) pair unchanged
public class CustomIFMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
}
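Leaving the class body empty relies on the identity map() inherited from Mapper, which forwards every pair unchanged; written out explicitly, it is equivalent to this sketch:

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// equivalent explicit form of the inherited identity map() (for illustration only)
public class CustomIFMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {

    @Override
    protected void map(Text key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // forward the (fileName, bytes) pair unchanged
        context.write(key, value);
    }
}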
Reducer class
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// no reduce() override needed: the inherited reduce() writes every value out with its key
public class CustomIFReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
}
Setting up the Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class CustomIFDriver {

    public static void main(String[] args) throws Exception {
        Path inputPath = new Path("C:\\Users\\Lenovo\\Desktop\\PPT\\input");
        Path outputPath = new Path("C:\\Users\\Lenovo\\Desktop\\PPT\\output");
        // configuration for the whole Job
        Configuration conf = new Configuration();
        // make sure the output directory does not exist
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // ① create the Job
        Job job = Job.getInstance(conf);
        job.setJarByClass(CustomIFDriver.class);
        // give the Job a name
        job.setJobName("custom-inputformat");

        // ② configure the Job
        // set the Mapper and Reducer classes the Job runs
        job.setMapperClass(CustomIFMapper.class);
        job.setReducerClass(CustomIFReducer.class);
        // The Job prepares serializers from the declared output key-value types.
        // When Mapper and Reducer emit the same types, setting the final output types is enough.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // set the input and output directories
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // set the input and output formats
        job.setInputFormatClass(MyInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // ③ run the Job
        job.waitForCompletion(true);
    }
}
Execution result
The output directory now contains a single SequenceFile (part-r-00000) with one record per input file, keyed by file name.
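To inspect the SequenceFile, a small reader like the sketch below can be used (a hedged sketch: the part-r-00000 file name and output path match the driver above, but adjust as needed):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SeqFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path seqPath = new Path("C:\\Users\\Lenovo\\Desktop\\PPT\\output\\part-r-00000");
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(seqPath));
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // each record: (file name, file content as bytes)
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}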
Custom output format
Requirement
Filter an input log file: lines containing "baidu" are written to e:/baidu.log, and all other lines to e:/other.log.
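For example, given a hypothetical input file with the two lines below, the job would route them as follows:

http://www.baidu.com → e:/baidu.log
http://www.google.com → e:/other.log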
Implementation
Mapper
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/*
 * 1. When is a Reduce phase needed?
 *    ① to merge/aggregate data
 *    ② to sort data
 *
 * 2. Without a Reduce phase, map output goes straight to the OutputFormat,
 *    so the key-value types do not need to be serializable (hence String works as a key).
 */
public class CustomOFMapper extends Mapper<LongWritable, Text, String, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String content = value.toString();
        // append a line break, since the custom RecordWriter writes raw bytes
        context.write(content + "\r\n", NullWritable.get());
    }
}
The custom OutputFormat
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// extending FileOutputFormat keeps its inherited checkOutputSpecs() and output committer,
// so only getRecordWriter() needs to be provided
public class MyOutPutFormat extends FileOutputFormat<String, NullWritable> {

    @Override
    public RecordWriter<String, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        return new MyRecordWriter(job);
    }
}
The custom RecordWriter
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class MyRecordWriter extends RecordWriter<String, NullWritable> {

    private Path baiduPath = new Path("e:/baidu.log");
    private Path otherPath = new Path("e:/other.log");
    private FSDataOutputStream baiduOS;
    private FSDataOutputStream otherOS;
    private FileSystem fs;

    public MyRecordWriter(TaskAttemptContext job) throws IOException {
        Configuration conf = job.getConfiguration();
        fs = FileSystem.get(conf);
        baiduOS = fs.create(baiduPath);
        otherOS = fs.create(otherPath);
    }

    // writes each key-value pair to the matching output file
    @Override
    public void write(String key, NullWritable value) throws IOException, InterruptedException {
        if (key.contains("baidu")) {
            baiduOS.write(key.getBytes());
        } else {
            otherOS.write(key.getBytes());
        }
    }

    // closes the streams
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        if (baiduOS != null) {
            IOUtils.closeStream(baiduOS);
        }
        if (otherOS != null) {
            IOUtils.closeStream(otherOS);
        }
        if (fs != null) {
            fs.close();
        }
    }
}
Driver
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CustomOFDriver {

    public static void main(String[] args) throws Exception {
        Path inputPath = new Path("e:/mrinput/outputformat");
        Path outputPath = new Path("e:/mroutput/outputformat");
        // configuration for the whole Job
        Configuration conf = new Configuration();
        // make sure the output directory does not exist
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // ① create the Job
        Job job = Job.getInstance(conf);
        job.setJarByClass(CustomOFDriver.class);
        // give the Job a name
        job.setJobName("custom-outputformat");

        // ② configure the Job
        // set the Mapper class (this job has no Reducer)
        job.setMapperClass(CustomOFMapper.class);
        // set the input and output directories; even though MyRecordWriter writes to fixed
        // paths, FileOutputFormat still requires an output directory (for the _SUCCESS marker)
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        // set the output format
        job.setOutputFormatClass(MyOutPutFormat.class);
        // disable the reduce phase
        job.setNumReduceTasks(0);

        // ③ run the Job
        job.waitForCompletion(true);
    }
}