1.背景
为了自定义输出文件的路径和格式,可以自定义OutputFormat。
2.需求
过滤输入的log日志,包含jinghang的网站输出到e:/jinghang.log,不包含jinghang的网站输出到e:/other.log。
3.代码实现
1.mapper端
package com.zj.practice.mapreduce05.user_defined_OutputFormat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class filterMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    /**
     * Forwards each input line unchanged as the output key with an empty value.
     *
     * @param key     byte offset of the line within the input split
     * @param value   one line of the log, e.g. "http://www.baidu.com"
     * @param context Hadoop context used to emit the &lt;line, null&gt; pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The site URL itself becomes the key; downstream needs no value,
        // so the NullWritable singleton is used as a placeholder.
        Text site = value;
        context.write(site, NullWritable.get());
    }
}
2.reduce端
package com.zj.practice.mapreduce05.user_defined_OutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class filterReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    /**
     * Emits each distinct site key once; no aggregation of values is needed.
     *
     * @param key     a site line, e.g. "http://www.baidu.com"
     * @param values  unused group of NullWritable placeholders
     * @param context Hadoop context used to emit the key
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Pass the key straight through to the OutputFormat.
        context.write(key, NullWritable.get());
    }
}
3.自定义OutputFormat类
package com.zj.practice.mapreduce05.user_defined_OutputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* 重写FileOutputFormat
*/
/**
 * Custom FileOutputFormat that delegates all writing to filterRecordWriter.
 */
public class filterOutputFormat extends FileOutputFormat<Text, NullWritable> {
    /**
     * Builds and initializes the RecordWriter that splits records across two files.
     *
     * @param job task context carrying the job configuration
     * @return an initialized filterRecordWriter
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        filterRecordWriter writer = new filterRecordWriter();
        // Open the two destination streams before handing the writer back.
        writer.init(job);
        return writer;
    }
}
4.重写RecordWriter方法核心类
package com.zj.practice.mapreduce05.user_defined_OutputFormat;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
/**
* 重写RecordWriter方法,实现自定义的需求:
* 过滤输入的log日志,包含jinghang的网站输出到e:/jinghang.log,
* 不包含jinghang的网站输出到e:/other.log。
*/
/**
 * RecordWriter implementing the custom routing requirement:
 * lines containing "jinghang" go to jinghang.log, all other lines
 * to other.log, both created under the job's configured output directory.
 */
public class filterRecordWriter extends RecordWriter<Text, NullWritable> {
    // Stream for lines containing "jinghang".
    private FSDataOutputStream jinghang;
    // Stream for all remaining lines.
    private FSDataOutputStream other;

    /**
     * Opens the two destination files under the job's output directory.
     *
     * @param job task context carrying the job configuration
     * @throws IOException if the filesystem or either output file cannot be opened
     */
    public void init(TaskAttemptContext job) throws IOException {
        // Resolve the configured output directory, stripping the local-FS scheme.
        // NOTE(review): assumes a local "file:/" URI — verify before running on HDFS.
        String outPath = job.getConfiguration().get(FileOutputFormat.OUTDIR).replace("file:/", "");
        FileSystem fileSystem = FileSystem.get(job.getConfiguration());
        // Bug fix: use Path(parent, child) so a separator is inserted between the
        // directory and the file name. Plain string concatenation produced a
        // sibling file like ".../outputjinghang.log" instead of a file inside the dir.
        jinghang = fileSystem.create(new Path(outPath, "jinghang.log"), true);
        other = fileSystem.create(new Path(outPath, "other.log"), true);
    }

    /**
     * Writes one record, routed by whether the line contains "jinghang".
     *
     * @param key   the line of text to write
     * @param value unused (always the NullWritable singleton)
     * @throws IOException if the underlying stream write fails
     */
    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        String line = key.toString() + "\n";
        // Bug fix: encode explicitly as UTF-8 instead of relying on the
        // platform default charset, which varies between machines.
        byte[] bytes = line.getBytes(StandardCharsets.UTF_8);
        if (line.contains("jinghang")) {
            jinghang.write(bytes);
        } else {
            other.write(bytes);
        }
    }

    /**
     * Releases both output streams; IOUtils.closeStream tolerates nulls
     * and swallows close-time errors.
     */
    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        IOUtils.closeStream(jinghang);
        IOUtils.closeStream(other);
    }
}
5.driver
package com.zj.practice.mapreduce05.user_defined_OutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class filterDriver {
    /**
     * Configures and submits the filtering job: reads log lines from args[0]
     * and writes them through filterOutputFormat under args[1].
     *
     * @param args [0] input path, [1] output directory
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Robustness fix: fail fast with a usage message instead of an
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: filterDriver <input path> <output path>");
            System.exit(2);
        }
        // Job setup from the default configuration.
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(filterDriver.class);
        // Use the custom OutputFormat that splits records into two files.
        job.setOutputFormatClass(filterOutputFormat.class);
        // Wire up the mapper/reducer and their key/value types.
        job.setMapperClass(filterMapper.class);
        job.setReducerClass(filterReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input and output locations come from the command line.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit and propagate success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}