输入数据
http://www.baidu.com
http://www.google.com
http://cn.bing.com
https://www.baidu.com
http://www.sohu.com
http://www.sina.com
http://www.baidu.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sindsafa.com
Maven和log4j.properties配置
参考 MapReduce统计流量案例 中的配置
自定义Mapper类实现(LogMapper)
package com.test.mapreduce.outputformat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper that forwards each raw log line unchanged.
 * The whole line becomes the output key; the value is NullWritable
 * because no additional payload is needed downstream.
 */
public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit the line itself as the key with an empty value.
        NullWritable nothing = NullWritable.get();
        context.write(value, nothing);
    }
}
自定义Reducer类实现(LogReducer)
package com.test.mapreduce.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that re-emits the key once per grouped value, so duplicate
 * input lines survive the shuffle instead of being collapsed to one.
 */
public class LogReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // One write per value: N identical input lines produce N output lines.
        for (NullWritable ignored : values) {
            context.write(key, ignored);
        }
    }
}
自定义FileOutputFormat类实现(LogOutputFormat)
package com.test.mapreduce.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Custom OutputFormat that delegates all record writing to
 * {@link LogRecordWriter}, which splits output across two files.
 */
public class LogOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        // The writer encapsulates where and how each record is written.
        return new LogRecordWriter(job);
    }
}
自定义RecordWriter类实现(LogRecordWriter)
package com.test.mapreduce.outputformat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
/**
 * RecordWriter that routes each log line to one of two files:
 * lines containing "baidu" go to baidu.log, all others to other.log.
 */
public class LogRecordWriter extends RecordWriter<Text, NullWritable> {

    // Output streams for the two destination files.
    private FSDataOutputStream baiduOut;
    private FSDataOutputStream otherOut;

    /**
     * Opens the two destination files.
     *
     * @param job task context supplying the Hadoop configuration
     * @throws IOException if either output file cannot be created
     */
    public LogRecordWriter(TaskAttemptContext job) throws IOException {
        // Propagate failures instead of swallowing them: the previous version
        // caught IOException and only printed it, which left both streams null
        // and caused a NullPointerException on the first write() call.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        try {
            baiduOut = fs.create(new Path("D:\\output\\baidu.log"));
            otherOut = fs.create(new Path("D:\\output\\other.log"));
        } catch (IOException e) {
            // Don't leak a half-open stream if the second create() fails.
            IOUtils.closeStream(baiduOut);
            IOUtils.closeStream(otherOut);
            throw e;
        }
    }

    /**
     * Writes one record to the file selected by its content.
     *
     * @param text         the log line to route
     * @param nullWritable unused placeholder value
     */
    @Override
    public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
        String log = text.toString();
        // Encode as UTF-8 explicitly: DataOutputStream.writeBytes() silently
        // drops the high byte of every char, corrupting non-ASCII lines.
        byte[] line = (log + "\n").getBytes(java.nio.charset.StandardCharsets.UTF_8);
        if (log.contains("baidu")) {
            baiduOut.write(line);
        } else {
            otherOut.write(line);
        }
    }

    /**
     * Closes both output streams; closeStream is null-safe and quiet.
     */
    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        IOUtils.closeStream(baiduOut);
        IOUtils.closeStream(otherOut);
    }
}
自定义Driver类实现(LogDriver)
package com.test.mapreduce.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Job driver wiring the custom OutputFormat pipeline together.
 * Input and output directories default to the tutorial paths
 * (D:\input, D:\output) but may be overridden by passing them as
 * the first and second command-line arguments.
 */
public class LogDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Generalized: accept paths from the command line, falling back to
        // the original hard-coded defaults for backward compatibility.
        String inputDir = args.length > 0 ? args[0] : "D:\\input";
        String outputDir = args.length > 1 ? args[1] : "D:\\output";
        // 1. Build the configuration and obtain the Job instance.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Associate this driver's jar for cluster submission.
        job.setJarByClass(LogDriver.class);
        // 3. Wire up the Mapper and Reducer.
        job.setMapperClass(LogMapper.class);
        job.setReducerClass(LogReducer.class);
        // 4. Mapper output kv types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // 5. Final output kv types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Use the custom OutputFormat.
        job.setOutputFormatClass(LogOutputFormat.class);
        // 7. Input path.
        FileInputFormat.setInputPaths(job, new Path(inputDir));
        // 8. Output path for the _SUCCESS marker: even with a custom
        // OutputFormat, the FileOutputFormat superclass still writes a
        // _SUCCESS file, so an output directory must be set.
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        // 9. Submit the job and exit with its status.
        boolean completed = job.waitForCompletion(true);
        System.exit(completed ? 0 : 1);
    }
}
输出数据
baidu.log
http://www.baidu.com
http://www.baidu.com
https://www.baidu.com
other.log
http://cn.bing.com
http://www.google.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sina.com
http://www.sindsafa.com
http://www.sohu.com