根据内容的不同,数据写到不同的文件中
代码
package cn.feizhou.logenhance;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class LogEnhance {

	static class LogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
		Text k = new Text();
		NullWritable v = NullWritable.get();

		/**
		 * Emits each valid log line (suffixed with the "--" record separator) as the key;
		 * lines that are empty or not order/pid records are counted as malformed.
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Counter used to track malformed log lines: (group name, counter name).
			Counter counter = context.getCounter("malformed", "malformedline");
			String line = value.toString();
			String[] fields = StringUtils.split(line, ",");
			// Guard: StringUtils.split returns an empty array for blank lines,
			// so fields[0] would throw ArrayIndexOutOfBoundsException without this check.
			if (fields == null || fields.length == 0) {
				counter.increment(1);
				return;
			}
			String name = fields[0];
			if (name.contains("order") || name.contains("pid")) {
				// "--" acts as the record separator, because the custom
				// RecordWriter writes records without a trailing newline.
				k.set(line + "--");
				context.write(k, v);
			} else {
				// Unrecognized record type: count it as malformed.
				counter.increment(1);
			}
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(LogEnhance.class);
		job.setMapperClass(LogEnhanceMapper.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		// To route different content to different target paths, use a custom OutputFormat.
		job.setOutputFormatClass(LogEnhanceOutputFormat.class);
		FileInputFormat.setInputPaths(job, new Path("H:/test/"));
		// Although we use a custom OutputFormat, it extends FileOutputFormat,
		// which must write a _SUCCESS marker file, so an output path is still required.
		FileOutputFormat.setOutputPath(job, new Path("H:/out/"));
		// Map-only job: no reducer needed.
		job.setNumReduceTasks(0);
		// Propagate the job status through the process exit code (0 = success, 1 = failure),
		// instead of unconditionally exiting with 0 even when the job fails.
		boolean success = job.waitForCompletion(true);
		System.exit(success ? 0 : 1);
	}
}
-------------------------------------
package cn.feizhou.logenhance;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* maptask或者reducetask在最终输出时,先调用OutputFormat的getRecordWriter方法拿到一个RecordWriter
* 然后再调用RecordWriter的write(k,v)方法将数据写出
*
* @author
*
*/
/**
 * Custom OutputFormat that routes records to one of two fixed files by content.
 * The framework obtains a RecordWriter via getRecordWriter and then invokes
 * write(k, v) on it for every output record of the task.
 */
public class LogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {

	@Override
	public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
		FileSystem fs = FileSystem.get(context.getConfiguration());
		// One stream per destination file; write() decides per record which one to use.
		FSDataOutputStream orderStream = fs.create(new Path("H:/out/en/order.txt"));
		FSDataOutputStream pidStream = fs.create(new Path("H:/out/en/pid.txt"));
		return new EnhanceRecordWriter(orderStream, pidStream);
	}

	/**
	 * RecordWriter that inspects each key's text and appends it to either the
	 * order stream or the pid stream.
	 */
	static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
		FSDataOutputStream orderPathOs = null;
		FSDataOutputStream pidPathOs = null;

		public EnhanceRecordWriter(FSDataOutputStream orderPathOs, FSDataOutputStream pidPathOs) {
			super();
			this.orderPathOs = orderPathOs;
			this.pidPathOs = pidPathOs;
		}

		@Override
		public void write(Text key, NullWritable value) throws IOException, InterruptedException {
			String record = key.toString();
			// Order records go to H:/out/en/order.txt; everything else (product
			// records) goes to H:/out/en/pid.txt. No newline is appended — records
			// already carry a "--" separator from the mapper.
			FSDataOutputStream target = record.contains("order_") ? orderPathOs : pidPathOs;
			target.write(record.getBytes());
		}

		@Override
		public void close(TaskAttemptContext context) throws IOException, InterruptedException {
			// Release both streams; each guard tolerates a stream that was never opened.
			if (orderPathOs != null) {
				orderPathOs.close();
			}
			if (pidPathOs != null) {
				pidPathOs.close();
			}
		}
	}
}
测试数据orders.txt
order_0000001,22
order_0000001,22
order_0000002,22
pid_0000002,22
pid_0000002,22
xx_0000003,22
xx_0000006,22
xx_0000005,22
xx_0000004,22
结果
order.txt
order_0000001,22--order_0000001,22--order_0000002,22--
pid.txt
pid_0000002,22--pid_0000002,22--