Implementing custom multi-file output in MapReduce

In one of my projects I ran into a problem while processing logs: each record has to produce multiple keys. A log line looks like domain sip minf h b,

and during the map phase I need to emit both key: domain+minf, value: h+"|"+b and key: sip+minf, value: h+"|"+b. On top of that there is aggregation logic to apply, e.g. the values belonging to the same key have to be summed.
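
To make the keying concrete, here is a tiny sketch with made-up field values showing the two records one log line turns into (172.20.20.37 is the source IP the mapper later filters on):

public class KeyLayoutExample {
	public static void main(String[] args) {
		String domain = "example.com";   // hypothetical domain
		String sip = "172.20.20.37";     // source IP
		String minf = "201505010800";    // hypothetical minute field
		long h = 12, b = 3456;           // hypothetical hit and byte counts

		// key: domain + minf, value: h + "|" + b
		System.out.println("domain|" + domain + "|" + minf + " -> " + h + "|" + b);
		// key: sip + minf, value: h + "|" + b
		System.out.println("sip|" + sip + "|" + minf + " -> " + h + "|" + b);
	}
}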

With a plain MapReduce job, the results are written as multiple part-000* files, one per reducer, so there is no way to tell which kind of output landed in which file. That makes it awkward to feed the job's results into any downstream processing.

Here is how I handled it. Enough talk, on to the code:

ComplexKey is a class I wrote that implements the WritableComparable interface so the keys can be sorted; the point of sorting is to get identical keys processed in the same reduce call.

package sina.dip.logfilter.mr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class ComplexKey implements WritableComparable<ComplexKey> {

	private Text name;

	private Text value;

	private Text minf;

	public ComplexKey() {
		this.name = new Text();

		this.value = new Text();

		this.minf = new Text();
	}

	public ComplexKey(String name, String value, String minf) {
		this.name = new Text(name);

		this.value = new Text(value);

		this.minf = new Text(minf);
	}

	public Text getName() {
		return name;
	}

	public void setName(Text name) {
		this.name = name;
	}

	public Text getValue() {
		return value;
	}

	public void setValue(Text value) {
		this.value = value;
	}

	public Text getMinf() {
		return minf;
	}

	public void setMinf(Text minf) {
		this.minf = minf;
	}

	@Override
	public int compareTo(ComplexKey c) {
		int compare = 0;

		compare = name.compareTo(c.name);
		if (compare != 0) {
			return compare;
		}

		compare = value.compareTo(c.value);
		if (compare != 0) {
			return compare;
		}

		compare = minf.compareTo(c.minf);
		if (compare != 0) {
			return compare;
		}

		return 0;
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		name.readFields(in);

		value.readFields(in);

		minf.readFields(in);
	}

	@Override
	public void write(DataOutput out) throws IOException {
		name.write(out);

		value.write(out);

		minf.write(out);
	}

}
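
A quick sanity-check sketch, with hypothetical values, of how the framework round-trips a ComplexKey through write()/readFields() and orders keys with compareTo() (which, with the default grouping comparator, also decides which records share a reduce call):

package sina.dip.logfilter.mr;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class ComplexKeyDemo {
	public static void main(String[] args) throws Exception {
		ComplexKey original = new ComplexKey("domain", "example.com", "201505010800"); // hypothetical values

		// Serialize the key, then deserialize it into a fresh instance,
		// which is what Hadoop does when shuffling map output.
		DataOutputBuffer out = new DataOutputBuffer();
		original.write(out);

		DataInputBuffer in = new DataInputBuffer();
		in.reset(out.getData(), out.getLength());
		ComplexKey copy = new ComplexKey();
		copy.readFields(in);

		// compareTo() == 0 means the two keys are grouped into the same reduce call.
		System.out.println(original.compareTo(copy)); // prints 0
	}
}
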
The partitioner class:

package sina.dip.logfilter.mr;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ComplexKeyPartitioner extends Partitioner<ComplexKey, Text> {

	@Override
	public int getPartition(ComplexKey key, Text value, int numPartitions) {
		// Partition on the value field (domain or sip) only, so all records for the same
		// domain/sip reach the same reducer; masking the sign bit keeps the index
		// non-negative even when hashCode() is Integer.MIN_VALUE.
		return (key.getValue().hashCode() & Integer.MAX_VALUE) % numPartitions;
	}

}
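
Because only getValue() is hashed, all records for a given domain (or sip) land in the same reducer regardless of their minf. A quick sketch with hypothetical values:

package sina.dip.logfilter.mr;

import org.apache.hadoop.io.Text;

public class ComplexKeyPartitionerDemo {
	public static void main(String[] args) {
		ComplexKeyPartitioner partitioner = new ComplexKeyPartitioner();
		int numPartitions = 8; // hypothetical reduce count

		ComplexKey k1 = new ComplexKey("domain", "example.com", "201505010800");
		ComplexKey k2 = new ComplexKey("domain", "example.com", "201505010900");

		// Both calls print the same partition number.
		System.out.println(partitioner.getPartition(k1, new Text("12|3456"), numPartitions));
		System.out.println(partitioner.getPartition(k2, new Text("7|1024"), numPartitions));
	}
}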




The map stage:

package sina.dip.logfilter.mr;

import java.io.IOException;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/*
 * Per-record results are accumulated in mapoutput and written out together in cleanup().
 * Doing the summing on the map side takes load off the reducers: with the accumulation
 * logic in reduce, 100 GB of data took a bit over 10 minutes (the map phase alone was
 * about 1 minute); after moving it into the mapper, the whole job finished in under
 * two minutes.
 */
public class AnalysisMapper extends
		Mapper<LongWritable, Text, ComplexKey, Text> {
	private MultipleOutputs<ComplexKey, Text> outputs;
	private Map<String,String> mapoutput = new HashMap<String,String>();
	private Set<String> outputkeyset;
	private String[] mapkey;
	private String[] mapvalue;
	private BigInteger paravalue;
	protected void setup(Context context) throws IOException,
			InterruptedException {
		outputs = new MultipleOutputs<ComplexKey, Text>(context);
	};
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String line = value.toString();
		if (line == null || line.isEmpty()) {
			return;
		}

		String[] words = line.split("\t");
//		System.out.println("words.length:"+words.length);
		if (words.length != 17 && words.length != 18) {
//			System.out.println("line:"+value.toString());
			return;
		}

//		if (words[0] == null || words[0].isEmpty() || words[1] == null
//				|| words[1].isEmpty() || words[2] == null || words[2].isEmpty()
//				|| words[14] == null || words[14].isEmpty()
//				|| words[16] == null || words[16].isEmpty()) {
//			return;
//		}
		BigInteger hit,bit;
		Text hb;
//		System.out.println("words.length:"+words.length);
		// Only records whose source IP matches this address are processed.
		if(words[1].equals("172.20.20.37")){
			if(words.length == 17){
//				System.out.println("mapoutput17:"+mapoutput.size());
				
				hb = new Text(words[14] + "|" + words[16]);
				if(null != mapoutput.get("domain"+"|"+words[2]+"|"+words[0])){//如果结果中已经存在 domain|minf
					mapvalue = (mapoutput.get("domain"+"|"+words[2]+"|"+words[0])).toString().split("\\|");

					hit = new BigInteger(mapvalue[0]);
					bit = new BigInteger(mapvalue[1]);
					hit = hit.add(new BigInteger(words[14]));
					bit = bit.add(new BigInteger(words[16]));
					mapoutput.put("domain"+"|"+words[2]+"|"+words[0], hit+"|"+bit);
				}else{
					mapoutput.put("domain"+"|"+words[2]+"|"+words[0], words[14]+"|"+words[16]);
				}
				if(null != mapoutput.get("sip"+"|"+words[1]+"|"+words[0])){//如果结果中已经存在 sip|minf
					mapvalue = (mapoutput.get("sip"+"|"+words[1]+"|"+words[0])).toString().split("\\|");
					hit = new BigInteger(mapvalue[0]);
					bit = new BigInteger(mapvalue[1]);
					hit = hit.add(new BigInteger(words[14]));
					bit = bit.add(new BigInteger(words[16]));
					mapoutput.put("sip"+"|"+words[1]+"|"+words[0], hit+"|"+bit);
				}else{
					mapoutput.put("sip"+"|"+words[1]+"|"+words[0], words[14]+"|"+words[16]);
				}
			}else if(words.length == 18){
//				System.out.println("mapoutput18:"+mapoutput.size());
				hb = new Text(words[15] + "|" + words[17]);
				if(null != mapoutput.get("domain"+"|"+words[2]+"|"+words[0])){//如果结果中已经存在 domain|minf
					mapvalue = (mapoutput.get("domain"+"|"+words[2]+"|"+words[0])).toString().split("\\|");

					hit = new BigInteger(mapvalue[0]);
					bit = new BigInteger(mapvalue[1]);
					hit = hit.add(new BigInteger(words[15]));
					bit = bit.add(new BigInteger(words[17]));
					mapoutput.put("domain"+"|"+words[2]+"|"+words[0], hit+"|"+bit);
				}else{
					mapoutput.put("domain"+"|"+words[2]+"|"+words[0], words[15]+"|"+words[17]);
				}
				if(null != mapoutput.get("sip"+"|"+words[1]+"|"+words[0])){//如果结果中已经存在 sip|minf
					mapvalue = (mapoutput.get("sip"+"|"+words[1]+"|"+words[0])).toString().split("\\|");
					hit = new BigInteger(mapvalue[0]);
					bit = new BigInteger(mapvalue[1]);
					hit = hit.add(new BigInteger(words[15]));
					bit = bit.add(new BigInteger(words[17]));
					mapoutput.put("sip"+"|"+words[1]+"|"+words[0], hit+"|"+bit);
				}else{
					mapoutput.put("sip"+"|"+words[1]+"|"+words[0], words[15]+"|"+words[17]);
				}
			}
		}
		
	};
	// Emit the aggregated results here; each entry carries its own composite key.
	protected void cleanup(Context context) throws IOException,
			InterruptedException {
		outputkeyset = mapoutput.keySet();
		for(String outputkey : outputkeyset){
			mapkey = outputkey.split("\\|");
			if(mapkey[0].equals("domain")){
				mapvalue = mapoutput.get(outputkey).split("\\|");
//				System.out.println("domainh:"+mapvalue[0]);
				ComplexKey domain = new ComplexKey("domain", mapkey[1], mapkey[2]);
				Text hb = new Text(mapvalue[0] + "|" + mapvalue[1]);
				context.write(domain, hb);
			}else if(mapkey[0].equals("sip")){
				mapvalue = mapoutput.get(outputkey).split("\\|");
				ComplexKey sip = new ComplexKey("sip", mapkey[1], mapkey[2]);
				Text hb = new Text(mapvalue[0] + "|" + mapvalue[1]);
//				System.out.println("siph:"+mapvalue[0]);
				context.write(sip, hb);
			}
//			else if(mapkey[0].equals("httpcode")){
//				ComplexKey sip = new ComplexKey("httpcode", mapkey[1], mapkey[2]);
//				Text h = new Text(mapoutput.get(outputkey));
//				context.write(sip, h);
//			}
		}
		outputs.close();
	};
}


The reduce stage:

package sina.dip.logfilter.mr;

import java.io.IOException;
import java.math.BigInteger;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
/**
 * Apply key-specific logic, then write the result to the corresponding directory.
 */
public class AnalysisReducer extends Reducer<ComplexKey, Text, Text, Text> {

	private MultipleOutputs<Text, Text> outputs;

	protected void setup(Context context) throws IOException,
			InterruptedException {
		outputs = new MultipleOutputs<Text, Text>(context);
	};
	// Branch on the key name, apply the matching aggregation, and write to the matching directory.
	protected void reduce(ComplexKey key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		Text oKey = null,oValue = null;
		BigInteger h = new BigInteger("0"),b = new BigInteger("0");
		if(key.getName().toString().equals("sip") || key.getName().toString().equals("domain")){
			
			
			for (Text value : values) {
				String[] words = value.toString().split("\\|");
				h = h.add(new BigInteger(words[0]));
				b = b.add(new BigInteger(words[1]));
//				h += Integer.valueOf(words[0]);
//				b += Integer.valueOf(words[1]);
			}

			oKey = new Text(key.getValue() + "\t" + key.getMinf());
			oValue = new Text(h + "\t" + b);
		}else if(key.getName().toString().equals("httpcode")){
			for (Text value : values) {
				h = h.add(new BigInteger(value.toString()));
//				h += Integer.valueOf(value.toString());
			}

			oKey = new Text(key.getValue() + "\t" + key.getMinf());
			oValue = new Text(String.valueOf(h));
		}
		

		// Write according to the key name: a "domain" key, for example, ends up in
		// outputpath/domain/domain-part-000x. Using
		// outputs.write(oKey, oValue, key.getName().toString()) instead would write to
		// outputpath/domain-part-000x.
		outputs.write(oKey, oValue, key.getName().toString() + "/" + key.getName().toString());
	};

	protected void cleanup(Context context) throws IOException,
			InterruptedException {
		outputs.close();
	};

}

Finally, the job driver wires everything together:

package sina.dip.logfilter.mr;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import sina.dip.logfilter.DipFilterLogData;
import sina.dip.logfilter.config.LogConfig;
import sina.dip.logfilter.config.ServerConfig;
import sina.dip.logfilter.util.FileUtil;



public class AnalysisLoader {
	
	/**
	 * @param conf Hadoop configuration
	 * @param inputPath comma-separated list of input paths
	 * @param outPath output directory
	 * @param category label appended to the job name
	 * @return true if the job completed successfully
	 * @throws Exception
	 */
	public boolean run(Configuration conf, String inputPath, String outPath,String category)
	throws Exception {
		Job job = new Job(conf, "DIP_DIPLOGFILTER-"+category);
		// Ship the third-party jar through the DistributedCache so it is on the task
		// classpath (how this solves third-party dependency problems is covered in another post).
		DistributedCache.addFileToClassPath(new Path("/libs/hbase-0.92.1-cdh4.0.0-security.jar"), job.getConfiguration());
		job.setJarByClass(AnalysisLoader.class);


		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		job.setMapperClass(AnalysisMapper.class);
		job.setMapOutputKeyClass(ComplexKey.class);
		job.setMapOutputValueClass(Text.class);

		job.setPartitionerClass(ComplexKeyPartitioner.class);
//		job.setCombinerClass(AnalysisReducer.class);
		job.setReducerClass(AnalysisReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		job.setNumReduceTasks(LogConfig.reduceCount);
		String hdfs = ServerConfig.getHDFS();
		
		String[] inputPaths =inputPath.split(",");
		for (String p : inputPaths) {
			if (!p.startsWith(hdfs)) {
				p = hdfs + p;
			}
			MultipleInputs.addInputPath(job, new Path(p),TextInputFormat.class, AnalysisMapper.class);
		}
		
		FileOutputFormat.setOutputPath(job, new Path(outPath));
		
		return job.waitForCompletion(true);
	}
}
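
A minimal driver sketch, assuming hypothetical paths and a hypothetical category value. One optional tweak, not shown above: since all real output goes through MultipleOutputs, the default part-r-* files come out empty, and LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class) inside run() can suppress them.

package sina.dip.logfilter.mr;

import org.apache.hadoop.conf.Configuration;

public class AnalysisLoaderDriver {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();

		// Hypothetical paths and category -- substitute real ones.
		String inputPath = "/dip/logs/2015050100,/dip/logs/2015050101"; // comma-separated inputs
		String outPath = "/dip/output/2015050100";
		String category = "access";

		boolean success = new AnalysisLoader().run(conf, inputPath, outPath, category);
		System.exit(success ? 0 : 1);
	}
}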

