MultipleOutputs实战：结果输出到多个文件夹或者文件中

最新推荐文章于 2020-05-07 16:55:39 发布

nana-li

最新推荐文章于 2020-05-07 16:55:39 发布

阅读量398

点赞数

分类专栏： Programming 文章标签： Hadoop开发 MultipleOutputs 实例

Programming 专栏收录该内容

25 篇文章 2 订阅

订阅专栏

转载链接： http://blog.csdn.net/garychenqin/article/details/48339327（在原文基础上增加了代码分析、执行和结果）

1、原理技术

输出到多个文件或多个文件夹，驱动中不需要额外改变，只需要在MapClass或Reduce类中加入如下代码：

private MultipleOutputs<Text,IntWritable> mos;
public void setup(Context context) throws IOException,InterruptedException {
　　mos = new MultipleOutputs(context);
}
public void cleanup(Context context) throws IOException,InterruptedException {
　　mos.close();
}

然后就可以用mos.write(Key key, Value value, String baseOutputPath)代替context.write(key, value)来输出。
在MapClass或Reduce中使用时，输出目录下仍会生成默认的part-m-00*或part-r-00*文件，不过如果没有调用context.write，这些文件是无内容的，大小为0。另外要注意：在Mapper中通过mos.write写出的数据会直接写入输出文件，不会传给Reduce；只有通过context.write输出的数据才会传给Reduce。

注意：multipleOutputs.write(key, value, baseOutputPath)方法的第三个参数指定了该输出所在的路径（相对于用户指定的输出目录）。
如果baseOutputPath不包含文件分隔符“/”，那么输出的文件名格式为baseOutputPath-r-nnnnn（即name-r-nnnnn）；
如果包含文件分隔符“/”，例如baseOutputPath=“029070-99999/1901/part”，那么输出文件则为029070-99999/1901/part-r-nnnnn。

2、案例-需求

需求：下面是一些测试数据，要求把这些数据输出到2个文件中，一个文件中只包含key，另一个文件中包含key和value：

1512,iphone5s,4英寸,指纹识别,A7处理器,64位,M7协处理器,低功耗

1512,iphone5,4英寸,A6处理器,IOS7

1512,iphone4s,3.5英寸,A5处理器,双核,经典

50019780,ipad,9.7英寸,retina屏幕,丰富的应用

50019780,yoga,联想,待机18小时,外形独特

50019780,nexus 7,华硕&google,7英寸

50019780,ipad mini 2,retina显示屏,苹果,7.9英寸

1101,macbook air,苹果超薄,OS X mavericks

1101,macbook pro,苹果,OS X lion

1101,thinkpad yoga,联想,windows 8,超级本
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

3、代码

（1）Mapper程序：

package cn.edu.bjut.multioutput;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that keys every input line by its leading numeric field.
 *
 * <p>The text before the first "," is parsed as the integer key and the complete,
 * unmodified input line is emitted as the value. For example, the line
 * {@code 1101,thinkpad yoga,联想,windows 8,超级本} is emitted with key {@code 1101}
 * and the whole line as its value.
 *
 * <p>Blank lines are ignored. Lines whose leading field is not a valid integer are
 * skipped and counted in the {@code MultiOutPutMapper/MALFORMED_KEY} counter instead
 * of failing the task.
 */
public class MultiOutPutMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

    /** Counter group used to report input lines that could not be keyed. */
    private static final String COUNTER_GROUP = "MultiOutPutMapper";

    /** Counter name for lines whose leading field is not an integer. */
    private static final String MALFORMED_KEY_COUNTER = "MALFORMED_KEY";

    /**
     * Emits (leading integer field, whole line) for one input line.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one line of input text
     * @param context task context used to emit the pair and update counters
     * @throws IOException          if writing the output pair fails
     * @throws InterruptedException if the write is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return; // blank line: nothing to emit
        }

        // Only the text before the first comma is needed, so locate it directly rather
        // than splitting the whole line. A line without any comma is treated as a bare key.
        int comma = line.indexOf(',');
        String keyField = (comma < 0 ? line : line.substring(0, comma)).trim();

        final int id;
        try {
            id = Integer.parseInt(keyField);
        } catch (NumberFormatException e) {
            // Covers an empty leading field (e.g. ",abc" or ",") and non-numeric text.
            context.getCounter(COUNTER_GROUP, MALFORMED_KEY_COUNTER).increment(1);
            return;
        }

        context.write(new IntWritable(id), value);
    }

}

 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23

（2）Reducer程序：

package cn.edu.bjut.multioutput;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/**
 * Reducer that forwards every incoming value to two named outputs through
 * {@link MultipleOutputs}. The key is not part of the written record: both writes
 * use a {@link NullWritable} key and the value text as the record.
 */
public class MultiOutPutReducer extends
        Reducer<IntWritable, Text, NullWritable, Text> {

    // Created in setup(), closed and cleared in cleanup().
    private MultipleOutputs<NullWritable, Text> outputs;

    /** Creates the MultipleOutputs helper bound to this task's context. */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        outputs = new MultipleOutputs<NullWritable, Text>(context);
    }

    /**
     * Writes each value of the group twice: once to the "KeySpilt" named output under
     * a base path derived from the key, and once to the "AllPart" named output.
     *
     * @param key     the integer key shared by the grouped values
     * @param values  the values grouped under {@code key}
     * @param context task context (not written to directly)
     */
    @Override
    protected void reduce(IntWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        NullWritable noKey = NullWritable.get();
        for (Text record : values) {
            // Named output "KeySpilt" with base output path "<key>/": the record (the
            // value text, not just the key) goes to a path under a per-key directory.
            // NOTE(review): since the base path ends with "/", the generated file name
            // is presumably just the "-r-nnnnn" suffix inside "<key>/" -- confirm.
            String perKeyPath = key.toString() + "/";
            outputs.write("KeySpilt", noKey, record, perKeyPath);

            // Named output "AllPart" with no base path: the same record is written to
            // the shared "AllPart" output (presumably files named AllPart-r-nnnnn).
            outputs.write("AllPart", noKey, record);
        }
    }

    /** Closes the MultipleOutputs helper, if it was created, and drops the reference. */
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        if (outputs == null) {
            return;
        }
        outputs.close();
        outputs = null;
    }

}

 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
 
 1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42

（3）主程序：

package cn.edu.bjut.multioutput;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Driver for the MultipleOutputs example job.
 *
 * <p>Usage: {@code MainJob <input path> <output path>}. The output path is deleted
 * recursively if it already exists. The process exit status is 0 when the job
 * succeeds, 1 when it fails and 2 when the arguments are missing.
 */
public class MainJob {

    /**
     * Configures and runs the job, then exits with a status reflecting the outcome.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if job setup, file system access or job execution fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: MainJob <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        // "MultiOutput" is the job name. The Job constructors are deprecated in favour
        // of the static factory.
        Job job = Job.getInstance(conf, "MultiOutput");
        job.setJarByClass(MainJob.class);

        job.setMapperClass(MultiOutPutMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MultiOutPutReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        // The named outputs must be registered here under exactly the names the reducer
        // passes to MultipleOutputs.write.
        MultipleOutputs.addNamedOutput(job, "KeySpilt", TextOutputFormat.class, NullWritable.class, Text.class);
        MultipleOutputs.addNamedOutput(job, "AllPart", TextOutputFormat.class, NullWritable.class, Text.class);

        Path outPath = new Path(args[1]);
        // Resolve the file system from the path itself so that an output path on a
        // non-default file system (fully-qualified URI) is handled correctly.
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            // Remove a previous run's output; the job refuses to start otherwise.
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Propagate job success/failure to the caller instead of always exiting with 0.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}