Hadoop MapReduce join 合并操作

最新推荐文章于 2022-03-11 17:21:50 发布

阿朱__

最新推荐文章于 2022-03-11 17:21:50 发布

阅读量191

点赞数

分类专栏： Mapreducer 文章标签： mapreduce

本文链接：https://blog.csdn.net/qq_41371858/article/details/89537180

版权

Mapreducer 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

a表数据：

id year bb

1     2010     1999
1     2011     1998
2     2010     1997
2     2011     1996
4     2010     1995
4     2011     1994
9     2010     1993
9     2011     1992

b表数据：

id address

1     哈哈1
2     哈哈2
3     哈哈3
4     哈哈4
5     哈哈5
6     哈哈6
7     哈哈7
8     哈哈8
9     哈哈9

根据 b 表中 id 对应的 address，在 a 表每一行的末尾添加一列 address 字段（按 id 匹配）

输出结果：

1        1     2011     1998       哈哈1
1        1     2010     1999       哈哈1
2        2     2011     1996       哈哈2
2        2     2010     1997       哈哈2
4        4     2011     1994       哈哈4
4        4     2010     1995       哈哈4
9        9     2011     1992       哈哈9
9        9     2010     1993       哈哈9

代码：

package mapreduce.joinmapreducer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

public class Join{
    /**
     * Mapper side of the join: keys every input line by its first column (the id)
     * and prefixes the line with a tag naming the file it was read from, so the
     * reducer can tell the two tables apart.
     *
     * <p>Lines from {@code a.txt} are tagged {@code "a# "}, lines from
     * {@code b.txt} are tagged {@code "b# "}, and lines from any other file are
     * tagged {@code "c# "}.
     */
    static class Map extends Mapper<LongWritable, Text,LongWritable,Text>{
        // Column delimiter of an input line (a single space, not a comma).
        String splitStr = " ";

        /**
         * Emits {@code (id, tag + line)} for one input line.
         *
         * @param key     input key supplied by the framework; not used here
         * @param value   one line of an input file
         * @param context task context, used to find the source file name and to emit output
         * @throws NumberFormatException if the first column of a non-blank line is not a valid long
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // Name of the file this split belongs to; it selects the tag below.
            String inputname = ((FileSplit)context.getInputSplit()).getPath().getName();
            String line = value.toString();

            // Drop empty AND whitespace-only lines. Splitting a line made only of
            // spaces yields a zero-length array, so reading element 0 would throw.
            if (line.trim().isEmpty()) {
                return;
            }

            // Split on the delimiter and take the first column as the id.
            // Parsed as long because the output key is a LongWritable.
            String[] linesplite = line.split(splitStr);
            long id = Long.parseLong(linesplite[0]);

            // Choose the tag that identifies the source table.
            String tag;
            switch (inputname){
                case "a.txt":
                    tag = "a# ";
                    break;
                case "b.txt":
                    tag = "b# ";
                    break;
                default:
                    tag = "c# ";
                    break;
            }
            context.write(new LongWritable(id), new Text(tag + line));
        }
    }

    /**
     * Reducer side of the join: for one id, pairs every record tagged
     * {@code "a# "} with every record tagged {@code "b# "}. Records carrying any
     * other tag are ignored, and nothing is written when either side is empty.
     */
    static class Red extends Reducer<LongWritable,Text,LongWritable,Text> {
        private static final String A_TAG = "a# ";
        private static final String B_TAG = "b# ";

        /**
         * Writes one output record per (a row, b row) pair that shares {@code key}.
         * The value is the a row (everything after {@code "a#"}), two spaces, then
         * the part of the b row that follows its id column.
         *
         * @param key     the join id
         * @param values  tagged rows for this id
         * @param context task context used to emit the joined rows
         */
        @Override
        protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {

            List<String> a = new ArrayList<String>();
            List<String> b = new ArrayList<String>();
            for (Text t : values) {
                String record = t.toString();
                if (record.startsWith(A_TAG)) {
                    // Drop only the "a#" marker; the separating space and the id stay.
                    a.add(record.substring(2));
                } else if (record.startsWith(B_TAG)) {
                    b.add(afterIdColumn(record));
                }
            }

            for (String left : a) {
                for (String right : b) {
                    context.write(key, new Text(left + "  " + right));
                }
            }
        }

        /**
         * Returns what follows the id column of a {@code "b# "}-tagged row, including
         * the delimiter spaces in front of the next column. The id is located by its
         * first following space rather than by a fixed offset, so ids of any length
         * are removed completely. Returns an empty string when the row has no column
         * after the id.
         */
        private static String afterIdColumn(String taggedRow) {
            String row = taggedRow.substring(B_TAG.length());
            int idEnd = row.indexOf(' ');
            return idEnd < 0 ? "" : row.substring(idEnd);
        }
    }

    /**
     * Driver: configures the join job, clears the output directory, runs the job
     * and reports failure through the process exit status.
     */
    static class Mappereduce {
        /**
         * Runs the job over {@code /data/testdata/} and writes the result to
         * {@code /data/testdata/output}.
         *
         * @param args command-line arguments; not used
         * @throws Exception if the job cannot be configured or submitted
         */
        public static void main(String[] args)throws Exception{
            Configuration configuration = new Configuration();
            Job job = Job.getInstance(configuration);
            job.setJarByClass(Mappereduce.class);
            // Output directory.
            Path outputpath = new Path("/data/testdata/output");
            // Remove the output directory (recursively) so the job can create it again.
            FileSystem.get(configuration).delete(outputpath,true);
            // Mapper and reducer classes.
            job.setMapperClass(Map.class);
            job.setReducerClass(Red.class);
            // Key/value types emitted by the mapper and by the reducer.
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(LongWritable.class);
            job.setOutputValueClass(Text.class);

            // Input and output paths.
            FileInputFormat.addInputPath(job,new Path("/data/testdata/"));
            FileOutputFormat.setOutputPath(job,outputpath);

            // Surface a failed job as a non-zero exit status instead of ignoring it.
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        }
    }


}