MapReduce Multi-Table Join


Here is an example I found elsewhere online; some of the code was hard for me to follow at first.

Table 1

factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1

Table 2

addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian

package HbaseCoprocessor;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MoreTableMapReducer {
    // Used to write the header row only once (assumes a single reducer task).
    private static int time = 0;

    public static class Map extends Mapper<Object, Text, Text, Text> {
        // The map first decides whether the input line belongs to the left table
        // (factory) or the right table (address), then splits the two columns:
        // the join column (addressID) becomes the key, and the remaining column
        // plus a table tag ("1+" for left, "2+" for right) becomes the value.
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            int i = 0;
            // Skip the header line of each input file.
            if (line.contains("factoryname") || line.contains("addressID")) {
                return;
            }
            // Find the first digit, i.e. the start of the addressID field.
            while (line.charAt(i) < '0' || line.charAt(i) > '9') {
                i++;
            }
            if (line.charAt(0) < '0' || line.charAt(0) > '9') {
                // Left table: "factoryname addressID"
                int j = i - 1;
                while (line.charAt(j) != ' ') j--;
                String[] values = { line.substring(0, j), line.substring(i) };
                context.write(new Text(values[1]), new Text("1+" + values[0]));
            } else {
                // Right table: "addressID addressname"
                int j = i + 1;
                while (line.charAt(j) != ' ') j++;
                String[] values = { line.substring(0, j), line.substring(j + 1) };
                context.write(new Text(values[0]), new Text("2+" + values[1]));
            }
        }
    }

    public static class Reduce extends Reducer<Text, Text, Text, Text> {
        // The reduce parses the map output, stores the values separately for the
        // left and right tables, then emits their Cartesian product.
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            if (time == 0) { // first output line: the header
                context.write(new Text("factoryname"), new Text("addressname"));
                time++;
            }
            int factorynum = 0;
            String[] factory = new String[10]; // assumes at most 10 factories per addressID
            int adressnum = 0;
            String[] adress = new String[10];  // assumes at most 10 addresses per addressID
            for (Text val : values) {
                String record = val.toString();
                char type = record.charAt(0);
                if (type == '1') { // left table
                    factory[factorynum] = record.substring(2);
                    factorynum++;
                } else {           // right table
                    adress[adressnum] = record.substring(2);
                    adressnum++;
                }
            }
            if (factorynum != 0 && adressnum != 0) { // Cartesian product
                for (int m = 0; m < factorynum; m++) {
                    for (int n = 0; n < adressnum; n++) {
                        context.write(new Text(factory[m]), new Text(adress[n]));
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: MoreTableMapReducer <in> <out>");
            System.exit(2);
        }
        Job job = Job.getInstance(conf, "multi table join");
        job.setJarByClass(MoreTableMapReducer.class);
        job.setMapperClass(Map.class);
        // No combiner: the reducer's output (factoryname, addressname) is not a
        // valid map output, so combining would break the join.
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
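To make the tagging step concrete, here is a small standalone sketch of my own (not part of the original job) that applies the same logic as the mapper above to one sample row from each table and prints the intermediate key/value pairs. The class name TagDemo and the hard-coded sample lines are illustration only, and the substring offsets are simplified on the assumption that a single space separates the fields.

// Standalone illustration: apply the mapper's tagging logic to one sample row
// from each table and print the intermediate (key -> value) records.
public class TagDemo {
    public static void main(String[] args) {
        String[] lines = { "Beijing Red Star 1",  // a left-table (factory) row
                           "1 Beijing" };         // a right-table (address) row
        for (String line : lines) {
            int i = 0;
            // find the first digit, i.e. the start of the addressID field
            while (line.charAt(i) < '0' || line.charAt(i) > '9') i++;
            if (line.charAt(0) < '0' || line.charAt(0) > '9') {
                // left table: key = addressID, value = "1+" + factoryname
                System.out.println(line.substring(i) + " -> 1+" + line.substring(0, i - 1));
            } else {
                // right table: key = addressID, value = "2+" + addressname
                System.out.println(line.substring(0, i + 1) + " -> 2+" + line.substring(i + 2));
            }
        }
    }
}

This prints "1 -> 1+Beijing Red Star" and "1 -> 2+Beijing". After the shuffle, the reducer for key 1 receives all of the "1+" factory records for Beijing together with "2+Beijing", and the nested loops emit one joined row per factory.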
Later I worked through it myself and put together a more straightforward version of my own.

The output is:

factoryname	addressname
Back of Beijing	Beijing	1
Beijing Rising	Beijing	1
Beijing Red Star	Beijing	1
Guangzhou Development Bank	Guangzhou	2
Guangzhou Honda	Guangzhou	2
Tencent	Shenzhen	3
Shenzhen Thunder	Shenzhen	3
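Note that each row has a third column: this version's reducer appends the join key (the addressID) after the address name, while the header row it writes only covers the first two columns.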

The code is:

package HbaseCoprocessor;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MTjoin {

	// Used to write the header row only once (assumes a single reducer task).
	public static int time = 0;

	/*
	 * The map first decides whether the input line belongs to the left table or
	 * the right table, then splits the two columns: the join column (addressID)
	 * goes into the key, and the remaining column plus a table tag goes into the
	 * value.
	 */
	public static class Map extends Mapper<Object, Text, Text, Text> {
		public void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString(); // one input line
			// Skip the header line of each input file.
			if (line.contains("factoryname") || line.contains("addressID")) {
				return;
			}
			String[] arr = line.split("\t"); // columns are assumed to be tab-separated
			// Decide which table the line comes from by looking at the first
			// field: addressIDs are single characters, factory names are longer.
			String tableNum;
			if (arr[0].length() > 1) {
				tableNum = "1";
				context.write(new Text(arr[1]), new Text(tableNum + "_" + arr[0]));
			} else {
				tableNum = "2";
				context.write(new Text(arr[0]), new Text(tableNum + "_" + arr[1]));
			}
		}
	}

	/*
	 * The reduce parses the map output, stores the values separately for the left
	 * and right tables, then emits their Cartesian product.
	 */
	public static class Reduce extends Reducer<Text, Text, Text, Text> {
		public void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// Write the header row once.
			if (0 == time) {
				context.write(new Text("factoryname"), new Text("addressname"));
				time++;
			}
			Iterator<Text> ite = values.iterator();
			List<String> tableList = new ArrayList<String>();
			String str2 = "";
			while (ite.hasNext()) {
				String record = ite.next().toString();
				// Split the table tag ("1_" or "2_") from the payload.
				String[] arr2 = record.split("_", 2);
				if (record.startsWith("2_")) {
					str2 = arr2[1];            // address name from table 2
				} else if (record.startsWith("1_")) {
					tableList.add(arr2[1]);    // factory name from table 1
				}
			}
			for (int i = 0; i < tableList.size(); i++) {
				// factoryname, then addressname and the join key (addressID)
				context.write(new Text(tableList.get(i)), new Text(str2 + "\t" + key.toString()));
				System.out.println(key.toString()); // debug output
			}
		}
	}

	// Recursively delete the local output directory so the job can be re-run.
	public static void deleteAllFilesOfDir(File path) {
		if (!path.exists())
			return;
		if (path.isFile()) {
			path.delete();
			return;
		}
		File[] files = path.listFiles();
		for (int i = 0; i < files.length; i++) {
			deleteAllFilesOfDir(files[i]);
		}
		path.delete();
	}

	public static void main(String[] args) throws Exception {
		File file = new File("F:\\hadoop\\out");
		deleteAllFilesOfDir(file);

		Configuration conf = new Configuration();

		// This line matters when running locally on Windows: it tells Hadoop
		// where to find its native binaries (winutils.exe).
		System.setProperty("hadoop.home.dir",
				"F:/alljar/hadoop-2.4.1-x64/hadoop-2.4.1");

		Job job = Job.getInstance(conf, "Multiple Table Join");
		job.setJarByClass(MTjoin.class);

		// Set the Map and Reduce classes.
		job.setMapperClass(Map.class);
		job.setReducerClass(Reduce.class);

		// Set the output types.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Set the input and output directories.
		FileInputFormat.addInputPath(job, new Path("F:\\hadoop\\in"));
		FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop\\out"));

		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

This was originally an interview question. I did think of the approach at the time, but the interviewer's description of the fields was unclear and, being a bit nervous, I couldn't sort out my thinking on the spot. Writing it afterwards, it turned out not to be hard, so I'm recording it here; next time I can handle it calmly. I also want to try a three-table version, since the logic should be much the same (a rough sketch follows below), and this is presumably roughly what Hive does under the hood.
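For the three-table case, the same tagging idea extends directly: the mapper tags each record with its table of origin ("1_", "2_", "3_"), all keyed by addressID, and the reducer keeps one list per tag and emits the three-way cross product. Below is a minimal sketch of such a reducer; the third table (mapping addressID to, say, a country), the class name ThreeTableReduce, and the "3_" tag are all made up for illustration.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sketch only: the reducer for a hypothetical three-table join on addressID.
// Assumes the mapper tags records "1_" (factory), "2_" (address) and "3_"
// (a made-up third table, e.g. addressID -> country), all keyed by addressID.
public class ThreeTableReduce extends Reducer<Text, Text, Text, Text> {
	@Override
	public void reduce(Text key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		List<String> factories = new ArrayList<String>();
		List<String> addresses = new ArrayList<String>();
		List<String> countries = new ArrayList<String>();
		for (Text v : values) {
			String record = v.toString();
			String payload = record.substring(2);          // strip the "N_" tag
			if (record.startsWith("1_")) factories.add(payload);
			else if (record.startsWith("2_")) addresses.add(payload);
			else if (record.startsWith("3_")) countries.add(payload);
		}
		// three-way cross product for this addressID
		for (String f : factories)
			for (String a : addresses)
				for (String c : countries)
					context.write(new Text(f), new Text(a + "\t" + c));
	}
}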
