An example I came across online (copied from elsewhere); some of the code was hard to follow at first.
Table 1
factoryname                   addressID
Beijing Red Star              1
Shenzhen Thunder              3
Guangzhou Honda               2
Beijing Rising                1
Guangzhou Development Bank    2
Tencent                       3
Back of Beijing               1

Table 2
addressID    addressname
1            Beijing
2            Guangzhou
3            Shenzhen
4            Xian
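The task is an inner join of the two tables on the address ID, producing (factoryname, addressname) pairs. As a baseline, here is a minimal in-memory sketch of the same join in plain Java (the demo class and variable names are my own, not from the original post):

import java.util.HashMap;
import java.util.Map;

public class InMemoryJoinDemo {
    public static void main(String[] args) {
        // Table 2: addressID -> addressname
        Map<String, String> address = new HashMap<String, String>();
        address.put("1", "Beijing");
        address.put("2", "Guangzhou");
        address.put("3", "Shenzhen");
        address.put("4", "Xian");
        // Table 1: factoryname, addressID
        String[][] factories = {
                { "Beijing Red Star", "1" }, { "Shenzhen Thunder", "3" },
                { "Guangzhou Honda", "2" }, { "Beijing Rising", "1" },
                { "Guangzhou Development Bank", "2" }, { "Tencent", "3" },
                { "Back of Beijing", "1" } };
        // Inner join on addressID
        System.out.println("factoryname\taddressname");
        for (String[] f : factories) {
            String name = address.get(f[1]);
            if (name != null) {
                System.out.println(f[0] + "\t" + name);
            }
        }
    }
}

MapReduce computes exactly this, except the lookup table cannot be assumed to fit in one process, so the join key becomes the shuffle key and each reduce call sees both sides of one key together.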
package HbaseCoprocessor;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class MoreTableMapReducer {
private static int time = 0;
public static class Map extends Mapper<Object,Text,Text,Text>{
// The map step first decides whether an input line belongs to the left or
// the right table, then splits the two columns: the join column goes into
// the key, the remaining column plus a table tag into the value.
public void map(Object key,Text value,Context context) throws IOException,InterruptedException{
String line = value.toString();
int i = 0;
// Skip the header line of either input file
if (line.contains("factoryname") || line.contains("addressID")) {
    return;
}
// Advance i to the first digit in the line (the addressID column)
while (line.charAt(i) < '0' || line.charAt(i) > '9') {
    i++;
}
// A line starting with a non-digit is from the left table (factory name first)
if (line.charAt(0) < '0' || line.charAt(0) > '9') {
    // Left table: key = addressID, value = "1+" + factoryname
    int j = i - 1;
    while (line.charAt(j) != ' ') j--;
    String[] values = { line.substring(0, j), line.substring(i) };
    context.write(new Text(values[1]), new Text("1+" + values[0]));
} else {
    // Right table: key = addressID, value = "2+" + addressname
    int j = i + 1;
    while (line.charAt(j) != ' ') j++;
    String[] values = { line.substring(0, i + 1), line.substring(j + 1) };
    context.write(new Text(values[0]), new Text("2+" + values[1]));
}
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text>{
// The reducer parses the map output, collects the values from the left and
// right tables separately, then emits their Cartesian product.
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException,InterruptedException{
if (time == 0) { // emit the header row once
context.write(new Text("factoryname"),new Text("addressname"));
time++;
}
int factorynum = 0;
String[] factory = new String[10];
int addressnum = 0;
String[] address = new String[10];
Iterator<Text> iter = values.iterator();
while (iter.hasNext()) {
    String record = iter.next().toString();
    char type = record.charAt(0);
    if (type == '1') { // left table: collect factory names
        factory[factorynum] = record.substring(2);
        factorynum++;
    } else {           // right table: collect address names
        address[addressnum] = record.substring(2);
        addressnum++;
    }
}
if (factorynum != 0 && addressnum != 0) { // Cartesian product of both sides
    for (int m = 0; m < factorynum; m++) {
        for (int n = 0; n < addressnum; n++) {
            context.write(new Text(factory[m]), new Text(address[n]));
        }
    }
}
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if(otherArgs.length != 2){
System.err.println("Usage WordCount <int> <out>");
System.exit(2);
}
Job job = new Job(conf, "multi table join");
job.setJarByClass(MoreTableMapReducer.class);
job.setMapperClass(Map.class);
// No combiner here: the reducer is not associative over the tagged values,
// so running it as a combiner would corrupt the join.
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
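What makes this version hard to read is the character-by-character scanning in map. A small standalone demo (my own, not part of the job, applying the same scanning logic as the map above) shows the (key, value) pair each kind of line turns into:

// Standalone demo: apply the map step's digit-scan parsing to one line
// from each table and print the (key, value) pair that would be emitted.
public class TagDemo {
    static void emit(String line) {
        int i = 0;
        while (line.charAt(i) < '0' || line.charAt(i) > '9') i++;
        if (line.charAt(0) < '0' || line.charAt(0) > '9') {
            // left table: scan back from the digit to the separating space
            int j = i - 1;
            while (line.charAt(j) != ' ') j--;
            System.out.println(line.substring(i) + " -> 1+" + line.substring(0, j));
        } else {
            // right table: scan forward from the digit to the separating space
            int j = i + 1;
            while (line.charAt(j) != ' ') j++;
            System.out.println(line.substring(0, i + 1) + " -> 2+" + line.substring(j + 1));
        }
    }
    public static void main(String[] args) {
        emit("Beijing Red Star 1"); // prints: 1 -> 1+Beijing Red Star
        emit("1 Beijing");          // prints: 1 -> 2+Beijing
    }
}

After the shuffle, all values for key 1 reach one reduce call together, e.g. {"1+Beijing Red Star", "1+Beijing Rising", "1+Back of Beijing", "2+Beijing"}, and the Cartesian product of the "1+" group with the "2+" group yields the joined rows.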
Later I worked through it myself and came up with a plainer version of my own.
The output is (the third column is the join key, i.e. the addressID):
factoryname addressname
Back of Beijing Beijing 1
Beijing Rising Beijing 1
Beijing Red Star Beijing 1
Guangzhou Development Bank Guangzhou 2
Guangzhou Honda Guangzhou 2
Tencent Shenzhen 3
Shenzhen Thunder Shenzhen 3
The code:
package HbaseCoprocessor;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MTjoin {
public static int time = 0;
/*
 * The map step first decides whether an input line belongs to the left or
 * the right table, then splits the two columns: the join column goes into
 * the key, the remaining column plus a table tag into the value.
 */
public static class Map extends Mapper<Object, Text, Text, Text> {
// The map function
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString(); // one input line
// Skip the header line of either input file
if (line.contains("factoryname") || line.contains("addressID")) {
    return;
}
String[] arr = line.split("\t");
// Decide which table the line came from by its first field:
// an addressID is a single character, a factory name is longer
String tableNum = "";
if (arr[0].length() > 1) {
    tableNum = "1"; // left table: key = addressID, value = "1_" + factoryname
    context.write(new Text(arr[1]), new Text(tableNum + "_" + arr[0]));
} else {
    tableNum = "2"; // right table: key = addressID, value = "2_" + addressname
    context.write(new Text(arr[0]), new Text(tableNum + "_" + arr[1]));
}
}
}
/*
 * The reducer parses the map output, saves the values from the left and
 * right tables separately, then emits their Cartesian product.
 */
public static class Reduce extends Reducer<Text, Text, Text, Text> {
// The reduce function
public void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
// Emit the header row once
if (0 == time) {
context.write(new Text("factoryname"), new Text("addressname"));
time++;
}
Iterator<Text> ite = values.iterator();
List<String> tableList = new ArrayList<String>();
String str2 = "";
while (ite.hasNext()) {
    String record = ite.next().toString();
    // Split off the table tag ("1" or "2") from the payload;
    // checking the tag itself avoids misreading a name that contains a digit
    String[] arr2 = record.split("_");
    if (arr2[0].equals("2")) {        // right table: the address name
        str2 = arr2[1];
    } else if (arr2[0].equals("1")) { // left table: a factory name
        tableList.add(arr2[1]);
    }
}
for (int i = 0; i < tableList.size(); i++) {
    context.write(new Text(tableList.get(i)), new Text(str2 + "\t" + key.toString()));
    System.out.println(key.toString()); // debug: log the join key
}
}
}
// Recursively delete the previous output directory so the job can be rerun
public static void deleteAllFilesOfDir(File path) {
if (!path.exists())
return;
if (path.isFile()) {
path.delete();
return;
}
File[] files = path.listFiles();
for (int i = 0; i < files.length; i++) {
deleteAllFilesOfDir(files[i]);
}
path.delete();
}
public static void main(String[] args) throws Exception {
File file= new File("F:\\hadoop\\out");
deleteAllFilesOfDir(file);
Configuration conf = new Configuration();
// This line matters: point hadoop.home.dir at a local Hadoop install
// so the job can run directly on Windows
System.setProperty("hadoop.home.dir",
        "F:/alljar/hadoop-2.4.1-x64/hadoop-2.4.1");
Job job = new Job(conf, "Multiple Table Join");
job.setJarByClass(MTjoin.class);
// Set the Mapper and Reducer classes
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Set the input and output directories
FileInputFormat.addInputPath(job, new Path("F:\\hadoop\\in"));
FileOutputFormat.setOutputPath(job, new Path("F:\\hadoop\\out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
This was originally an interview question. I did think of the approach at the time, but the interviewer described the fields unclearly, and being a bit nervous I couldn't sort out my thinking on the spot. Writing it up afterwards, it really isn't hard; I'm recording it here so I can face it calmly next time. I'd also like to try a three-table version sometime; the logic seems to be the same, and this is presumably roughly how Hive implements joins underneath.
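For the three-table case, only the map side really changes: each table gets its own tag, and the reducer buckets values into three lists before taking the product. A rough sketch of just the tagging (my own extension, not from the original post; it assumes a hypothetical third table, e.g. addressID -> postcode, and tells the tables apart by input file name rather than by field shape):

// Sketch only: map-side tagging for a three-table join on the same key.
// Reuses the imports from the listing above, plus FileSplit (referenced
// below by its full name). File name prefixes are illustrative assumptions.
public static class ThreeTableMap extends Mapper<Object, Text, Text, Text> {
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String fileName = ((org.apache.hadoop.mapreduce.lib.input.FileSplit)
                context.getInputSplit()).getPath().getName();
        String[] arr = value.toString().split("\t");
        if (fileName.startsWith("factory")) {        // table 1: factoryname, addressID
            context.write(new Text(arr[1]), new Text("1_" + arr[0]));
        } else if (fileName.startsWith("address")) { // table 2: addressID, addressname
            context.write(new Text(arr[0]), new Text("2_" + arr[1]));
        } else {                                     // table 3: addressID, postcode (hypothetical)
            context.write(new Text(arr[0]), new Text("3_" + arr[1]));
        }
    }
}

The reducer would then emit the three-way Cartesian product per key, which is essentially a reduce-side "common join".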