本例从多个表中提取出所需要的信息。
输入是2个文件,一个表示工厂表,包含工厂名和地址编号;另一个表示地址表,包含地址名和地址编号。根据2个表的信息输出工厂名-地址名表。
factory.txt:
factoryname | addressID |
Beijing Red Star | 1 |
Shenzhan Thunder | 3 |
Guangzhou Honda | 2 |
Beijing Rising | 1 |
Guangzhou Development Bank | 2 |
Tencent | 3 |
Bank of Beijing | 1 |
address.txt:
addressID | addressname |
1 | Beijing |
2 | Guangzhou |
3 | Shenzhen |
5 | Hangzhou |
输出:
factoryname | addressname |
Beijing Red Star | Beijing |
Beijing Rising | Beijing |
Bank of Beijing | Beijing |
Guangzhou Honda | Guangzhou |
Guangzhou Development Bank | Guangzhou |
Shenzhan Thunder | Shenzhen |
Tencent | Shenzhen |
在map阶段,对于每个输入以addressID为key进行保存;
来自factory.txt则存为:
<1,2:Beijing Red Star>
<3,2:Shenzhan Thunder>
...
来自address.txt则存为
<1,1:Beijing>
<2,1:Guangzhou>
...
这里的value开头的1: 2: 用来区分来自不同的表,将在reduce中用到。
在reduce阶段
对于相同的key,保存相应的factoryname和addressname。具体细节可参考上一篇单表关联部分。
2、程序代码
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Multi-table join: joins a factory table (factoryname, addressID) with an
 * address table (addressID, addressname) on addressID, emitting
 * factoryname / addressname pairs.
 *
 * Map tags each record with its source table ("1:" = address, "2:" = factory)
 * keyed by addressID; reduce computes the cartesian product per addressID.
 */
public class MTjoin {

  public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

    /**
     * Keys each input record by its addressID and tags it with its source:
     * address records become {@code <addressID, "1:addressname">},
     * factory records become {@code <addressID, "2:factoryname">}.
     * Header lines and lines without a digit (e.g. blank lines) are skipped
     * instead of throwing StringIndexOutOfBoundsException as before.
     */
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      // Skip the header line of either input file.
      if (line.contains("factoryname") || line.contains("addressID")) {
        return;
      }
      // Locate the first digit: it marks the addressID column.
      int i = 0;
      while (i < line.length() && (line.charAt(i) > '9' || line.charAt(i) < '0')) {
        i++;
      }
      if (i >= line.length()) {
        return; // no digit at all — nothing to join on
      }
      if (i == 0) {
        // address.txt record: "<addressID> <addressname>"
        int j = 1;
        while (j < line.length() && line.charAt(j) != ' ') {
          j++;
        }
        if (j >= line.length()) {
          return; // malformed: no address name after the ID
        }
        String addressId = line.substring(0, j);
        String addressName = line.substring(j + 1);
        context.write(new Text(addressId), new Text("1:" + addressName));
      } else {
        // factory.txt record: "<factoryname> <addressID>"
        // Scan left from the digit to the separating space; the factory name
        // itself may contain spaces, so only the last space is the separator.
        int j = i - 1;
        while (j >= 0 && line.charAt(j) != ' ') {
          j--;
        }
        if (j < 0) {
          return; // malformed: no separator before the ID
        }
        String factoryName = line.substring(0, j);
        String addressId = line.substring(i);
        context.write(new Text(addressId), new Text("2:" + factoryName));
      }
    }
  }

  public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {
    // Emit the output header exactly once. NOTE(review): this relies on a
    // single reducer task; with multiple reducers each would emit a header.
    private int count = 0;

    /**
     * For one addressID, separates the tagged values back into factory names
     * and address names, then writes every factoryname/addressname pair.
     * Lists replace the original fixed String[10] arrays, which overflowed
     * with ArrayIndexOutOfBoundsException past 10 records per key.
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      if (count == 0) {
        context.write(new Text("factoryname"), new Text("addressname"));
        count++;
      }
      List<String> factoryNames = new ArrayList<String>();
      List<String> addressNames = new ArrayList<String>();
      for (Text value : values) {
        String record = value.toString();
        if (record.length() <= 0) {
          continue;
        }
        // First character is the table tag written by the mapper.
        char type = record.charAt(0);
        if (type == '1') {
          addressNames.add(record.substring(2));
        } else if (type == '2') {
          factoryNames.add(record.substring(2));
        }
      }
      // Cartesian product: every factory at this addressID joins every
      // address name (normally exactly one).
      for (String factoryName : factoryNames) {
        for (String addressName : addressNames) {
          context.write(new Text(factoryName), new Text(addressName));
        }
      }
    }
  }

  /**
   * Job driver. Expects three paths: the factory input, the address input,
   * and the output directory.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
      // Fixed: the original message claimed two arguments, but the job
      // consumes two input paths plus one output path.
      System.err.println("Usage: MTjoin <factory-in> <address-in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "MTjoin");
    job.setJarByClass(MTjoin.class);
    job.setMapperClass(TokenizerMapper.class);
    // No combiner: reduce is a join over tagged values, not an aggregation,
    // so running it map-side would produce wrong results.
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
参考资料《hadoop实战》陆嘉恒