hadoop学习--多表关联

本例从多个表中提取出所需要的信息。

输入是2个文件,一个表示工厂表,包含工厂名和地址编号;另一个表示地址表,包含地址名和地址编号。根据2个表的信息输出工厂名-地址名表。

factory.txt:

factoryname addressID
Beijing Red Star 1
Shenzhan Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1

address.txt:

addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
5 Hangzhou

输出:

factoryname	addressname
Beijing Red Star	Beijing
Beijing Rising	Beijing
Bank of Beijing	Beijing
Guangzhou Honda	Guangzhou
Guangzhou Development Bank	Guangzhou
Shenzhan Thunder	Shenzhen
Tencent	Shenzhen
1、设计思路

在map阶段,对每条输入记录以addressID为key进行保存;

来自factory.txt则存为:

<1,2:Beijing Red Star>

<3,2:Shenzhan Thunder>

...

来自address.txt则存为

<1,1:Beijing>

<2,1:Guangzhou>

...

这里value开头的“1:”和“2:”用来区分记录来自哪张表(“1:”表示来自地址表address.txt,“2:”表示来自工厂表factory.txt),将在reduce中用到。


在reduce阶段

对于相同的key,保存相应的factoryname和addressname。具体细节可参考上一篇单表关联部分。

2、程序代码

import java.io.IOException;
import java.util.StringTokenizer;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MTjoin {

  public static class TokenizerMapper 
       extends Mapper<Object, Text, Text, Text>{
      
    String tabletype = new String();
    String childname = new String();
    String parentname = new String();
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      String line = value.toString();
      int i = 0;
      if(line.contains("factoryname") == true || line.contains("addressID") == true)
      {
      	return;
      }
      while(line.charAt(i) > '9' || line.charAt(i) < '0')
      {
      	i++;
      }
      if(i == 0)		//address
      {
      	int j = i + 1;
      	while(line.charAt(j) != ' ')j++;
      	String[] values = {line.substring(0,j),line.substring(j+1)};
      	context.write(new Text(values[0]),new Text("1:" + values[1]));
      }
      else				//name
      {
      	int j = i - 1;
      	while(line.charAt(j) != ' ')j--;
      	String[] values = {line.substring(0,j),line.substring(i)};
      	context.write(new Text(values[1]),new Text("2:" + values[0]));
      }
    }
  }
  
  public static class IntSumReducer 
       extends Reducer<Text,Text,Text,Text> {
    private IntWritable result = new IntWritable();
	int count = 0;

    public void reduce(Text key, Iterable<Text> values, 
                       Context context
                       ) throws IOException, InterruptedException {
      if(count == 0)
      {
      	context.write(new Text("factoryname"),new Text("addressname"));
      	count++;
      }
      int factorynum = 0;
	  int addressnum = 0;
	  String[] factoryname = new String[10];
	  String[] addressname = new String[10];
      String strrecord = new String();
      String[] strArr = new String[3];
      Iterator ite = values.iterator();
      while(ite.hasNext())
      {
      	strrecord = ite.next().toString();
      	if(strrecord.length()<=0)
      	{
      		continue;
      	}
 		char type = strrecord.charAt(0);
      	if(type == '1')
      	{
      		addressname[addressnum++] = strrecord.substring(2);
      	}
      	else if(type == '2')
      	{
      		factoryname[factorynum++] = strrecord.substring(2);
      	}
      }
      if(factorynum != 0 && addressnum != 0)
      {
      	for(int i = 0;i < factorynum;i++)
      	{
      		for(int j = 0; j < addressnum;j++)
      		{
      			context.write(new Text(factoryname[i]),new Text(addressname[j]));
      		}
      	}
      }
      
    }
  }

  /**
   * Configures and runs the join job.
   *
   * <p>Expects exactly three arguments after generic Hadoop options are
   * removed: two input paths (the factory table and the address table, in
   * either order) and the output path. Exits with status 2 on a usage error,
   * 0 if the job succeeds and 1 if it fails.
   *
   * @param args command-line arguments, possibly including generic Hadoop options
   * @throws Exception if job setup or execution fails
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
      System.err.println("Usage: MTjoin <factory-in> <address-in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "MTjoin");
    job.setJarByClass(MTjoin.class);
    job.setMapperClass(TokenizerMapper.class);
    // No combiner is set: IntSumReducer emits joined rows and a header rather
    // than tagged values, so it cannot be reused as a combiner.
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // Both tables are fed to the same mapper, which tags each record by layout.
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

参考资料《hadoop实战》陆嘉恒

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值