本例从多个表中提取出所需要的信息。
输入是2个文件,一个表示工厂表,包含工厂名和地址编号;另一个表示地址表,包含地址名和地址编号。根据2个表的信息输出工厂名-地址名表。
factory.txt:
factoryname | addressID |
Beijing Red Star | 1 |
Shenzhan Thunder | 3 |
Guangzhou Honda | 2 |
Beijing Rising | 1 |
Guangzhou Development Bank | 2 |
Tencent | 3 |
Bank of Beijing | 1 |
address.txt:
addressID | addressname |
1 | Beijing |
2 | Guangzhou |
3 | Shenzhen |
5 | Hangzhou |
输出:
factoryname | addressname |
Beijing Red Star | Beijing |
Beijing Rising | Beijing |
Bank of Beijing | Beijing |
Guangzhou Honda | Guangzhou |
Guangzhou Development Bank | Guangzhou |
Shenzhan Thunder | Shenzhen |
Tencent | Shenzhen |
在map阶段,对于每个输入以addressID为key进行保存;
来自factory.txt则存为:
<1,2:Beijing Red Star>
<3,2:Shenzhan Thunder>
...
来自address.txt则存为
<1,1:Beijing>
<2,1:Guangzhou>
...
这里的value开头的1: 2: 用来区分来自不同的表,将在reduce中用到。
在reduce阶段
对于相同的key,保存相应的factoryname和addressname。具体细节可参考上一篇单表关联部分。
2、程序代码
import java.io.IOException;
import java.util.StringTokenizer;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 * Multi-table join: joins a factory table (factoryname, addressID) with an
 * address table (addressID, addressname) on addressID, emitting
 * factoryname / addressname pairs.
 *
 * Map tags each record with its source table ("1:" = address, "2:" = factory)
 * keyed by addressID; reduce computes the cartesian product per addressID.
 */
public class MTjoin {

  public static class TokenizerMapper extends Mapper<Object, Text, Text, Text> {

    /**
     * Keys each input record by its addressID and tags it with its source:
     * address records become {@code <addressID, "1:addressname">},
     * factory records become {@code <addressID, "2:factoryname">}.
     * Header lines and lines without a digit (e.g. blank lines) are skipped
     * instead of throwing StringIndexOutOfBoundsException as before.
     */
    public void map(Object key, Text value, Context context)
        throws IOException, InterruptedException {
      String line = value.toString();
      // Skip the header line of either input file.
      if (line.contains("factoryname") || line.contains("addressID")) {
        return;
      }
      // Locate the first digit: it marks the addressID column.
      int i = 0;
      while (i < line.length() && (line.charAt(i) > '9' || line.charAt(i) < '0')) {
        i++;
      }
      if (i >= line.length()) {
        return; // no digit at all — nothing to join on
      }
      if (i == 0) {
        // address.txt record: "<addressID> <addressname>"
        int j = 1;
        while (j < line.length() && line.charAt(j) != ' ') {
          j++;
        }
        if (j >= line.length()) {
          return; // malformed: no address name after the ID
        }
        String addressId = line.substring(0, j);
        String addressName = line.substring(j + 1);
        context.write(new Text(addressId), new Text("1:" + addressName));
      } else {
        // factory.txt record: "<factoryname> <addressID>"
        // Scan left from the digit to the separating space; the factory name
        // itself may contain spaces, so only the last space is the separator.
        int j = i - 1;
        while (j >= 0 && line.charAt(j) != ' ') {
          j--;
        }
        if (j < 0) {
          return; // malformed: no separator before the ID
        }
        String factoryName = line.substring(0, j);
        String addressId = line.substring(i);
        context.write(new Text(addressId), new Text("2:" + factoryName));
      }
    }
  }

  public static class IntSumReducer extends Reducer<Text, Text, Text, Text> {
    // Emit the output header exactly once. NOTE(review): this relies on a
    // single reducer task; with multiple reducers each would emit a header.
    private int count = 0;

    /**
     * For one addressID, separates the tagged values back into factory names
     * and address names, then writes every factoryname/addressname pair.
     * Lists replace the original fixed String[10] arrays, which overflowed
     * with ArrayIndexOutOfBoundsException past 10 records per key.
     */
    public void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
      if (count == 0) {
        context.write(new Text("factoryname"), new Text("addressname"));
        count++;
      }
      List<String> factoryNames = new ArrayList<String>();
      List<String> addressNames = new ArrayList<String>();
      for (Text value : values) {
        String record = value.toString();
        if (record.length() <= 0) {
          continue;
        }
        // First character is the table tag written by the mapper.
        char type = record.charAt(0);
        if (type == '1') {
          addressNames.add(record.substring(2));
        } else if (type == '2') {
          factoryNames.add(record.substring(2));
        }
      }
      // Cartesian product: every factory at this addressID joins every
      // address name (normally exactly one).
      for (String factoryName : factoryNames) {
        for (String addressName : addressNames) {
          context.write(new Text(factoryName), new Text(addressName));
        }
      }
    }
  }

  /**
   * Job driver. Expects three paths: the factory input, the address input,
   * and the output directory.
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
      // Fixed: the original message claimed two arguments, but the job
      // consumes two input paths plus one output path.
      System.err.println("Usage: MTjoin <factory-in> <address-in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "MTjoin");
    job.setJarByClass(MTjoin.class);
    job.setMapperClass(TokenizerMapper.class);
    // No combiner: reduce is a join over tagged values, not an aggregation,
    // so running it map-side would produce wrong results.
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
参考资料《hadoop实战》陆嘉恒