Single-Table Join
A "single-table join" asks us to find the data of interest within the given data; it mines information that is only implicit in the original dataset, by joining a table with itself.
Example description:
Input: a child-parent table. Output: the corresponding grandchild-grandparent table.
Sample input:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
Sample output:
grandchild grandparent
Tom Alice
Tom Jesse
Jone Alice
Jone Jesse
Tom Mary
Tom Ben
Jone Mary
Jone Ben
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
This example implements a single-table self-join: the parent column of the left table is joined with the child column of the right table, where the left and right tables are the same table. The shuffle phase groups records with the same key together, so by setting the map output key to the join column, rows that share a value in that column are brought together automatically.
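Concretely, every input record is emitted twice, once per table role. Below is a minimal plain-Java sketch (no Hadoop; the class name is illustrative, and the tab-separated key/value layout with a comma-delimited value is an assumption matching the code later in this section) of the two tagged records produced for the single line "Lucy Mary":

public class DualEmitSketch {
    public static void main(String[] args) {
        String child = "Lucy", parent = "Mary";
        // Right-table record, tag "2": keyed by the child, so it meets
        // records in which Lucy appears as a parent.
        System.out.println(child + "\t2," + child + "," + parent);   // Lucy   2,Lucy,Mary
        // Left-table record, tag "1": keyed by the parent, so it meets
        // records in which Mary appears as a child.
        System.out.println(parent + "\t1," + child + "," + parent);  // Mary   1,Lucy,Mary
    }
}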
Implementation steps:
1. In the map, split each input line into child and parent. Emit parent as the key and the child-parent pair as the value, representing the left table; then emit child as the key and the same pair as the value, representing the right table.
2. To tell the two tables apart, prefix each value with a String tag: "1" marks the left table, "2" marks the right table.
3. The reduce receives the joined result: for each key, the value list contains the grandchild-grandparent relationship. Parse each value, collecting the child field of left-table records into one list and the parent field of right-table records into another; the Cartesian product of the two lists is the result (a plain-Java sketch of this step follows the walkthrough below).
Walkthrough. Map input:
tom lucy
tom jack
jone lucy
jone jack
lucy mary
lucy ben
jack alice
jack jesse
Map output (key, tagged value):
tom    2,tom,lucy
lucy   1,tom,lucy
tom    2,tom,jack
jack   1,tom,jack
jone   2,jone,lucy
lucy   1,jone,lucy
jone   2,jone,jack
jack   1,jone,jack
lucy   2,lucy,mary
mary   1,lucy,mary
lucy   2,lucy,ben
ben    1,lucy,ben
jack   2,jack,alice
alice  1,jack,alice
jack   2,jack,jesse
jesse  1,jack,jesse
Reduce input, with values grouped by key:
jack   [1,tom,jack  1,jone,jack  2,jack,alice  2,jack,jesse]
jone   [2,jone,lucy  2,jone,jack]
lucy   [1,tom,lucy  1,jone,lucy  2,lucy,mary  2,lucy,ben]
tom    [2,tom,lucy  2,tom,jack]
Processing key jack, the values are split into two lists:
childs (children of jack, i.e. grandchildren): tom, jone
grands (parents of jack, i.e. grandparents): alice, jesse
The Cartesian product of the two lists is the result for this key: (tom, alice), (tom, jesse), (jone, alice), (jone, jesse). Key lucy similarly yields tom and jone paired with mary and ben; keys such as tom and jone produce nothing, because one of their two lists is empty.
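To confirm that the tag-and-split logic yields exactly this Cartesian product, here is a minimal plain-Java sketch of the reduce step for key jack (no Hadoop required; the class name is illustrative, and the hard-coded array is the grouped value list from the walkthrough above):

import java.util.ArrayList;
import java.util.List;

public class ReduceJoinSketch {
    public static void main(String[] args) {
        // Grouped values for key "jack", as delivered by the shuffle phase.
        String[] values = {"1,tom,jack", "1,jone,jack", "2,jack,alice", "2,jack,jesse"};
        List<String> childs = new ArrayList<String>();   // children of jack = grandchildren
        List<String> grands = new ArrayList<String>();   // parents of jack = grandparents
        for (String v : values) {
            String[] str = v.split(",");
            if (str[0].equals("1")) {
                childs.add(str[1]);        // left-table record: take the child field
            } else if (str[0].equals("2")) {
                grands.add(str[2]);        // right-table record: take the parent field
            }
        }
        // The Cartesian product of the two lists is the join result.
        for (String a : childs) {
            for (String b : grands) {
                System.out.println(a + "\t" + b);
            }
        }
        // Prints: tom alice, tom jesse, jone alice, jone jesse
    }
}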
Code implementation:
package mapreduce;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ST {
    static String INPUT_PATH = "hdfs://master:9000/ti";
    static String OUTPUT_PATH = "hdfs://master:9000/output";

    static class MyMapper extends Mapper<Object, Text, Text, Text> {
        Text output_key = new Text();
        Text output_value = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString().trim();
            // Skip the header line and blank lines.
            if (line.isEmpty() || line.startsWith("child")) {
                return;
            }
            // Split on whitespace to match the sample input; use "," if
            // your file is comma-separated.
            String[] str = line.split("\\s+");
            // Right-table record, tag "2": keyed by the child. For key P,
            // these records carry P's parent, a grandparent candidate.
            output_key.set(str[0]);
            output_value.set("2," + str[0] + "," + str[1]);
            context.write(output_key, output_value);
            // Left-table record, tag "1": keyed by the parent. For key P,
            // these records carry P's child, a grandchild candidate.
            output_key.set(str[1]);
            output_value.set("1," + str[0] + "," + str[1]);
            context.write(output_key, output_value);
        }
    }

    static class MyReduce extends Reducer<Text, Text, Text, Text> {
        Text outputkey = new Text();
        Text outputvalue = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            List<String> childs = new ArrayList<String>();   // grandchildren
            List<String> grands = new ArrayList<String>();   // grandparents
            for (Text line : values) {
                String[] str = line.toString().split(",");
                if (str[0].equals("1")) {
                    // Left-table record: key is the parent, str[1] is its child.
                    childs.add(str[1]);
                } else if (str[0].equals("2")) {
                    // Right-table record: key is the child, str[2] is its parent.
                    grands.add(str[2]);
                }
            }
            // Cartesian product: pair every grandchild with every grandparent.
            for (String a : childs) {
                for (String b : grands) {
                    outputkey.set(a);
                    outputvalue.set(b);
                    context.write(outputkey, outputvalue);
                }
            }
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ST.class);
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
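A few notes on running this (the cluster details are assumptions): INPUT_PATH and OUTPUT_PATH point at a NameNode on master:9000, so adjust both to your environment, and make sure the output directory does not already exist, since FileOutputFormat refuses to overwrite it. After packaging the class into a jar, the job can be submitted with hadoop jar, for example hadoop jar st.jar mapreduce.ST (the jar name here is illustrative). Because both the map output and the final output use Text keys and values, setting setOutputKeyClass/setOutputValueClass alone is sufficient; separate setMapOutputKeyClass/setMapOutputValueClass calls are only needed when the two stages differ.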