假设:HDFS上有2个文件,分别是客户信息和订单信息,customerID是它们之间的关联字段。如何进行关联计算,以便将客户名称添加到订单列表中?
一般方法是:输入2个源文件。根据文件名在Map中处理每条数据:如果来自order.txt,则在外键(customerID)上加标记"O",形成combined key;如果来自customer.txt则加标记"C"。Map之后的数据按照combined key的第一个字段(customerID)分区,再按照完整的combined key分组排序。最后在reduce中合并结果再输出。
实现代码:
01 | public static class JMapper extends Mapper<LongWritable, Text, TextPair, Text> { |
04 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { |
05 | String pathName = ((FileSplit) context.getInputSplit()).getPath().toString(); |
06 | if (pathName.contains( "order.txt" )) { |
07 | String values[] = value.toString().split( "\t" ); |
08 | TextPair tp = new TextPair( new Text(values[ 1 ]), new Text( "O" )); |
09 | context.write(tp, new Text(values[ 0 ] + "\t" + values[ 2 ])); |
11 | if (pathName.contains( "customer.txt" )) { |
12 | String values[] = value.toString().split( "\t" ); |
13 | TextPair tp = new TextPair( new Text(values[ 0 ]), new Text( "C" )); |
14 | context.write(tp, new Text(values[ 1 ])); |
1 | public static class JPartitioner extends Partitioner<TextPair, Text> { |
4 | public int getPartition(TextPair key, Text value, int numParititon) { |
5 | return Math.abs(key.getFirst().hashCode() * 127 ) % numParititon; |
01 | public static class JComparator extends WritableComparator { |
03 | public JComparator() { |
04 | super (TextPair. class , true ); |
06 | @SuppressWarnings ( "unchecked" ) |
07 | public int compare(WritableComparable a, WritableComparable b) { |
08 | TextPair t1 = (TextPair) a; |
09 | TextPair t2 = (TextPair) b; |
10 | return t1.getFirst().compareTo(t2.getFirst()); |
01 | public static class JReduce extends Reducer<TextPair, Text, Text, Text> { |
03 | protected void reduce(TextPair key, Iterable<Text> values, Context context) throws IOException,InterruptedException { |
04 | Text pid = key.getFirst(); |
05 | String desc = values.iterator().next().toString(); |
06 | while (values.iterator().hasNext()) { |
07 | context.write(pid, new Text(values.iterator().next().toString() + "\t" + desc)); |
01 | public class TextPair implements WritableComparable<TextPair> { |
06 | set( new Text(), new Text()); |
08 | public TextPair(String first, String second) { |
09 | set( new Text(first), new Text(second)); |
11 | public TextPair(Text first, Text second) { |
14 | public void set(Text first, Text second) { |
18 | public Text getFirst() { |
21 | public Text getSecond() { |
24 | public void write(DataOutput out) throws IOException { |
28 | public void readFields(DataInput in) throws IOException { |
30 | second.readFields(in); |
32 | public int compareTo(TextPair tp) { |
33 | int cmp = first.compareTo(tp.first); |
37 | return second.compareTo(tp.second); |
01 | public static void main(String agrs[]) throws IOException, InterruptedException, ClassNotFoundException { |
03 | Configuration conf = new Configuration(); |
04 | GenericOptionsParser parser = new GenericOptionsParser(conf, agrs); |
05 | String[] otherArgs = parser.getRemainingArgs(); |
06 | if (agrs.length < 3 ) { |
07 | System.err.println( "Usage: J <in_path_one> <in_path_two> <output>" ); |
10 | Job job = new Job(conf, "J" ); |
11 | job.setJarByClass(J. class ); |
12 | job.setMapperClass(JMapper. class ); |
13 | job.setMapOutputKeyClass(TextPair. class ); |
14 | job.setMapOutputValueClass(Text. class ); |
15 | job.setPartitionerClass(JPartitioner. class ); |
16 | job.setGroupingComparatorClass(JComparator. class ); |
17 | job.setReducerClass(Example_Join_01_Reduce. class ); |
18 | job.setOutputKeyClass(Text. class ); |
19 | job.setOutputValueClass(Text. class ); |
20 | FileInputFormat.addInputPath(job, new Path(otherArgs[ 0 ])); |
21 | FileInputFormat.addInputPath(job, new Path(otherArgs[ 1 ])); |
22 | FileOutputFormat.setOutputPath(job, new Path(otherArgs[ 2 ])); |
23 | System.exit(job.waitForCompletion( true ) ? 0 : 1 ); |
不能直接使用原始数据,而是要搞一堆代码处理标记,并绕过MapReduce原本的架构,最后从底层设计并计算数据之间的关联关系。这还是最简单的关联计算,如果用MapReduce进行多表关联或逻辑更复杂的关联计算,复杂度会呈几何级数递增。