Reduce-Side Join
Input data:
// orders table
1001,20150710,p0001,2
1002,20150710,p0002,3
// product table
p0001,小米5,1000,2000
p0002,锤子T1,1000,3000
Joining the two tables on the product ID should produce:
key value
p0001 p0001,小米5,1000,2000 1001,20150710,p0001,2
p0002 p0002,锤子T1,1000,3000 1002,20150710,p0002,3
-
mapper class
In map(), each record is transformed according to which input file it came from, so that the product ID becomes the join key for both tables.

```java
package com.bigdata.mapreduce.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class j_mapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Determine which file this record comes from
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String name = fileSplit.getPath().getName();
        // 2. Emit the product ID as the key: field 2 in orders, field 0 in product
        if (name.equals("orders.txt")) {
            Text text = new Text(value.toString().split(",")[2]);
            context.write(text, value);
        } else {
            Text text = new Text(value.toString().split(",")[0]);
            context.write(text, value);
        }
    }
}
```
-
reducer class
Concatenates all records that share the same product ID.

```java
package com.bigdata.mapreduce.reducejoin;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class j_reducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Join the product record and the order records with tabs
        StringBuilder joined = new StringBuilder();
        for (Text value : values) {
            joined.append(value).append("\t");
        }
        context.write(key, new Text(joined.toString()));
    }
}
```
-
jobmain
```java
package com.bigdata.mapreduce.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class join_jobmain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        // Create the job
        Job job = Job.getInstance(super.getConf(), "join");

        // Input format and path
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\mapreduce_demo\\double"));

        // Mapper and the K2/V2 types
        job.setMapperClass(j_mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Partitioning, sorting, combining, grouping: defaults

        // Reducer and the K3/V3 types
        job.setReducerClass(j_reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Output format and path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///D:\\mapreduce_demo\\joinresult"));

        // Wait for the job to finish
        boolean flag = job.waitForCompletion(true);
        return flag ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int exitCode = ToolRunner.run(conf, new join_jobmain(), args);
        System.exit(exitCode);
    }
}
```
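With the sample data above, the output should look roughly like this (the order of the two records inside each value depends on the order the reducer receives them, and each value ends with a trailing tab because of how it is concatenated):
p0001	p0001,小米5,1000,2000	1001,20150710,p0001,2
p0002	p0002,锤子T1,1000,3000	1002,20150710,p0002,3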
Map-Side Join
-
The join happens entirely in the mapper, by overriding both setup() and map(): setup() loads the small table from the distributed cache into memory, and map() streams the big table and probes that in-memory map.

```java
package com.bigdata.mapreduce.mapjoin;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class mj_mapper extends Mapper<LongWritable, Text, Text, Text> {
    private HashMap<String, String> map = new HashMap<>();

    // 1. Read the small (product) table from the distributed cache into an in-memory map
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the list of distributed cache files
        URI[] cacheFiles = context.getCacheFiles();
        // Get the file system that holds the cached file
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        // Open an input stream on the file
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        // Wrap the byte stream in a buffered character reader
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
        // Read each line and index it by product ID (field 0)
        String line;
        while ((line = bufferedReader.readLine()) != null) {
            String[] split = line.split(",");
            map.put(split[0], line);
        }
        // Close the streams (kept as in the original; note that FileSystem
        // instances may be cached and shared, so closing one can affect later use)
        bufferedReader.close();
        fileSystem.close();
    }

    // 2. Stream the big (orders) table and join each record against the in-memory map
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Product ID is field 2 of each order record
        String[] split = value.toString().split(",");
        String productId = split[2];
        // Look up the matching product record and concatenate the two
        String lineData = map.get(productId);
        String data = lineData + "\t" + value.toString();
        context.write(new Text(productId), new Text(data));
    }
}
```
-
jobmain
```java
package com.bigdata.mapreduce.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class mj_jobmain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        // Create the job
        Job job = Job.getInstance(super.getConf(), "mapjoin");

        // Put the small table into the distributed cache
        // (the HDFS URI is left incomplete in the original)
        job.addCacheFile(new URI("hdfs://"));

        // Input format and path (the big table only)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\mapreduce_demo\\double\\orders.txt"));

        // Mapper and the K2/V2 types
        job.setMapperClass(mj_mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Partitioning, sorting, combining, grouping: defaults
        // No reducer: the join happens entirely on the map side

        // Output format and path
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///D:\\mapreduce_demo\\mapjoin"));

        // Wait for the job to finish
        boolean flag = job.waitForCompletion(true);
        return flag ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int exitCode = ToolRunner.run(conf, new mj_jobmain(), args);
        System.exit(exitCode);
    }
}
```
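The cache URI above is left incomplete in the original. As a minimal sketch, assuming the product table had already been uploaded to HDFS, the call might look like the line below; the NameNode address and file path are placeholders, not taken from the original:

```java
// Hypothetical values: the NameNode address and file path are assumptions
job.addCacheFile(new URI("hdfs://node01:8020/cache/product.txt"));
```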
Case study
Requirement: find the common friends of every pair of people.
Input data (each line is person:friend,friend,...):
A:B,C,D,F,E,O
B:A,C,E,K
C:F,A,D,I
D:A,E,F,L
E:B,C,D,M,L
F:A,B,C,D,E,O,M
G:A,C,D,E,F
H:A,C,D,E,O
I:A,O
J:B,O
K:A,C,D
L:D,E,F
M:E,F,G
O:A,H,I,J
Approach: chain two MapReduce jobs.
Job 1 mapper: the part left of the colon (the person) becomes V2, and each entry right of the colon (a friend) becomes a K2; a line like A:B,C,D,F,E,O turns into (B, A), (C, A), (D, A), (F, A), (E, A), (O, A).
Job 1 reducer: all values that share a K2 (everyone who lists that friend) are collected and joined with "-"; the joined string becomes K3 and the friend (the K2) becomes V3.
Job 2 mapper: reads job 1's output, splits each line on \t, sorts the people so every pair comes out in a canonical order, and joins them two by two to form K2; job 1's V3 (the friend) becomes V2.
Job 2 reducer: K2 passes through as K3, and the values (all friends the pair shares) are joined with "-" to form V3. A worked trace follows.
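To make the flow concrete, here is a trace for friend B, deduced from the code below (within each joined list, the order of names depends on the order in which values reach the reducer):
Job 1 map emits (B, A), (B, E), (B, F), (B, J), because A, E, F, and J each list B as a friend.
Job 1 reduce joins the people and swaps key and value: A-E-F-J	B
Job 2 map sorts the people and pairs them: A-E	B, A-F	B, A-J	B, E-F	B, E-J	B, F-J	B
Job 2 reduce, for key A-E, also receives C and D (contributed by the groups for friends C and D) and emits: A-E	B-C-D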
Code:
-
mapper1
```java
package com.bigdata.mapreduce.firends;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class setpMapper1 extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split on the colon: index 0 (the person) is V2
        String[] split = value.toString().split(":");
        // Split index 1 on commas: each friend becomes a K2
        for (String friend : split[1].split(",")) {
            context.write(new Text(friend), new Text(split[0]));
        }
    }
}
```
-
reducer1
```java
package com.bigdata.mapreduce.firends;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class setpReducer1 extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Join everyone who shares this friend with "-"
        StringBuilder builder = new StringBuilder();
        for (Text value : values) {
            builder.append(value).append("-");
        }
        // Drop the trailing "-" and emit (joined people, friend)
        String k3 = builder.substring(0, builder.length() - 1);
        context.write(new Text(k3), key);
    }
}
```
-
jobmain1
```java
package com.bigdata.mapreduce.firends;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class septjobmain1 extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "sept1");

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\mapreduce_demo\\input\\firends.txt"));

        job.setMapperClass(setpMapper1.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(setpReducer1.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // This output directory is read as the input of job 2
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///D:\\mapreduce_demo\\firends"));

        boolean flag = job.waitForCompletion(true);
        return flag ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int exitCode = ToolRunner.run(conf, new septjobmain1(), args);
        System.exit(exitCode);
    }
}
```
-
mapper2
```java
package com.bigdata.mapreduce.firends;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Arrays;

public class septmapper2 extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line on the tab, then sort the people so each pair
        // is emitted in a canonical order (A-E and E-A become the same key)
        String[] split1 = value.toString().split("\t");
        String[] split = split1[0].split("-");
        Arrays.sort(split);
        // Emit every pair of people with the shared friend as the value
        for (int i = 0; i < split.length; i++) {
            for (int j = i + 1; j < split.length; j++) {
                String k2 = split[i] + "-" + split[j];
                context.write(new Text(k2), new Text(split1[1]));
            }
        }
    }
}
```
-
reducer2
```java
package com.bigdata.mapreduce.firends;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class septreducer2 extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Join all friends shared by this pair with "-"
        StringBuilder builder = new StringBuilder();
        for (Text value : values) {
            builder.append(value).append("-");
        }
        // Drop the trailing "-" and emit (pair, common friends)
        String v3 = builder.substring(0, builder.length() - 1);
        context.write(key, new Text(v3));
    }
}
```
-
jobmain2
```java
package com.bigdata.mapreduce.firends;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class septjobmain2 extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(super.getConf(), "sept2");

        // The input is the output file produced by job 1
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///D:\\mapreduce_demo\\firends\\part-r-00000"));

        job.setMapperClass(septmapper2.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(septreducer2.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///D:\\mapreduce_demo\\firends2"));

        boolean flag = job.waitForCompletion(true);
        return flag ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        int exitCode = ToolRunner.run(conf, new septjobmain2(), args);
        System.exit(exitCode);
    }
}
```
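For the sample friends data, the final output should contain lines like the following, worked out from the input above (the order of friends within each list, and of the lines themselves, may vary from run to run):
A-B	C-E
A-E	B-C-D
A-F	B-C-D-E-O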