------------------------------------Reduce-side join-------------------------------

package demo06.reducejoin;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMap extends Mapper<LongWritable, Text, Text, Text> {

    Text k2 = new Text();

    /**
     * Tag each record: the product id becomes k2 and the whole line becomes v2.
     * The file name of the split feeding k1/v1 tells us which table the record belongs to:
     * a name starting with "p" (the product file) is a product record, anything else is an order record.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Get the name of the file this split comes from
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        Path path = inputSplit.getPath();
        String pathName = path.getName();

        String line = value.toString();
        String[] splits = line.split(",");

        if (pathName.startsWith("p")) {
            // Product record: the product id is the first field
            k2.set(splits[0]);
        } else {
            // Order record: the product id is the third field
            k2.set(splits[2]);
        }
        context.write(k2, value);
    }
}
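For reference, the field positions used above imply input along these lines (the file names and values here are assumptions for illustration, not part of the original): a product line such as "p0001,xiaomi,1000" in pdts.txt, where the product id is the first field, and an order line such as "1001,20150710,p0001,2" in orders.txt, where the product id is the third field. Using the product id as the map output key sends the product record and all its order records to the same reduce call.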
package demo06.reducejoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class ReduceJoinReduce extends Reducer<Text, Text, Text, NullWritable> {

    /**
     * Hadoop groups all values sharing a key into one reduce call, so the product record and the
     * order records for the same product id arrive together in the values iterable. Lines starting
     * with "p" are product records and go to productLine; everything else is an order record and
     * goes to orderLine. Concatenating the two completes a simple join.
     * (Note: because orderLine is overwritten on each iteration, only the last order per product
     * is kept; to join every order, collect the order lines into a list instead.)
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        String orderLine = "";
        String productLine = "";
        for (Text text : values) {
            if (text.toString().startsWith("p")) {
                productLine = text.toString();
            } else {
                orderLine = text.toString();
            }
        }
        context.write(new Text(orderLine + "\t" + productLine), NullWritable.get());
    }
}
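The notes above omit the driver. A minimal sketch of a driver that wires this Mapper and Reducer together (the class name ReduceJoinMain, the input/output paths, and the node01:8020 address are assumptions for illustration, mirroring the cache-file URI used in the map-join section below):

package demo06.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class ReduceJoinMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "reduceJoin");
        job.setJarByClass(ReduceJoinMain.class);

        // Both the product file and the order file sit under the same input directory (assumed path)
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/reducejoin/input"));

        job.setMapperClass(ReduceJoinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(ReduceJoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/reducejoin/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}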
-----------------------------------------------------Map-side join-------------------------------------------
Add the cache file in the driver (main) program:
// Register our cache file with the distributed cache
DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), configuration);
package demo07.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    // In-memory copy of the product table, keyed by product id, filled in setup()
    HashMap<String, String> map;

    /**
     * Override setup() to load the cached product file into a HashMap:
     * open the cache file through the HDFS FileSystem API and read it line by line,
     * then let map() join each order record against this in-memory product table.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        map = new HashMap<String, String>();

        // Get the Configuration from the context
        Configuration configuration = context.getConfiguration();

        // We registered exactly one cache file in the driver
        URI[] caches = DistributedCache.getCacheFiles(configuration);
        URI cacheFile = caches[0];   // e.g. hdfs://node01:8020/cachefile/pdts.txt

        // Open the cached file through the file system and wrap the stream in a reader
        FileSystem fileSystem = FileSystem.get(cacheFile, configuration);
        FSDataInputStream fsDataInputStream = fileSystem.open(new Path(cacheFile));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(fsDataInputStream));

        // Read every product line; the product id (first field) is the lookup key
        String line = null;
        while ((line = bufferedReader.readLine()) != null) {
            String[] lineArray = line.split(",");
            map.put(lineArray[0], line);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] splits = value.toString().split(",");

        // Look up the product record for this order's product id (third field)
        String product = map.get(splits[2]);

        // Concatenate the order line with its product line and write it out
        context.write(new Text(value.toString() + "\t" + product), NullWritable.get());
    }
}
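For completeness, a minimal sketch of the map-join driver under the same assumptions (the class name MapJoinMain and the input/output paths are illustrative; only the cache-file URI comes from the snippet above). Since the join happens entirely in the mapper, the job needs no reducer:

package demo07.mapjoin;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MapJoinMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();

        // Register the small product table as a cache file before creating the Job
        DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), configuration);

        Job job = Job.getInstance(configuration, "mapJoin");
        job.setJarByClass(MapJoinMain.class);

        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/mapjoin/input"));

        job.setMapperClass(MapJoinMap.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Map-side join needs no shuffle or reduce phase
        job.setNumReduceTasks(0);

        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("hdfs://node01:8020/mapjoin/output"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A map-side join skips the shuffle entirely, but it only works when the cached table (here the product file) is small enough to fit in each map task's memory; otherwise fall back to the reduce-side join above.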