(一) Join operations in MapReduce
The goal of a MapReduce join is to combine the contents of two files into one file, matching records according to a join condition.
There are two ways to perform a join in MapReduce:
1. Map-side join
No data needs to be sent to the reduce side.
Suitable when one of the two tables is small.
Steps: i. Read the smaller file into memory (a Map collection)
ii. Read the larger file, extract the join field, and use it to look up the corresponding value in the Map collection
iii. Perform the join
2. Reduce-side join
Take advantage of the fact that records with the same key are grouped together: use the join condition as the key.
Build an entity class that contains the fields of both tables, then use an isXXX flag to tell which table a record came from (a sketch of such a class is given after the reduce-side code below).
On the reduce side, read the values, check the flag, and merge.
Map-side join
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
/*
Map-side join: the smaller phoneinfo table is loaded into memory in setup(),
then each userinfo record is joined against it in map().
*/
public class ForMapJoin {
    public static class ForMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        private Map<String, String> cacheMap = new HashMap<String, String>();
        private Text oKey = new Text();
        private NullWritable oValue = NullWritable.get();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // Get the URI of the cached file
            URI uri = context.getCacheFiles()[0];
            // Open the cached (smaller) file and load it into the in-memory map
            File file = new File(uri);
            FileReader fr = new FileReader(file);
            BufferedReader br = new BufferedReader(fr);
            String temp;
            while ((temp = br.readLine()) != null) {
                String[] strs = temp.split("\t");
                cacheMap.put(strs[0], strs[1]);
            }
            br.close();
            // Debug output: dump the cached table
            for (Map.Entry<String, String> entry : cacheMap.entrySet()) {
                System.err.println(entry.getKey() + "\t" + entry.getValue());
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] strs = value.toString().split("\t");
            // Log join keys that have no match in the cached table
            if (cacheMap.get(strs[2]) == null) {
                System.out.println(strs[2]);
            }
            // Join: append the looked-up value to the current record
            String phoneInfo = value.toString() + "\t" + cacheMap.get(strs[2]);
            oKey.set(phoneInfo);
            context.write(oKey, oValue);
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Delete the output directory if it already exists
        FileSystem fileSystem = FileSystem.get(new URI("file:E://output"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        // Register the file to be loaded into memory (the distributed cache)
        job.addCacheFile(new URI("file:///E:/forTestData/jionData/phoneinfo1.txt"));
        // Or: job.setCacheFiles(URI[] files);
        FileInputFormat.addInputPath(job, new Path("E:\\forTestData\\jionData\\map\\userinfo.txt"));
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }
}
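Because the map-side join emits the joined record straight from the mapper, the job above does not actually need a reduce phase; calling job.setNumReduceTasks(0) in the driver (not present in the original code) would write the map output directly and skip the shuffle.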
Reduce-side join
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
/*
Joins the product table and the order table on product id and writes the result.
*/
public class ForReducerJoin {
    public static class ForMapper extends Mapper<LongWritable, Text, Text, ProductAndOrder> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            Text oKey = new Text();
            ProductAndOrder oValue = new ProductAndOrder();
            String line = value.toString();
            String[] strs = line.split("\t");
            if (strs.length < 4) return;
            // Get the current input split
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            // Get the name of the file currently being read
            String fileName = fileSplit.getPath().getName();
            if ("product.txt".equals(fileName)) {
                oValue.setProductId(strs[0]);
                oValue.setProductName(strs[1]);
                oValue.setProductPrice(Integer.parseInt(strs[3]));
                oValue.setProductType(strs[2]);
                oValue.setPruduct(true);
                oKey.set(strs[0]);
            } else if ("order.txt".equals(fileName)) {
                oValue.setOrderDate(strs[1]);
                oValue.setOrderId(strs[0]);
                oValue.setOrderNum(Integer.parseInt(strs[2]));
                oValue.setPruduct(false);
                oKey.set(strs[3]);
            }
            // The key is the product id, so matching product and order records meet in the same reduce call
            context.write(oKey, oValue);
        }
    }

    public static class ForReducer extends Reducer<Text, ProductAndOrder, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<ProductAndOrder> values, Context context) throws IOException, InterruptedException {
            List<ProductAndOrder> orders = new ArrayList<ProductAndOrder>();
            ProductAndOrder product = new ProductAndOrder();
            for (ProductAndOrder entity : values) {
                if (entity.isPruduct()) {
                    // Copy the product fields; Hadoop reuses the value object while iterating,
                    // so we must not keep a reference to it
                    product.setProductId(entity.getProductId());
                    product.setProductName(entity.getProductName());
                    product.setProductType(entity.getProductType());
                    product.setProductPrice(entity.getProductPrice());
                } else {
                    ProductAndOrder order = new ProductAndOrder();
                    order.setOrderId(entity.getOrderId());
                    order.setOrderDate(entity.getOrderDate());
                    order.setOrderNum(entity.getOrderNum());
                    orders.add(order);
                }
            }
            // Attach the product fields to every order with this product id and emit the joined record
            for (ProductAndOrder order : orders) {
                order.setProductId(product.getProductId());
                order.setProductName(product.getProductName());
                order.setProductType(product.getProductType());
                order.setProductPrice(product.getProductPrice());
                String info = order.toString();
                context.write(new Text(info), NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance();
        job.setMapperClass(ForMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ProductAndOrder.class);
        job.setReducerClass(ForReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Delete the output directory if it already exists
        FileSystem fileSystem = FileSystem.get(new URI("file:E://output"), new Configuration());
        Path path = new Path("E://output");
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
        FileInputFormat.addInputPath(job, new Path("E:\\forTestData\\jionData\\reduce"));
        FileOutputFormat.setOutputPath(job, path);
        job.waitForCompletion(true);
    }
}
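The ProductAndOrder entity class used above is not included in the original code. The sketch below is a minimal reconstruction: the field names and accessors are taken from the calls made in ForReducerJoin, while the Writable serialization order and the toString() layout are assumptions.

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class ProductAndOrder implements Writable {
    // product.txt fields
    private String productId = "";
    private String productName = "";
    private String productType = "";
    private int productPrice;
    // order.txt fields
    private String orderId = "";
    private String orderDate = "";
    private int orderNum;
    // true if the record came from product.txt, false if from order.txt
    private boolean pruduct;
    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization order is an assumption; it only has to match readFields()
        out.writeUTF(productId);
        out.writeUTF(productName);
        out.writeUTF(productType);
        out.writeInt(productPrice);
        out.writeUTF(orderId);
        out.writeUTF(orderDate);
        out.writeInt(orderNum);
        out.writeBoolean(pruduct);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        productId = in.readUTF();
        productName = in.readUTF();
        productType = in.readUTF();
        productPrice = in.readInt();
        orderId = in.readUTF();
        orderDate = in.readUTF();
        orderNum = in.readInt();
        pruduct = in.readBoolean();
    }
    public boolean isPruduct() { return pruduct; }
    public void setPruduct(boolean pruduct) { this.pruduct = pruduct; }
    public String getProductId() { return productId; }
    public void setProductId(String productId) { this.productId = productId; }
    public String getProductName() { return productName; }
    public void setProductName(String productName) { this.productName = productName; }
    public String getProductType() { return productType; }
    public void setProductType(String productType) { this.productType = productType; }
    public int getProductPrice() { return productPrice; }
    public void setProductPrice(int productPrice) { this.productPrice = productPrice; }
    public String getOrderId() { return orderId; }
    public void setOrderId(String orderId) { this.orderId = orderId; }
    public String getOrderDate() { return orderDate; }
    public void setOrderDate(String orderDate) { this.orderDate = orderDate; }
    public int getOrderNum() { return orderNum; }
    public void setOrderNum(int orderNum) { this.orderNum = orderNum; }
    @Override
    public String toString() {
        // Assumed output layout: order fields followed by the joined product fields
        return orderId + "\t" + orderDate + "\t" + orderNum + "\t"
                + productId + "\t" + productName + "\t" + productType + "\t" + productPrice;
    }
}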
(二) Custom grouping rules for keys
Approach: write a class that extends WritableComparator and override its compare method.
It must also define a no-argument constructor that calls super(KeyType.class, true); otherwise a NullPointerException may be thrown.
Then register the custom grouping rule with job.setGroupingComparatorClass(), as sketched below.
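A minimal sketch, assuming a hypothetical composite key class MyKey (a WritableComparable with a getGroupField() accessor; both names are illustrative, not from the original):

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class MyGroupingComparator extends WritableComparator {
    // No-argument constructor: pass the key class and true so key instances
    // are created; otherwise compare() may see nulls and throw a NullPointerException.
    public MyGroupingComparator() {
        super(MyKey.class, true);
    }
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKey k1 = (MyKey) a;
        MyKey k2 = (MyKey) b;
        // Records whose group field compares equal are sent to the same reduce() call
        return k1.getGroupField().compareTo(k2.getGroupField());
    }
}

It is then registered in the driver with job.setGroupingComparatorClass(MyGroupingComparator.class).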