Implementing joins with MapReduce:
What is the difference between a map-side join and a reduce-side join?
A map-side join loads the small table from a cache file and joins it in memory against the records the mapper reads, so no reducer has to be written (which by itself does not mean there is no shuffle/reduce phase; see the note at the end of this section).
A reduce-side join only merges the files on the map side, tagging every record with the common join field (here, the id) as the key; the records are shipped to the reduce side, where grouping by key produces the join.
Data files:
customers.csv
orders.csv
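For reference, a sketch of the assumed record layout, inferred from the column indices the mapper below relies on (field names and sample values are illustrative assumptions, not taken from the real files):
orders.csv — 4 comma-separated columns: orderId, orderDate, customerId, orderStatus, e.g. 20,2020-09-10,1,closed
customers.csv — 9 comma-separated columns beginning with customerId, firstName, lastName; the remaining six columns are not used by the join.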
Reduce-side join:
1. Write the CustomOrder class:
Implement the Writable interface and override the write and readFields methods; serialization and deserialization must handle the fields in the same order. Also provide getters and setters and override toString.
package cn.kgc.kb09.join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/10 14:48
* @Description:
**/
public class CustomOrder implements Writable {
private String customId;//customer id
private String customName;//customer name
private String orderId;//order id
private String orderStatus;//order status
private String tableFlag;//table marker the mapper uses to tell the files apart: "0" = customer record, "1" = order record
//serialization
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(customId);
out.writeUTF(customName);
out.writeUTF(orderId);
out.writeUTF(orderStatus);
out.writeUTF(tableFlag==null?"":tableFlag);
}
//deserialization (fields must be read back in exactly the order they were written)
@Override
public void readFields(DataInput in) throws IOException {
this.customId=in.readUTF();
this.customName=in.readUTF();
this.orderId=in.readUTF();
this.orderStatus=in.readUTF();
this.tableFlag=in.readUTF();
}
//getters and setters
public String getCustomId() { return customId; }
public void setCustomId(String customId) { this.customId = customId; }
public String getCustomName() { return customName; }
public void setCustomName(String customName) { this.customName = customName; }
public String getOrderId() { return orderId; }
public void setOrderId(String orderId) { this.orderId = orderId; }
public String getOrderStatus() { return orderStatus; }
public void setOrderStatus(String orderStatus) { this.orderStatus = orderStatus; }
public String getTableFlag() { return tableFlag; }
public void setTableFlag(String tableFlag) { this.tableFlag = tableFlag; }
//override toString
@Override
public String toString() {
return "customId='" + customId + '\'' +
", customName='" + customName + '\'' +
", orderId='" + orderId + '\'' +
", orderStatus='" + orderStatus + '\'';
}
}
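To see why the Writable contract insists that readFields consume the fields in exactly the order write produced them, here is a minimal standalone round-trip sketch (not part of the job; the class name and sample values are made up for illustration):
package cn.kgc.kb09.join;
import java.io.*;
public class CustomOrderRoundTrip {
public static void main(String[] args) throws IOException {
CustomOrder out = new CustomOrder();
out.setCustomId("1");
out.setCustomName("xxx");//hypothetical sample values
out.setOrderId("20");
out.setOrderStatus("closed");
out.setTableFlag("1");
//serialize into a byte buffer, just as Hadoop does during the shuffle
ByteArrayOutputStream buf = new ByteArrayOutputStream();
out.write(new DataOutputStream(buf));
//deserialize into a fresh object; readFields reads the fields back in write order
CustomOrder in = new CustomOrder();
in.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
System.out.println(in);//prints the same field values that were written
}
}
If the order of the writeUTF and readUTF calls ever diverged, the fields would silently land in the wrong properties, or a shorter record would throw an EOFException.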
2. Write the COMapperJoin class:
1. Split each input line by the delimiter into a columns array
2. Since two tables feed the same mapper, use the array length to identify which table a record belongs to
3. Create a new CustomOrder object and assign the fields this record can supply
4. Emit the result from the map side as a key/value pair
package cn.kgc.kb09.join;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/10 14:41
* @Description:
**/
public class COMapperJoin extends Mapper<LongWritable, Text,Text, CustomOrder> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split the line on "," into the columns array
String line=value.toString();
String[] columns = line.split(",");
CustomOrder co = new CustomOrder();
//length 4 means an order record, length 9 a customer record
if(columns.length==4){
//assign the fields an order record supplies
co.setCustomId(columns[2]);
co.setCustomName("");
co.setOrderId(columns[0]);
co.setOrderStatus(columns[3]);
co.setTableFlag("1");
}else if(columns.length==9){
//assign the fields a customer record supplies
co.setCustomId(columns[0]);
co.setCustomName(columns[1]+"·"+columns[2]);
co.setOrderId("");
co.setOrderStatus("");
co.setTableFlag("0");
}else{
//skip lines that match neither table's column count, otherwise co.getCustomId() below would be null
return;
}
//emit from the map side, e.g. {1,{CustomOrder(1,xxx,,,0),CustomOrder(1,,20,closed,1)}}
context.write(new Text(co.getCustomId()),co);
}
}
3.1 Write the COReducerJoin class
The keys coming out of the map phase are grouped by customId. A customer record carries only the customId and the name; an order record carries only the customId, the order number (orderId) and the status (orderStatus). The reducer assigns the customer fields to one output object and string-appends every orderId and orderStatus in the group, so each group of records is merged into a single record.
package cn.kgc.kb09.join;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/10 15:16
* @Description:
**/
public class COReducerJoin extends Reducer<Text,CustomOrder,CustomOrder, NullWritable> {
@Override
protected void reduce(Text key, Iterable<CustomOrder> values, Context context) throws IOException, InterruptedException {
StringBuffer orderIds=new StringBuffer();
StringBuffer statuses=new StringBuffer();
//each key group is merged into one CustomOrder object
CustomOrder customOrder=new CustomOrder();
//iterate over the group and assign the object's fields
for(CustomOrder co : values){
//an empty customName marks an order record (the mapper leaves the name blank for the order table)
if("".equals(co.getCustomName())){
orderIds.append(co.getOrderId()+"|");
statuses.append(co.getOrderStatus()+"|");
}else{
customOrder.setCustomId(co.getCustomId());
customOrder.setCustomName(co.getCustomName());
}
}
//assign the appended StringBuffer contents to customOrder's orderId and orderStatus fields
customOrder.setOrderId(orderIds.toString());
customOrder.setOrderStatus(statuses.toString());
//emit the merged record
context.write(customOrder,NullWritable.get());
}
}
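As a concrete illustration (using the sample values from the mapper comment above): if the group for key 1 contains the customer record CustomOrder(1,xxx,,,0) and two order records CustomOrder(1,,20,closed,1) and CustomOrder(1,,21,pending,1), the loop appends the two order ids and statuses, the customer record supplies the id and name, and the reducer writes a single line:
customId='1', customName='xxx', orderId='20|21|', orderStatus='closed|pending|'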
3.2 Write the COReduceJoinTest class
An alternative reducer: instead of concatenating all orders into one record, it copies the customer's name onto every order record and emits one joined line per order.
package cn.kgc.kb09.join;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/11 8:13
* @Description:
**/
public class COReduceJoinTest extends Reducer<Text,CustomOrder,CustomOrder, NullWritable> {
@Override
protected void reduce(Text key, Iterable<CustomOrder> values, Context context) throws IOException, InterruptedException {
//1. A list to collect the order records (one customer id can map to several orders)
ArrayList<CustomOrder> orderBeans = new ArrayList<>();
//A single bean for the customer (one id corresponds to exactly one customer)
CustomOrder cusBean = new CustomOrder();
//2. Iterate over the map output; Hadoop reuses the same value object between iterations,
//so every record must be deep-copied (here via BeanUtils) before it is kept
for (CustomOrder bean : values) {
if(bean.getTableFlag().equals("1")){//order record
CustomOrder orderBean = new CustomOrder();
try {
BeanUtils.copyProperties(orderBean,bean);
} catch (IllegalAccessException | InvocationTargetException e) {
e.printStackTrace();
}
orderBeans.add(orderBean);
}else {//customer record
try {
BeanUtils.copyProperties(cusBean,bean);
} catch (IllegalAccessException | InvocationTargetException e) {
e.printStackTrace();
}
}
}
//3. Iterate over the order list and fill in the blank name field
for (CustomOrder bean : orderBeans) {
//copy the customer's name from the customer bean into each order bean
bean.setCustomName(cusBean.getCustomName());
//4. Emit one joined record per order
context.write(bean,NullWritable.get());
}
}
}
4. Write the Driver class
Note: the directory set as the output path must not already exist, or the job will fail! (A sketch for deleting it programmatically follows the driver code below.)
package cn.kgc.kb09.join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/10 15:29
* @Description:
**/
public class CODriver {
public static void main(String[] args) throws Exception{
//1. Create the configuration and the Job
Configuration cfg=new Configuration();
Job job=Job.getInstance(cfg,"co_job");
//2. Set the jar location
job.setJarByClass(CODriver.class);
//3. Set the mapper and reducer classes
job.setMapperClass(COMapperJoin.class);
//job.setReducerClass(COReducerJoin.class);
job.setReducerClass(COReduceJoinTest.class);
//4. Set the map output key/value types
job.setMapOutputValueClass(CustomOrder.class);
job.setMapOutputKeyClass(Text.class);
//5. Set the reduce output key/value types
job.setOutputKeyClass(CustomOrder.class);
job.setOutputValueClass(NullWritable.class);
//6. Set the input and output paths
FileInputFormat.setInputPaths(job,new Path("file:///E:\\softs\\idea_datas\\ideaProjects\\testhdfs\\data\\custom\\output"));
FileOutputFormat.setOutputPath(job,new Path("file:///F:/test/coResult"));
//7. Submit the job and wait for completion
boolean result = job.waitForCompletion(true);
System.out.println(result?"Job succeeded":"Job failed");
System.exit(result?0:1);
}
}
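As noted above, the job fails when the output directory already exists. A minimal sketch of removing it up front, a common convenience during development (assumes the same cfg and output path as in main above):
import org.apache.hadoop.fs.FileSystem;
//... inside main(), before FileOutputFormat.setOutputPath(...):
Path outDir = new Path("file:///F:/test/coResult");
FileSystem fs = FileSystem.get(outDir.toUri(), cfg);
if (fs.exists(outDir)) {
fs.delete(outDir, true);//true = delete the directory recursively
}
FileOutputFormat.setOutputPath(job, outDir);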
Map-side join:
1. Write the Mapper class:
package cn.kgc.kb09.map.join;
import cn.kgc.kb09.join.CustomOrder;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/11 9:06
* @Description: join performed on the mapper side
**/
public class COJoinMapper extends Mapper<LongWritable, Text,Text, CustomOrder> {
//a HashMap holding customer id -> customer name
Map<String,String> map=new HashMap<>();
//setup processes the customer table (the cached side)
@Override
protected void setup(Context context) throws IOException{
//get the cache file URIs; here there is only one file
URI[] cacheFile = context.getCacheFiles();
if(cacheFile!=null && cacheFile.length>0){
//get the file path and name
String filePath=cacheFile[0].getPath();
FileReader fr=new FileReader(filePath);
BufferedReader br = new BufferedReader(fr);
String line;
//read the file, using the first column as the map key and the second as the value
while((line=br.readLine())!=null && !"".equals(line)){
String[] columns = line.split(",");
map.put(columns[0],columns[1]);
}
br.close();//release the reader once the cache file is loaded
}
}
//map processes the order table
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//take one line and split it into fields
String line = value.toString();
String[] columns = line.split(",");
//prepare the joined customer-order object
CustomOrder co = new CustomOrder();
String orderId = columns[0];
String orderStatus = columns[3];
String custId = columns[2];
//assign the values
co.setCustomId(custId);
//look the customer name up in the HashMap
String custName = map.get(custId);
co.setCustomName(custName);
co.setOrderId(orderId);
co.setOrderStatus(orderStatus);
// //remove each matched id so that, at the end, the map holds only customers without orders
// map.remove(custId);
//emit one object per input record (the map method runs once per line)
context.write(new Text(custId),co);
}
// @Override
// protected void cleanup(Context context) throws IOException, InterruptedException {
// //emit the customers left over in the map, i.e. those with no matching order
// Set<String> keys = map.keySet();
// for (String key : keys) {
// CustomOrder co = new CustomOrder();
// co.setCustomId(key);
// co.setCustomName(map.get(key));
// context.write(new Text(key),co);
// }
// }
// (uncommenting this also requires importing java.util.Set and the map.remove(custId) line above)
}
2. Write the Driver class
When the map output types and the reduce output types are the same, it is enough to set them once with setOutputKeyClass and setOutputValueClass:
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CustomOrder.class);
When the map and reduce output types differ, the map output types must be set with setMapOutputKeyClass and setMapOutputValueClass, and the reduce output types with setOutputKeyClass and setOutputValueClass. In that case the reducer also cannot be reused as a combiner via setCombinerClass, because a combiner's output types have to match the map output types.
//4. Set the map output key/value types
job.setMapOutputValueClass(CustomOrder.class);
job.setMapOutputKeyClass(Text.class);
//5. Set the reduce output key/value types
job.setOutputKeyClass(CustomOrder.class);
job.setOutputValueClass(NullWritable.class);
Note again: the output directory must not already exist! (See the deletion sketch after the reduce-side driver above.)
package cn.kgc.kb09.map.join;
import cn.kgc.kb09.join.CustomOrder;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;
/**
* @Author: ChaoKeAiMuZhi
* @Date: 2020/9/11 9:48
* @Description: driver for the map-side join
**/
public class COJoinDriver {
public static void main(String[] args) throws Exception{
//1. Create the configuration and the Job
Job job=Job.getInstance(new Configuration(),"mapJoinJob");
//2. Set the jar location
job.setJarByClass(COJoinDriver.class);
//3. Set the mapper class (no reducer is needed here)
job.setMapperClass(COJoinMapper.class);
//4. Set the map output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CustomOrder.class);
//5. Reduce output key/value types (not needed here)
//6. Set the input/output paths and register the cache file
//note: URI cannot parse backslashes (\\); use forward slashes or it will fail to resolve the path
String inPath="file:///E:/softs/idea_datas/ideaProjects/testhdfs/data/custom/output/orders.csv";
String outPath="file:///F:/test/mapJoinRst";
String cachePath="file:///E:/softs/idea_datas/ideaProjects/testhdfs/data/custom/output/customers.csv";
//register the customer file; the mapper reads it back via context.getCacheFiles() in setup()
job.addCacheFile(new URI(cachePath));
FileInputFormat.setInputPaths(job,new Path(inPath));
FileOutputFormat.setOutputPath(job,new Path(outPath));
//7. Submit the job and wait for completion
boolean result = job.waitForCompletion(true);
System.out.println(result?"Job succeeded":"Job failed");
System.exit(result?0:1);
}
}
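A final note, tying back to the parenthetical at the top: this driver never calls job.setNumReduceTasks(0), so Hadoop still runs a default pass-through reduce phase, shuffle included. To turn the map-side join into a genuinely map-only job that writes the mapper output directly, one line can be added before submitting:
job.setNumReduceTasks(0);//map-only job: no shuffle, no reduce phase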