Merging table data with MapReduce (reduce-side join)
Joining the order table (order.txt) with the product table (product.txt)
Base data
1. order.txt
order_id | product_id | product_amount |
---|---|---|
1001 | 01 | 1 |
1002 | 02 | 2 |
1003 | 03 | 3 |
order.txt
1001 01 1
1002 02 2
1003 03 3
2. product.txt
01 小米
02 华为
03 格力
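The goal of the join is to attach the matching product name to each order record. For the sample data above, the merged result should look roughly like this (my illustration of the expected tab-separated output, not from the original post):

1001	01	1	小米
1002	02	2	华为
1003	03	3	格力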
TableBean
package com.cevent.hadoop.mapreduce.table;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;
public class TableBean implements Writable{
private String order_id; // order id
private String product_id; // product id
private int product_amount; // product quantity
private String product_name; // product name
private String order_flag; // marker for which source table the record came from
public String getOrder_id() {
return order_id;
}
public void setOrder_id(String order_id) {
this.order_id = order_id;
}
public String getProduct_id() {
return product_id;
}
public void setProduct_id(String product_id) {
this.product_id = product_id;
}
public int getProduct_amount() {
return product_amount;
}
public void setProduct_amount(int product_amount) {
this.product_amount = product_amount;
}
public String getProduct_name() {
return product_name;
}
public void setProduct_name(String product_name) {
this.product_name = product_name;
}
public String getOrder_flag() {
return order_flag;
}
public void setOrder_flag(String order_flag) {
this.order_flag = order_flag;
}
public TableBean() {
super();
}
public TableBean(String order_id, String product_id, int product_amount,
String product_name, String order_flag) {
super();
this.order_id = order_id;
this.product_id = product_id;
this.product_amount = product_amount;
this.product_name = product_name;
this.order_flag = order_flag;
}
@Override
public String toString() {
return order_id + "\t" + product_id+ "\t" + product_amount+"\t"+product_name;
}
// Serialization
@Override
public void write(DataOutput output) throws IOException {
// write the fields
output.writeUTF(order_id);
output.writeUTF(product_id);
output.writeInt(product_amount);
output.writeUTF(product_name);
output.writeUTF(order_flag);
}
// Deserialization
@Override
public void readFields(DataInput input) throws IOException {
// read the fields in the same order they were written
this.order_id=input.readUTF();
this.product_id=input.readUTF();
this.product_amount=input.readInt();
this.product_name=input.readUTF();
this.order_flag=input.readUTF();
}
}
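Because write() and readFields() must handle the fields in exactly the same order, a quick local round-trip check helps catch mistakes early. The following is a minimal sketch of my own (the class TableBeanRoundTrip is hypothetical and not part of the original post):

package com.cevent.hadoop.mapreduce.table;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        TableBean in = new TableBean("1001", "01", 1, "小米", "0");
        // serialize into an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buffer));
        // deserialize into a fresh bean and print it
        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(out); // expected: 1001	01	1	小米
    }
}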
TableMapper
package com.cevent.hadoop.mapreduce.table;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/**
 * Mapper<LongWritable, Text, Text, TableBean>
 * Input: line offset (LongWritable) and line text (Text); output: product_id (Text) and TableBean
 * @author cevent
 * @date 2020-04-12
 */
public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean>{
TableBean tableBean=new TableBean();
Text keyID=new Text();
@Override
protected void map(LongWritable key, Text value,Context context) throws IOException, InterruptedException {
// 1. Get the input split: FileSplit is the split type for the text input
FileSplit inputSplit=(FileSplit)context.getInputSplit();
// 1.1 Get the name of the file this split comes from
String txtName=inputSplit.getPath().getName();
// 2. Get the input line
String splitLine=value.toString();
// 3. Handle the two files differently
if(txtName.startsWith("order")){ // order record
// 3.1 Split the line
String [] fields=splitLine.split("\t");
// 3.2 Fill the bean from an order record, e.g. 1002 02 2
tableBean.setOrder_id(fields[0]);
tableBean.setProduct_id(fields[1]);
tableBean.setProduct_amount(Integer.parseInt(fields[2]));
// 3.3 Default values for the product-side fields
tableBean.setProduct_name("");
tableBean.setOrder_flag("0"); // flag so the reducer can tell the tables apart
// 3.4 Key the output by product_id
keyID.set(fields[1]);
}else{ // product record
String [] fields=splitLine.split("\t");
tableBean.setOrder_id("");
tableBean.setProduct_id(fields[0]);
tableBean.setProduct_amount(0);
tableBean.setProduct_name(fields[1]);
tableBean.setOrder_flag("1"); // flag so the reducer can tell the tables apart
keyID.set(fields[0]);
}
// 4. Emit the bean, keyed by product_id
context.write(keyID, tableBean);
}
}
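For the sample files, this mapper emits one (product_id, TableBean) pair per input line; roughly the following (my illustration, showing the bean fields in declaration order):

01 -> (1001, 01, 1, "", flag=0)    from order.txt
02 -> (1002, 02, 2, "", flag=0)    from order.txt
03 -> (1003, 03, 3, "", flag=0)    from order.txt
01 -> ("",   01, 0, 小米, flag=1)  from product.txt
02 -> ("",   02, 0, 华为, flag=1)  from product.txt
03 -> ("",   03, 0, 格力, flag=1)  from product.txt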
TableReducer
package com.cevent.hadoop.mapreduce.table;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer<Text, TableBean, TableBean, NullWritable>
 * The reducer's input types equal the mapper's output types (Text key, TableBean value);
 * the output key is the joined TableBean and the output value is NullWritable.
 * @author cevent
 * @date 2020-04-12
 */
public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable>{
@Override
protected void reduce(Text key, Iterable<TableBean> values,Context context) throws IOException, InterruptedException {
// 1. Tell apart which table each record came from: order vs. product
TableBean productBean=new TableBean();
// cache that collects the order beans while iterating
ArrayList<TableBean> orderBeans=new ArrayList<>();
// 1.1 Iterate over the values for this key
for(TableBean bean:values){
// order_flag = "0" marks an order record, "1" marks a product record
if("0".equals(bean.getOrder_flag())){
// order record, e.g. 1002 02 2 or 1003 03 3
TableBean orderBean=new TableBean();
try {
// 2. Copy the order record into a fresh bean (Hadoop reuses the value object)
BeanUtils.copyProperties(orderBean, bean);
// 3. Add the copy to the cached list of orders
orderBeans.add(orderBean);
}
catch (Exception e) {
e.printStackTrace();
}
}else{
// product record, e.g. 02 华为
// reading product_name directly off the reused value object risks mixing up data
//productBean.setProduct_name(productBean.getProduct_name());
try {
// BeanUtils.copyProperties detaches the cached copy from the reused value object
BeanUtils.copyProperties(productBean, bean);
}
catch (Exception e) {
e.printStackTrace();
}
}
}
// Join: attach the product name to every cached order record
for(TableBean bean:orderBeans){
// fill in the product name
bean.setProduct_name(productBean.getProduct_name());
// write the joined record
context.write(bean, NullWritable.get());
}
}
}
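The shuffle groups the mapper output by product_id, so each reduce() call sees the order beans and the product bean for one product together. For key 01 the call would look roughly like this (my illustration):

key    = 01
values = [ (1001, 01, 1, "", flag=0), ("", 01, 0, 小米, flag=1) ]
output = 1001	01	1	小米

Note that this reducer assumes at most one product record per product_id: the single productBean is copied onto every cached order bean. Copying with BeanUtils matters because Hadoop reuses the TableBean instance handed out by the values iterator, so caching the bean itself would leave every list entry pointing at the same, last-read data.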
TableDriver
package com.cevent.hadoop.mapreduce.table;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Assembles the mapper and reducer into a job and submits it
 * @author cevent
 * @date 2020-04-12
 */
public class TableDriver {
public static void main(String[] args) throws Exception
{
// 1. Get the configuration and a job instance
Configuration configuration=new Configuration();
Job job=Job.getInstance(configuration);
// 2. Tell the framework which jar to ship, located via a class inside it
job.setJarByClass(TableBean.class);
// 3. Set the mapper and reducer classes used by the job
job.setMapperClass(TableMapper.class);
job.setReducerClass(TableReducer.class);
// 4. Mapper output key/value types (the mapper's output is the reducer's input)
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(TableBean.class);
// 5. Final output key/value types (the reducer's output)
job.setOutputKeyClass(TableBean.class);
job.setOutputValueClass(NullWritable.class);
// 6. Input and output directories for the job
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// 7. Submit the job configuration and jar, and wait for completion
boolean result=job.waitForCompletion(true);
System.exit(result?0:1);
}
}
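The driver takes the input and output directories from args[0] and args[1]. On a real cluster the same class would typically be submitted with hadoop jar; a sketch of such a call (the jar name and HDFS paths below are hypothetical):

hadoop jar table-join.jar com.cevent.hadoop.mapreduce.table.TableDriver /input/orderProduct /output/orderProduct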
Running the job
Input: D:\xxx\eclipse_code\hadoopTMP\inputOrderProduct
Output: D:\xxx\eclipse_code\hadoopTMP\outputOrderProduct
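Running locally with these two program arguments should produce a part-r-00000 file under the output directory containing the three joined lines shown in the expected result above, ordered by the sorted product_id keys.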
Modify TableBean: drop product_name from toString()
@Override
public String toString() {
return order_id + "\t" + product_id+ "\t" + product_amount+"\t";
}
Modify TableReducer: write the product name into the product_id field
// Join
for(TableBean bean:orderBeans){
// replace the product_id field with the product name
bean.setProduct_id(productBean.getProduct_name());
// write the joined record
context.write(bean, NullWritable.get());
}
This implements the product_id → product_name conversion in the output.
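With these two changes, the same input should now yield output along these lines (each line also ends with a trailing tab because of the modified toString()):

1001	小米	1
1002	华为	2
1003	格力	3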