There are two input files.
pd.txt:
01 mac
02 huawei
03 xiaomi
The first column is the product id and the second column is the product name.
order.txt:
201801 01 1
201802 02 2
201803 03 3
201804 01 4
201805 02 5
201806 03 6
The first column is the order id (a timestamp-like value here), the second column is the product id, and the third column is the quantity.
Both files are placed in the same input directory. The required output after the join is:
order id  product name  quantity
That is, the map phase has to read both files.
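For the sample data above, the joined result should look like this (tab-separated, one line per order):
201801	mac	1
201802	huawei	2
201803	xiaomi	3
201804	mac	4
201805	huawei	5
201806	xiaomi	6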
1) First define an entity class, TableBean.
The bean holds every field from both tables, plus an extra flag field that marks whether a record comes from the order table or the product table.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class TableBean implements Writable {
    // Fields covering both tables
    private String order_id; // order id
    private String pid;      // product id
    private int amount;      // quantity
    private String pname;    // product name
    private String flag;     // "0" = order record, "1" = product record

    public TableBean() {
        super();
    }

    public String getOrder_id() {
        return order_id;
    }
    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }
    public String getPid() {
        return pid;
    }
    public void setPid(String pid) {
        this.pid = pid;
    }
    public int getAmount() {
        return amount;
    }
    public void setAmount(int amount) {
        this.amount = amount;
    }
    public String getPname() {
        return pname;
    }
    public void setPname(String pname) {
        this.pname = pname;
    }
    public String getFlag() {
        return flag;
    }
    public void setFlag(String flag) {
        this.flag = flag;
    }

    // Deserialization: fields must be read in the same order they are written
    @Override
    public void readFields(DataInput in) throws IOException {
        order_id = in.readUTF();
        pid = in.readUTF();
        amount = in.readInt();
        pname = in.readUTF();
        flag = in.readUTF();
    }

    // Serialization
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(order_id);
        out.writeUTF(pid);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeUTF(flag);
    }

    // Output format: order id, product name, quantity
    @Override
    public String toString() {
        return order_id + "\t" + pname + "\t" + amount;
    }
}
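To sanity-check that write() and readFields() mirror each other, a plain serialization round trip outside of any job is enough. This is only an illustrative sketch: the class name TableBeanRoundTrip is made up, and the sample values are taken from the data above.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean();
        in.setOrder_id("201801");
        in.setPid("01");
        in.setAmount(1);
        in.setPname("mac");
        in.setFlag("0");

        // Serialize the bean into a byte buffer
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));

        // Deserialize into a fresh bean and print it
        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(out); // expected: 201801	mac	1
    }
}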
2) Define the mapper class.
Each input line is wrapped into a TableBean object, and the product id is used as the map output key, so that the order records and the product record sharing one product id land under the same key and reach the reduce phase as a single group.
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        TableBean v = new TableBean();
        Text k = new Text();

        // Tell the two tables apart by the name of the file this split comes from
        FileSplit split = (FileSplit) context.getInputSplit();
        String name = split.getPath().getName();

        // Raw input line
        String line = value.toString();

        if (name.contains("order.txt")) { // order record
            // Split the tab-separated fields
            String[] fields = line.split("\t");
            // Populate the bean
            v.setOrder_id(fields[0]);
            v.setPid(fields[1]);
            v.setAmount(Integer.parseInt(fields[2]));
            v.setPname("");
            v.setFlag("0");
            // The product id is the output key
            k.set(fields[1]);
        } else { // product record
            String[] fields = line.split("\t");
            v.setOrder_id("");
            v.setPid(fields[0]);
            v.setAmount(0);
            v.setPname(fields[1]);
            v.setFlag("1");
            // The product id is the output key
            k.set(fields[0]);
        }
        context.write(k, v);
    }
}
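With the sample data, the mapper emits key/value pairs keyed by product id, so after the shuffle the reducer receives, for key 01, a group roughly like this (fields shown as order_id, pid, amount, pname, flag):
(201801, 01, 1, "", 0)
(201804, 01, 4, "", 0)
("", 01, 0, mac, 1)
The order of values inside the group is not guaranteed, which is why the reducer below uses the flag field rather than the position to tell the records apart.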
3) Define the reducer class.
The values that arrive at the reduce phase for one key are the order records and the product record that share the same product id; the product name from the product record has to be copied into every order record.
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<TableBean> values,
            Context context) throws IOException, InterruptedException {
        // Collects the order records for this product id
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // Holds the single product record; its pname is copied into the order records
        TableBean pdBean = null;

        for (TableBean v : values) {
            // Hadoop reuses the same value object while iterating,
            // so every record must be deep-copied before it is kept
            if ("0".equals(v.getFlag())) { // order record
                try {
                    TableBean ob = new TableBean();
                    BeanUtils.copyProperties(ob, v);
                    orderBeans.add(ob);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else { // product record
                pdBean = new TableBean();
                try {
                    BeanUtils.copyProperties(pdBean, v);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fill in the product name on every order record and emit it
        for (TableBean v : orderBeans) {
            v.setPname(pdBean.getPname());
            context.write(v, NullWritable.get());
        }
    }
}
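For the key-01 group sketched above, this reducer emits the lines 201801 mac 1 and 201804 mac 4; the other groups work the same way. Note that BeanUtils here is org.apache.commons.beanutils.BeanUtils, so the commons-beanutils dependency has to be on the job's classpath. The code also assumes that every product id appearing in order.txt has a matching record in pd.txt; otherwise pdBean would still be null in the final loop.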
4) Define the driver class.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TableDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the driver class
        job.setJarByClass(TableDriver.class);
        // 3. Set the mapper and reducer classes
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);
        // 4. Set the map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        // 5. Set the final (reduce) output key/value types
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);
        // 6. Set the input directory (holding both files) and the output directory
        FileInputFormat.setInputPaths(job, new Path("e:/bigdata/in0113"));
        FileOutputFormat.setOutputPath(job, new Path("e:/bigdata/out0113"));
        // 7. Submit the job and wait for it to finish
        boolean rs = job.waitForCompletion(true);
        System.out.println(rs ? 0 : -1);
    }
}
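The input and output paths are hard-coded to the local file system, which is fine for running the job straight from the IDE (note that the output directory must not exist yet, or Hadoop will refuse to start the job). To submit the same job to a cluster, the usual pattern, shown here only as a sketch, is to take the directories from the command line instead; the jar name table-join.jar below is made up.
// Hypothetical variant of step 6: read the directories from the program arguments,
// e.g. hadoop jar table-join.jar TableDriver /input/dir /output/dir
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));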
In short: in the map phase, give the records that have to be processed together the same key, and do the actual join in the reduce phase.