Hadoop MapReduce example: two-table join


Order table t_order:

id      date        pid     amount
1001    20150710    P0001   2
1002    20150710    P0001   3
1002    20150710    P0002   3

Abstracted as file data:
1001,20150710,P0001,2
1002,20150710,P0001,3
1003,20150710,P0002,3
1002,20150710,P0003,3
1002,20150710,P0002,4


Product table t_product:

id      pname    category_id   price
P0001   小米5    1000          2
P0002   锤子T1   1000          3

Abstracted as file data:
p0001,小米,1000,2
p0002,魅族,1001,3
p0003,oppo,1002,3

1. Requirement:
Suppose the data volume is huge and both tables are stored as files in HDFS. A MapReduce program is needed to implement the following SQL query:

select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id

2. Implementation approach:
Use the join condition (the product id) as the map output key, tag each record with the file it came from, and send the records of both tables that satisfy the join condition to the same reduce task, where they are stitched together. For example, for key P0001 the mappers emit one product record (小米5) tagged "1" and the order records 1001 and 1002 tagged "0"; the reduce call for P0001 then outputs one joined line per order.

3. Code:
Custom bean: its fields follow the format of the joined record.

public class InfoBean implements Writable{

	private int order_id;
	private String dateString;
	private String p_id;
	private int amount;
	private String pname;
	private int category_id;
	private float price;
	private String flag; // extra field: "0" marks an order record, "1" marks a product record
		
	public InfoBean() {}
	public void set(int order_id, String dateString, String p_id, int amount,
			String pname, int category_id, float price,String flag) {
		this.order_id = order_id;
		this.dateString = dateString;
		this.p_id = p_id;
		this.amount = amount;
		this.pname = pname;
		this.category_id = category_id;
		this.price = price;
		this.flag=flag;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(order_id);
		out.writeUTF(dateString);
		out.writeUTF(p_id);
		out.writeInt(amount);
		out.writeUTF(pname);
		out.writeInt(category_id);
		out.writeFloat(price);
		out.writeUTF(flag);
	}
	@Override
	public void readFields(DataInput in) throws IOException {
		this.order_id=in.readInt();
		this.dateString=in.readUTF();
		this.p_id=in.readUTF();
		this.amount=in.readInt();
		this.pname=in.readUTF();
		this.category_id=in.readInt();
		this.price=in.readFloat();
		this.flag=in.readUTF();
	}

	@Override
	public String toString() {
		return "order_id=" + order_id + ", dateString=" + dateString
				+ ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname
				+ ", category_id=" + category_id + ", price=" + price
				+ ", flag=" + flag;
	}
	// getters and setters omitted
}
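The omitted accessors are not optional here: the reducer calls getFlag(), getPname(), getCategory_id() and getPrice() directly, and BeanUtils.copyProperties relies on the JavaBean get/set convention. A minimal sketch of two of the pairs inside InfoBean (the remaining fields follow the same pattern):

	// JavaBean-style accessors, needed by BeanUtils.copyProperties and by the reducer
	public String getFlag() { return flag; }
	public void setFlag(String flag) { this.flag = flag; }
	public String getPname() { return pname; }
	public void setPname(String pname) { this.pname = pname; }
	// ... likewise for order_id, dateString, p_id, amount, category_id and price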

MapReduce code

package join;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class TestJoin {

	static class JoinMapper extends Mapper<LongWritable, Text, Text, InfoBean>{
		InfoBean bean=new InfoBean();
		Text k=new Text();
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String line = value.toString();
			// get the split this mapper is reading
			FileSplit inputSplit = (FileSplit) context.getInputSplit();
			// get the file name
			String name = inputSplit.getPath().getName();
			// the file name tells which table the record belongs to
			String p_id = null;
			if(name.startsWith("order")){
				String[] fields = line.split(",");
			    p_id=fields[2];
			    // fields that the order table does not have are filled with defaults
				bean.set(Integer.parseInt(fields[0]), fields[1], p_id, Integer.parseInt(fields[3]), "", 0, 0, "0");
			}else if(name.startsWith("product")){
				String[] fields = line.split(",");
			    p_id=fields[0];
			    // fields that the product table does not have are filled with defaults
				bean.set(0, "", p_id, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), "1");
			}
			// for every input line, emit p_id as the key and the bean as the value
			k.set(p_id);
			context.write(k, bean);
		}
	}
	static class JoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable>{
		
		@Override
		// each reduce call receives <pid, {pbean, obean1, obean2, ...}>: one product bean and possibly several order beans
		protected void reduce(Text pid, Iterable<InfoBean> beans, Context context)
				throws IOException, InterruptedException {
			InfoBean pbean = new InfoBean();              // holds the product record
			List<InfoBean> obeans = new ArrayList<>();    // holds the order records
			for(InfoBean bean:beans){
				// flag "1" marks the product record
				if("1".equals(bean.getFlag())){
					try {
						BeanUtils.copyProperties(pbean, bean);
					} catch (Exception e) {
						e.printStackTrace();
					}
				}else{
					InfoBean obean = new InfoBean();
					try {
						BeanUtils.copyProperties(obean, bean);
					} catch (Exception e) {
						e.printStackTrace();
					} 
					obeans.add(obean);
				}
			}
			// join: copy the product fields onto every order bean, then emit one joined record per order
			for(InfoBean b : obeans){
				b.setPname(pbean.getPname());
				b.setCategory_id(pbean.getCategory_id());
				b.setPrice(pbean.getPrice());
				context.write(b, NullWritable.get());
			}
		}
	}
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		job.setJarByClass(TestJoin.class);
		job.setMapperClass(JoinMapper.class);
		job.setReducerClass(JoinReducer.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(InfoBean.class);
		
		job.setOutputKeyClass(InfoBean.class);
		job.setOutputValueClass(NullWritable.class);
		
		FileInputFormat.setInputPaths(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		boolean res = job.waitForCompletion(true);
		System.exit(res?0:1);	
	}
}
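To run the job, package these classes into a jar (with commons-beanutils on the classpath) and pass the input and output directories as arguments; the input directory should hold both data files, named so that the order file starts with "order" and the product file with "product", since the mapper tells the tables apart by file name. The jar name below is only a placeholder:

hadoop jar join-example.jar join.TestJoin /data/join/input /data/join/output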

Original article: https://www.2cto.com/net/201710/689450.html
