需求:
假如数据量巨大,两表的数据是以文件的形式存储在hdfs中,需要MapReduce程序来实现以下SQL查询运算
select a.id,a.date,b.name.b.category_id,b.price
from
t_ordet a left join t_product b on a.pid = b.id
商品表: id pname category_id price p0001 小米5 1000 2000 p0002 锤子T1 1000 3000 ------------------------------------------ 订单表: id date pid amount 1001 20150710 p0001 2 1002 20150710 p0002 3
Map程序:
public class joinMap extends Mapper<LongWritable, Text, Text, Text> {
//map方法将K1,V1转成K2,V2
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//1、判断数据来自哪个文件
FileSplit fileSplit = (FileSplit) context.getInputSplit();
String fileName = fileSplit.getPath().getName();
if (fileName.equals("product.txt")) {
//判断数据来自商品表
String[] split = value.toString().split(",");
String pId = split[0];//K2商品id
context.write(new Text(pId), value);
} else {
//判断数据来自订单表
String[] split = value.toString().split(",");
String pId = split[2];//K2商品id
context.write(new Text(pId), value);
}
}
}
Reduce程序:
/*
K2 :商品id (p0001)
V2:行文本数据(V1)集合
-------------------
K3:商品id (p0001)
V3:商品表和订单表数据的拼接 p0001,小米5,1000,2000 1001,20150710,p0001,2
*/
public class joinReduce extends Reducer<Text, Text, Text, Text> {
@Override
protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
//K3为商品id,即K2的值;V3通过遍历V2集合,然后进行拼接字符串。
//1、遍历V2集合,拿到V3
String first = "";//拿到的第一个商品表的数据
String second = "";//拿到的第二个订单表的数据
for (Text value : values) {
if (value.toString().startsWith("p")) {
first = value.toString();
} else {
second += value.toString();
}
}
context.write(key, new Text(first + "\t" + second));
}
}
Job程序:
public class joinJob extends Configured implements Tool {
@Override
public int run(String[] strings) throws Exception {
Configuration conf = getConf();
String input = conf.get("input");
String output = conf.get("output");
//1,获取job对象
Job job = Job.getInstance(conf,"reduce_join");
job.setJarByClass(getClass());
//2,设置job任务
//2.1设置输入类和输入路径
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path(input));
//2.2map程序设置
job.setMapperClass(joinMap.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
//2.3reduce程序设置
job.setReducerClass(joinReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
//2.4设置输出类和路径
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job,new Path(output));
//3,等待job任务结束
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception {
//启动job任务
int run = ToolRunner.run(new joinJob(), args);
System.exit(run);
}
}