Background
This is the MapReduce counterpart of a JOIN in SQL: two tables are related by a key and have to be queried together.
Order table (order ID, date, product ID, quantity):
1001 20170710 4 2
1002 20170710 3 100
1003 20170710 2 40
1004 20170711 2 23
1005 20170823 4 55
1006 20170824 3 20
1007 20170825 2 3
1008 20170826 4 23
1009 20170912 2 10
1010 20170913 2 2
1011 20170914 3 14
1012 20170915 3 18
Product table (product ID, name, unit price):
1 chuizi 3999
2 huawei 3999
3 xiaomi 2999
4 apple 5999
The goal is to compute the total sales revenue for each product.
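For the sample data above, the job should output one line per product that actually appears in an order (chuizi is never ordered, so it is absent). For example, huawei sells 40 + 23 + 3 + 10 + 2 = 78 units at 3999 each, i.e. 311922. The expected result is:
apple	479920.0
huawei	311922.0
xiaomi	455848.0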
Code
Register the small file in the distributed cache, then load its rows into an in-memory Map.
Driver
When a job has to combine several related files, register the smaller one in the distributed cache so every mapper can read it locally.
package cn.tedu.join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
public class JoinDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JoinDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Order.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // When a job reads several related files, put the smaller one into the
        // distributed cache so every mapper can load it locally
        URI[] files = {URI.create("hdfs://hadoop01:9000/txt/union/product.txt")};
        job.setCacheFiles(files);
        // The large file is the normal job input
        FileInputFormat.addInputPath(job,
                new Path("hdfs://hadoop01:9000/txt/union/order.txt"));
        FileOutputFormat.setOutputPath(job,
                new Path("hdfs://hadoop01:9000/result/join"));
        job.waitForCompletion(true);
    }
}
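To submit the job, package the classes into a jar and run it from a node that can reach hadoop01 (the jar name join.jar is just a placeholder):
hadoop jar join.jar cn.tedu.join.JoinDriver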
Model class
Represents one row of the joined result set, like a row returned by a SQL query.
package cn.tedu.join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Order implements Writable {

    private String orderId = "";
    private String date = "";
    private String proId = "";
    private int num;
    private String name = "";
    private double price;

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getProId() {
        return proId;
    }

    public void setProId(String proId) {
        this.proId = proId;
    }

    public int getNum() {
        return num;
    }

    public void setNum(int num) {
        this.num = num;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public double getPrice() {
        return price;
    }

    public void setPrice(double price) {
        this.price = price;
    }

    // Serialization: the write order here must match the read order in readFields
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeUTF(date);
        out.writeUTF(proId);
        out.writeInt(num);
        out.writeUTF(name);
        out.writeDouble(price);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.date = in.readUTF();
        this.proId = in.readUTF();
        this.num = in.readInt();
        this.name = in.readUTF();
        this.price = in.readDouble();
    }
}
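Since write() and readFields() must use exactly the same field order, a quick local round-trip check helps catch mismatches early. This is a minimal sketch using only the Order class above and plain java.io streams:

import java.io.*;

public class OrderRoundTrip {
    public static void main(String[] args) throws IOException {
        Order o = new Order();
        o.setProId("2");
        o.setName("huawei");
        o.setPrice(3999);
        // Serialize the object the same way Hadoop does between map and reduce
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        o.write(new DataOutputStream(bos));
        // Read it back into a fresh instance; the fields must come out unchanged
        Order copy = new Order();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy.getName() + " " + copy.getPrice()); // huawei 3999.0
    }
}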
Mapper
Hadoop calls setup() once for initialization before any map() calls.
A HashMap holds the small product table so each order row can be joined to its product via the foreign key proId.
package cn.tedu.join;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
public class JoinMapper extends Mapper<LongWritable, Text, Text, Order> {

    private Map<String, Order> map = new HashMap<>();

    // product.txt has to be available before any order line is processed,
    // so it is loaded in setup()
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Take the URI of product.txt back out of the distributed cache
        URI file = context.getCacheFiles()[0];
        // Connect to HDFS
        FileSystem fs = FileSystem.get(file, context.getConfiguration());
        // Open the file, which yields a byte input stream
        InputStream in = fs.open(new Path(file.toString()));
        // The data is line-oriented, so reading the raw byte stream would mean
        // finding the line breaks by hand; wrap it in a character stream that
        // can read whole lines instead
        // BufferedReader -> String, LineReader -> Text
        // Each line still needs parsing after it is read, so plain Strings are convenient
        BufferedReader reader = new BufferedReader(new InputStreamReader(in));
        // Read the small table into the in-memory map, keyed by product id
        String line;
        while ((line = reader.readLine()) != null) {
            // e.g. 1 chuizi 3999
            String[] arr = line.split(" ");
            Order o = new Order();
            o.setProId(arr[0]);
            o.setName(arr[1]);
            o.setPrice(Double.parseDouble(arr[2]));
            map.put(o.getProId(), o);
        }
        // Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // e.g. 1003 20170710 2 40
        String[] arr = value.toString().split(" ");
        Order o = new Order();
        o.setOrderId(arr[0]);
        o.setDate(arr[1]);
        o.setProId(arr[2]);
        o.setNum(Integer.parseInt(arr[3]));
        // Join: look the product up by its id (assumes every proId exists in product.txt)
        Order product = map.get(o.getProId());
        o.setName(product.getName());
        o.setPrice(product.getPrice());
        context.write(new Text(o.getName()), o);
    }
}
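Because the join already happens on the map side, the reducer only ever uses num and price. A leaner variant (a sketch, not the original code) could emit the line total directly; the driver would then declare DoubleWritable as the map output value class and the reducer would just sum doubles:

// Alternative map() for JoinMapper (requires importing org.apache.hadoop.io.DoubleWritable
// and changing the Mapper output types to <LongWritable, Text, Text, DoubleWritable>)
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] arr = value.toString().split(" ");
    Order product = map.get(arr[2]); // arr[2] is the product id
    context.write(new Text(product.getName()),
            new DoubleWritable(Integer.parseInt(arr[3]) * product.getPrice()));
}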
Reducer
package cn.tedu.join;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class JoinReducer extends Reducer<Text, Order, Text, DoubleWritable> {

    @Override
    protected void reduce(Text key, Iterable<Order> values, Context context) throws IOException, InterruptedException {
        // Sum quantity * unit price over all orders of this product
        double sum = 0;
        for (Order val : values) {
            sum += val.getNum() * val.getPrice();
        }
        context.write(key, new DoubleWritable(sum));
    }
}