Background
There are two files, customer.txt and orders.txt, which hold customer information and customer order information respectively. We want to implement a database-style join on them, similar to "select a.*,b.* from A a,B b where a.cid = b.cid;".
//customer.txt holds the customer information
//customer id, name, age
//cid,name,age
1,tom1,12
2,tom2,13
3,tom3,14
4,tom4,15
//orders.txt holds the customer order information
//order id, price, customer id
//oid,price,cid
1,12.23,1
2,12.48,1
3,12.23,2
4,12.56,2
5,12.23,2
6,15.55,3
7,16.23,3
8,18.78,3
9,20.23,3
Approach
The map method reads both files, builds a custom composite key from each line, and sends it to the reduce side for sorting and grouping. Within each group that reaches a reduce function, the first record is the customer's information and the remaining records are that customer's orders.
We take cid modulo numPartitions for the records from both files, so the customer information read from customer.txt and the order information read from orders.txt that share a cid end up in the same partition and are sent to the same reduce task.
On the reduce side we group the records by cid, so a customer's information and all of that customer's orders are handled by a single reduce function, which then iterates over the group to perform the join.
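For example (a sketch based on the sample data above, with the two reduce tasks configured later in App): cid % 2 sends cid=2 and cid=4 to partition 0, and cid=1 and cid=3 to partition 1. Inside partition 1, the sort comparator arranges the composite keys of the cid=3 group as
(cid=3, type=0) "3,tom3,14"         <- the customer record comes first
(cid=3, type=1, oid=6) "6,15.55,3"
(cid=3, type=1, oid=7) "7,16.23,3"
(cid=3, type=1, oid=8) "8,18.78,3"
(cid=3, type=1, oid=9) "9,20.23,3"
and the grouping comparator hands all five keys to a single reduce() call.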
Implementation
The custom composite key, populated from the lines read out of customer.txt and orders.txt.
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Composite key for a customer and that customer's orders.
 * One customer can have many orders.
 * A composite key must be serializable and comparable, so it implements WritableComparable.
 */
public class CustomerOrdersComboKey implements WritableComparable<CustomerOrdersComboKey> {
// customer id
private int cid;
// order id
private int oid;
/**
 * The composite key is built from either customer.txt or orders.txt,
 * so keys come in two types:
 * type == 0 means this key carries customer information
 * type == 1 means this key carries order information
 */
private int type;
// customer info string
private String customerInfo = "";
// order info string
private String orderInfo = "";
/**
 * Sort comparison between two composite keys.
 * Compare the cids first to decide whether the two keys belong to the same customer
 * (each key carries either a customer record from customer.txt or an order record from orders.txt).
 * Same cid and same type: sort by order oid ascending.
 * Same cid but different type: the customer record (type == 0) sorts before the order records (type == 1),
 * i.e. return -1 for type == 0 and 1 for type == 1.
 * Different cids: sort by cid ascending.
 */
public int compareTo(CustomerOrdersComboKey key2) {
int cid2 = key2.getCid();
int oid2 = key2.getOid();
int type2 = key2.getType();
if(cid == cid2){ // do the two keys belong to the same customer?
if(type == type2){
return oid - oid2; // same type: sort by order oid ascending
}else{
if(type == 0){
return -1;
}else{
return 1;
}
}
}else{
return cid - cid2;
}
}
// serialization
public void write(DataOutput out) throws IOException {
out.writeInt(cid);
out.writeInt(oid);
out.writeInt(type);
out.writeUTF(customerInfo);
out.writeUTF(orderInfo);
}
// deserialization
public void readFields(DataInput in) throws IOException {
this.cid = in.readInt();
this.oid = in.readInt();
this.type = in.readInt();
this.customerInfo = in.readUTF();
this.orderInfo = in.readUTF();
}
public int getCid() {
return cid;
}
public void setCid(int cid) {
this.cid = cid;
}
public int getOid() {
return oid;
}
public void setOid(int oid) {
this.oid = oid;
}
public int getType() {
return type;
}
public void setType(int type) {
this.type = type;
}
public String getCustomerInfo() {
return customerInfo;
}
public void setCustomerInfo(String customerInfo) {
this.customerInfo = customerInfo;
}
public String getOrderInfo() {
return orderInfo;
}
public void setOrderInfo(String orderInfo) {
this.orderInfo = orderInfo;
}
}
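A minimal local check (a sketch; the class name ComboKeySerializationCheck and the sample values are assumptions, not part of the original post) that round-trips a composite key through write() and readFields() to confirm the two methods read and write the fields in the same order:
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
public class ComboKeySerializationCheck {
    public static void main(String[] args) throws Exception {
        CustomerOrdersComboKey before = new CustomerOrdersComboKey();
        before.setCid(3);
        before.setOid(6);
        before.setType(1);
        before.setOrderInfo("6,15.55,3");
        // serialize into a byte buffer
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));
        // deserialize into a fresh key
        CustomerOrdersComboKey after = new CustomerOrdersComboKey();
        after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        // expected output: 3 6 1 6,15.55,3
        System.out.println(after.getCid() + " " + after.getOid() + " " + after.getType() + " " + after.getOrderInfo());
    }
}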
The custom partitioner: the customer record and the order records that share a cid all go to the same partition.
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partition by customer cid so that a customer's record and that customer's orders land in the same partition.
 */
public class PartionByCid extends Partitioner<CustomerOrdersComboKey,NullWritable> {
public int getPartition(CustomerOrdersComboKey customerOrdersComboKey, NullWritable nullWritable, int numPartitions) {
return customerOrdersComboKey.getCid() % numPartitions;
}
}
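The partition index returned by getPartition must be non-negative. With the sample data cid is always positive, so getCid() % numPartitions is enough; if negative ids were ever possible, a defensive variant (a sketch, the class name PartitionByCidSafe is an assumption) could mask the sign bit:
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class PartitionByCidSafe extends Partitioner<CustomerOrdersComboKey,NullWritable> {
    @Override
    public int getPartition(CustomerOrdersComboKey key, NullWritable value, int numPartitions) {
        // mask the sign bit so the result always falls in [0, numPartitions)
        return (key.getCid() & Integer.MAX_VALUE) % numPartitions;
    }
}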
The sort comparator for the composite key
It sorts by customer cid, type, and order oid by delegating to the composite key's compareTo method.
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Sort comparator for the composite key.
 * Passing true to super() makes WritableComparator create key instances and deserialize them before comparing.
 */
public class ComparatorOfComboKey extends WritableComparator {
protected ComparatorOfComboKey() {
super(CustomerOrdersComboKey.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
CustomerOrdersComboKey key1 = (CustomerOrdersComboKey)a;
CustomerOrdersComboKey key2 = (CustomerOrdersComboKey)b;
return key1.compareTo(key2);
}
}
The grouping comparator on the reduce side
Composite keys with the same cid are handled by a single reduce function; unlike the sort comparator, it compares only the cid.
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator: groups composite keys by cid only.
 */
public class GroupComparator extends WritableComparator {
protected GroupComparator() {
super(CustomerOrdersComboKey.class,true);
}
// group by cid: records with the same customer id fall into the same group
@Override
public int compare(WritableComparable a, WritableComparable b) {
CustomerOrdersComboKey key1 = (CustomerOrdersComboKey)a;
CustomerOrdersComboKey key2 = (CustomerOrdersComboKey)b;
return key1.getCid() - key2.getCid();
}
}
The mapper
The map method reads one line at a time from either input file, builds a composite key object, and emits it to the reduce side. (Note that the class is itself named Mapper, which is why it extends Hadoop's Mapper by its fully qualified name.)
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class Mapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable,Text,CustomerOrdersComboKey,NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
FileSplit fileSplit = (FileSplit)context.getInputSplit();
String filePath = fileSplit.getPath().toString();
CustomerOrdersComboKey keyOut = new CustomerOrdersComboKey();
// value is one line of the txt file
String line = value.toString();
if(filePath.contains("customer")){ // the line comes from the customer file
String cid = line.substring(0,line.indexOf(",")); // the customer cid is before the first comma
String customerInfo = line;
keyOut.setType(0);
keyOut.setCid(Integer.parseInt(cid));
keyOut.setCustomerInfo(customerInfo);
}else{
String cid = line.substring(line.lastIndexOf(",")+1); // the customer cid is after the last comma
String oid = line.substring(0,line.indexOf(",")); // the order oid is before the first comma
String orderInfo = line;
keyOut.setType(1);
keyOut.setCid(Integer.parseInt(cid));
keyOut.setOid(Integer.parseInt(oid));
keyOut.setOrderInfo(orderInfo);
}
context.write(keyOut,NullWritable.get());
}
}
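For example (a sketch of what the mapper emits, not output printed by the code): the customer line 1,tom1,12 becomes a key with cid=1, type=0, customerInfo="1,tom1,12"; the order line 2,12.48,1 becomes a key with cid=1, oid=2, type=1, orderInfo="2,12.48,1". The value written alongside every key is NullWritable.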
The reducer processes each group of keys and values sent to it
Each reduce method call handles one group; given the sort and grouping rules above, the first record is a customer's information and the records from the second to the last are that customer's orders. Note that Hadoop reuses the key object: as the values iterator advances, the framework deserializes the next record's composite key into that same key instance, which is why the loop below reads the order information from key rather than from the values.
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.util.Iterator;
public class Reducer extends org.apache.hadoop.mapreduce.Reducer<CustomerOrdersComboKey,NullWritable,Text,NullWritable> {
@Override
protected void reduce(CustomerOrdersComboKey key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
Iterator<NullWritable> iterator = values.iterator();
iterator.next(); // consume the first value; key still holds the first record of the group
if(0 == key.getType()){ // the first record is the customer's information
String customerInfo = key.getCustomerInfo();
int cid = key.getCid();
while(iterator.hasNext()){ // the remaining records are this customer's orders
iterator.next(); // advancing the iterator also updates the fields of key
int cid_order = key.getCid();
if(cid_order == cid && 1 == key.getType()){
String orderInfo = key.getOrderInfo();
context.write(new Text(customerInfo +" ; "+ orderInfo),NullWritable.get());
}
}
}
}
}
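A sketch of one reduce() call on the sample data, for the cid=2 group: after the first iterator.next(), key holds cid=2, type=0, customerInfo="2,tom2,13"; the three following next() calls update key to the order records "3,12.23,2", "4,12.56,2" and "5,12.23,2", and each loop iteration writes one joined line such as 2,tom2,13 ; 3,12.23,2. The group for customer 4 contains only the customer record, so nothing is written for it.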
The driver class
Configuration for the whole job. It runs against the local file system (fs.defaultFS = file:///); the input directory /home/hadoop/join is expected to contain customer.txt and orders.txt, and the output directory /home/hadoop/join/out must not exist before the job starts.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Job driver / launch class
 */
public class App {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
Job job = Job.getInstance(conf);
job.setJobName("join operation");
job.setJarByClass(App.class);
job.setMapperClass(Mapper.class);
job.setReducerClass(Reducer.class);
job.setMapOutputKeyClass(CustomerOrdersComboKey.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(2);
job.setSortComparatorClass(ComparatorOfComboKey.class);
job.setGroupingComparatorClass(GroupComparator.class);
job.setPartitionerClass(PartionByCid.class);
FileInputFormat.setInputPaths(job,new Path("/home/hadoop/join"));
FileOutputFormat.setOutputPath(job,new Path("/home/hadoop/join/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
The expected result
part-r-00000 contains the customer with cid=2 joined with their orders (2 % 2 = 0).
part-r-00001 contains the customers with cid=1 and cid=3 joined with their orders (1 % 2 = 3 % 2 = 1).
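Given the sample data, the output should look roughly like this (a sketch; each line is customerInfo ; orderInfo, and customer 4 does not appear because it has no orders):
part-r-00000:
2,tom2,13 ; 3,12.23,2
2,tom2,13 ; 4,12.56,2
2,tom2,13 ; 5,12.23,2
part-r-00001:
1,tom1,12 ; 1,12.23,1
1,tom1,12 ; 2,12.48,1
3,tom3,14 ; 6,15.55,3
3,tom3,14 ; 7,16.23,3
3,tom3,14 ; 8,18.78,3
3,tom3,14 ; 9,20.23,3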