结合GroupingComparator的例子来说明
输入源数据
从左到右依次为订单id,商品编号和商品价格
需求:输出每个订单中价格最高的商品
思路:
1.map输出的key在shuffle阶段的排序规则:订单号不同时按订单号的字典序排序,订单号相同时按价格降序
2.在reduce分组排序时,只要订单号相同就认为是相同的key,遍历迭代器的时候只取第一个
相关代码
OrderBean.java
package MapReduceGroupComparator;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Composite map-output key holding one line item of an order.
 *
 * <p>Ordering (see {@link #compareTo}): order id ascending, then price descending, so the
 * most expensive item of every order is the first record the reducer sees for that order.
 *
 * <p>All three fields are serialized. The original version omitted {@code pId} from
 * {@link #write}/{@link #readFields}, so the product id set by the mapper was silently
 * lost during the shuffle.
 *
 * <p>{@link #hashCode()} depends on the order id only, so that a hash-based partitioner
 * sends every record of one order to the same reduce task.
 */
public class OrderBean implements WritableComparable<OrderBean> {
    private String orderId = ""; // order id
    private String pId = "";     // product id
    private double price = 0;    // item price

    /**
     * Compares by order id (lexicographic, ascending) and, for equal order ids,
     * by price in descending order.
     *
     * @param o the bean to compare against
     * @return negative, zero or positive following the ordering described above
     */
    @Override
    public int compareTo(OrderBean o) {
        int byOrderId = this.orderId.compareTo(o.orderId);
        if (byOrderId != 0) {
            return byOrderId;
        }
        // Arguments are swapped on purpose: higher price sorts first.
        return Double.compare(o.price, this.price);
    }

    /**
     * Serializes order id, product id and price, in that order.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(orderId);
        out.writeUTF(pId);
        out.writeDouble(price);
    }

    /**
     * Restores the fields in exactly the order {@link #write} emitted them.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.pId = in.readUTF();
        this.price = in.readDouble();
    }

    public String getOrderId() {
        return orderId;
    }

    public void setOrderId(String orderId) {
        this.orderId = orderId;
    }

    public String getpId() {
        return pId;
    }

    public void setpId(String pId) {
        this.pId = pId;
    }

    public double getPrice() {
        return price;
    }

    public void setPrice(double price) {
        this.price = price;
    }

    /** Two beans are equal when order id, product id and price all match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof OrderBean)) {
            return false;
        }
        OrderBean other = (OrderBean) obj;
        return Double.compare(price, other.price) == 0
                && (orderId == null ? other.orderId == null : orderId.equals(other.orderId))
                && (pId == null ? other.pId == null : pId.equals(other.pId));
    }

    /** Hash of the order id only, so all items of an order land in the same partition. */
    @Override
    public int hashCode() {
        return orderId == null ? 0 : orderId.hashCode();
    }

    /** Output format is kept as {@code orderId<TAB>price}. */
    @Override
    public String toString() {
        return orderId + "\t" + price;
    }
}
OrderMapper.java
package MapReduceGroupComparator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Parses one tab-separated input line ({@code orderId, productId, price}) into an
 * {@link OrderBean} and emits it as the map-output key with a {@link NullWritable} value.
 */
public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

    // Single bean instance refilled for every input record.
    OrderBean ob = new OrderBean();

    /**
     * @param key     input key supplied by the framework (not used)
     * @param value   one line of the input file
     * @param context sink for the emitted (bean, null) pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        final String line = value.toString();
        final String[] columns = line.split("\t");

        ob.setOrderId(columns[0]);
        ob.setpId(columns[1]);
        final double parsedPrice = Double.parseDouble(columns[2]);
        ob.setPrice(parsedPrice);

        final NullWritable nothing = NullWritable.get();
        context.write(ob, nothing);
    }
}
OrderReducer.java
package MapReduceGroupComparator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class OrderReducer extends Reducer<OrderBean, NullWritable,OrderBean,NullWritable> {
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key,nw);
}
}
}
OrderGroupComparator.java
package MapReduceGroupComparator;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Grouping comparator: two keys belong to the same reduce group whenever their
 * order ids are equal; price is ignored.
 */
public class OrderGroupComparator extends WritableComparator {

    /**
     * Registers {@link OrderBean} as the key class. The {@code true} flag is
     * WritableComparator's "create instances" switch (see the Hadoop API docs).
     */
    protected OrderGroupComparator() {
        super(OrderBean.class, true);
    }

    /**
     * @param a first key, expected to be an {@link OrderBean}
     * @param b second key, expected to be an {@link OrderBean}
     * @return lexicographic comparison of the two order ids
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final String leftOrderId = ((OrderBean) a).getOrderId();
        final String rightOrderId = ((OrderBean) b).getOrderId();
        return leftOrderId.compareTo(rightOrderId);
    }
}
只要订单号相同就认为是同一个key
OrderDriver.java
package MapReduceGroupComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Configures and submits the "most expensive item per order" job.
 *
 * <p>Usage: {@code OrderDriver [inputPath [outputPath]]}. When an argument is omitted the
 * original hard-coded path is used, so running without arguments behaves as before.
 */
public class OrderDriver {
    /** Input used when no first argument is given. */
    private static final String DEFAULT_INPUT =
            "F:\\Codes\\JavaCodes\\MapReduceLearning\\testdata\\order.txt";
    /** Output directory used when no second argument is given. */
    private static final String DEFAULT_OUTPUT =
            "F:\\Codes\\JavaCodes\\MapReduceLearning\\testdata\\output";

    /**
     * @param args optional input path ({@code args[0]}) and output path ({@code args[1]})
     * @throws IOException            if the job cannot be set up or submitted
     * @throws ClassNotFoundException propagated from {@code Job.waitForCompletion}
     * @throws InterruptedException   propagated from {@code Job.waitForCompletion}
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String inputPath = args != null && args.length > 0 ? args[0] : DEFAULT_INPUT;
        String outputPath = args != null && args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(OrderDriver.class);

        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReducer.class);

        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Group reduce input by order id only, regardless of price.
        job.setGroupingComparatorClass(OrderGroupComparator.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
这样执行的结果是没错的。
但是,如果要改为输出每个订单中价格最高的前两个商品呢?
我尝试把reduce函数改为:
/**
 * Emits the first two records of the group, i.e. the two most expensive items of the order.
 *
 * <p>The original condition {@code ++i < 2} was only true on the first iteration, so a single
 * record was written. The loop now writes up to two records and then stops.
 *
 * <p>{@code key} is written on every iteration on purpose: as the article goes on to show,
 * the framework updates the key as the values iterator advances.
 */
@Override
protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
    int written = 0;
    for (NullWritable nw : values) {
        context.write(key, nw);
        if (++written >= 2) {
            break; // top two emitted; the rest of the group is not needed
        }
    }
}
结果出乎意料,两次的key都不相同。
疑问:在这里只迭代values,为什么key也会跟着变?
看源码
reducer源码
// Stepping through with a debugger shows that, once the map phase is done and a reduce task
// starts, the reduce side calls this method first. It is invoked on our own Reducer subclass
// (inherited from Reducer), and it in turn calls the reduce() method our subclass overrides.
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKey()) {// calls ReduceContextImpl.nextKey() to decide whether another key group exists
reduce(context.getCurrentKey(), context.getValues(), context);// calls our subclass's reduce(), i.e. our own logic runs here
// If a back up store is used, reset it
Iterator<VALUEIN> iter = context.getValues().iterator();
if(iter instanceof ReduceContext.ValueIterator) {
((ReduceContext.ValueIterator<VALUEIN>)iter).resetBackupStore();
}
}
} finally {
cleanup(context);
}
}
}
ReduceContextImpl部分源码
// Abridged excerpt of Hadoop's ReduceContextImpl: several fields used below (comparator, buffer,
// currentRawKey, backupStore, counters, ...) are not declared in this excerpt.
// It shows how the current key/value are advanced while the reducer iterates over its values.
public class ReduceContextImpl {
private RawKeyValueIterator input;// iterator over the key-value pairs fed to this reduce task
private KEYIN key; // current key
private VALUEIN value; // current value
private boolean firstValue = false; // first value in key
private boolean nextKeyIsSame = false; // more w/ this key
private boolean hasMore; // more in file
private ValueIterable iterable = new ValueIterable();// instance of this class's own inner ValueIterable
public ReduceContextImpl() throws InterruptedException, IOException{
hasMore = input.next();// on construction: check whether the key-value iterator has any element and move to it
}
/** Called once the context exists (see Reducer.run); starts processing the next unique key. */
public boolean nextKey() throws IOException,InterruptedException {
while (hasMore && nextKeyIsSame) {
// while there are more elements and the next one has the same key as the pair just read,
// skip over it (these are values of the current group the reducer did not consume)
nextKeyValue();
}
if (hasMore) {
if (inputKeyCounter != null) {
inputKeyCounter.increment(1);
}
return nextKeyValue();
} else {
return false;
}
}
/**
* Advance to the next key/value pair.
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if (!hasMore) {
key = null;
value = null;
return false;
}
firstValue = !nextKeyIsSame;
// fetch the key of the iterator's next element
DataInputBuffer nextKey = input.getKey();
// record the extent of the current key's raw bytes
currentRawKey.set(nextKey.getData(), nextKey.getPosition(),
nextKey.getLength() - nextKey.getPosition());
buffer.reset(currentRawKey.getBytes(), 0, currentRawKey.getLength());
// deserialize into the current key; the previous key object is passed in
// (presumably refilled in place - this is why the reducer sees its key change)
key = keyDeserializer.deserialize(key);
// fetch the value of the iterator's next element
DataInputBuffer nextVal = input.getValue();
buffer.reset(nextVal.getData(), nextVal.getPosition(), nextVal.getLength()
- nextVal.getPosition());
// deserialize the value (the previous value object is passed in as well)
value = valueDeserializer.deserialize(value);
currentKeyLength = nextKey.getLength() - nextKey.getPosition();
currentValueLength = nextVal.getLength() - nextVal.getPosition();
if (isMarked) {
// save this key and value in the backup store
backupStore.write(nextKey, nextVal);
}
// advance the underlying iterator by one element
hasMore = input.next();
// if there is another element, compare its key with the current one
if (hasMore) {
nextKey = input.getKey();
// This is the crucial step: the comparator (presumably the job's grouping comparator)
// decides on the raw bytes whether the next key counts as "the same" as the current key.
nextKeyIsSame = comparator.compare(currentRawKey.getBytes(), 0,
currentRawKey.getLength(),
nextKey.getData(),
nextKey.getPosition(),
nextKey.getLength() - nextKey.getPosition()
) == 0;
} else {
nextKeyIsSame = false;
}
inputValueCounter.increment(1);
return true;
}
// inner class implementing the iterator pattern over the values of the current key
protected class ValueIterator implements ReduceContext.ValueIterator<VALUEIN> {
private boolean inReset = false;
private boolean clearMarkFlag = false;
@Override// not merely "is there another element": it also requires the next element to have the same key as the current one
public boolean hasNext() {
if (inReset && backupStore.hasNext()) {
return true;
}
return firstValue || nextKeyIsSame;
}
@Override
// Note: fetching the next element mostly comes down to calling nextKeyValue()
public VALUEIN next() {
if (inReset) {
if (backupStore.hasNext()) {
backupStore.next();
DataInputBuffer next = backupStore.nextValue();
buffer.reset(next.getData(), next.getPosition(), next.getLength()
- next.getPosition());
value = valueDeserializer.deserialize(value);
return value;
} else {
inReset = false;
backupStore.exitResetMode();
if (clearMarkFlag) {
clearMarkFlag = false;
isMarked = false;
}
}
}
// if this is the first record, we don't need to advance
if (firstValue) {
firstValue = false;
return value;
}
// otherwise, go to the next key/value pair
nextKeyValue();// reads the next key-value pair; this is where the key changes along with the value
return value;
}
}
// inner class implementing Iterable; always hands out the same ValueIterator instance
protected class ValueIterable implements Iterable<VALUEIN> {
private ValueIterator iterator = new ValueIterator();
@Override
public Iterator<VALUEIN> iterator() {
return iterator;
}
}
public Iterable<VALUEIN> getValues() throws IOException, InterruptedException {
return iterable;
}
}
总结
ReduceContextImpl类中的RawKeyValueIterator input迭代器里存储着key-value对元素,另外还有一个只对外暴露value的迭代器ValueIterable。框架每调用一次我们实现的reduce方法,传入的就是当前的key和这个ValueIterable迭代器对象。但是我们在方法里调用迭代器的next方法时,实际上调用了nextKeyValue来获取下一个key和value,并判断再下一个key是否与当前key相同,以此决定hasNext方法何时返回false,同时对key进行了一次重新赋值。
nextKeyValue方法获取KV迭代器的下一个KV值,然后把K值和V值放到之前传入我们自己写的Reduce类的reduce方法的那两个输入参数对象所在的地址上。白话说:框架调用我们写的reduce方法时,传入了三个参数;我们在方法内部遍历values时调用的hasNext和next方法,就是ReduceContextImpl的内部类ValueIterator的hasNext和next方法;其中next方法调用了ReduceContextImpl内的nextKeyValue方法,该方法先获取RawKeyValueIterator input迭代器的下一个KV值,再把K值和V值反序列化到调用用户自定义reduce方法时传入的k、v参数对象中,覆盖掉其中原来的数据。这就是原因了。
源代码及分析部分引用:https://www.cnblogs.com/intsmaze/p/6737337.html
感谢前辈的分享!