TopN案例
目的:统计数据中的数字排行Top5
输入数据:
# donghan
lvbu 100
yuanshao 80
yanliang 94
wenchou 95
huaxiong 89
mateng 85
guosi 91
lijue 92
# wei
dianwei 97
xuchu 96
xiahoudun 93
xiahouyuan 92
zhangliao 91
caocao 85
wenyuan 95
# shu
gaunyu 97
zhangfei 99
zhaoyun 98
machao 98
huangzhong 85
weiyan 93
liubei 80
# wu
ganning 95
zhouyu 90
luxun 75
sunce 96
输出数据:按照数字从大到小排序后的前5条记录(Top5)
Bean
public class TopNBean implements WritableComparable<TopNBean> {
private String name;
private int num;
public TopNBean () {
super();
}
@Override
public int compareTo (TopNBean topNBean) {
// 如果是0会认为是同一个,因此还需要名字进行排序
if (num > topNBean.getNum()) {
return -1;
} else if (num < topNBean.getNum()) {
return 1;
} else {
return name.compareToIgnoreCase(topNBean.name);
}
}
@Override
public void write (DataOutput out) throws IOException {
out.writeUTF(name);
out.writeInt(num);
}
@Override
public void readFields (DataInput in) throws IOException {
name = in.readUTF();
num = in.readInt();
}
@Override
public String toString () {
return name + "\t" + num;
}
...
}
Mapper
/**
 * Mapper that keeps only the highest-scoring records it has seen and emits
 * them once, in {@link #cleanup}, instead of emitting every input line.
 */
public class TopNMapper extends Mapper<LongWritable, Text, TopNBean, NullWritable> {
    /** Number of records to retain. */
    private static final int TOP_N = 5;

    // Sorted container for the current candidates; ordering comes from TopNBean.compareTo
    // (only one sort field today, more may be added later).
    TreeMap<TopNBean, String> treeMap = new TreeMap<>();

    /**
     * Parses one {@code name score} line and adds it to the candidate set.
     *
     * <p>Blank lines, section headers starting with {@code #} (e.g. {@code # wei})
     * and lines with fewer than two fields are skipped; previously such lines made
     * the task fail with a NumberFormatException or ArrayIndexOutOfBoundsException.
     *
     * @param key     input key supplied by the framework (unused)
     * @param value   one line of input text
     * @param context task context (unused here; output is written in cleanup)
     * @throws NumberFormatException if the second field of a data line is not an integer
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty() || line.startsWith("#")) {
            return;
        }
        // Split on any run of whitespace so tabs or repeated spaces are tolerated.
        String[] fields = line.split("\\s+");
        if (fields.length < 2) {
            return;
        }
        TopNBean topNBean = new TopNBean();
        topNBean.setName(fields[0]);
        topNBean.setNum(Integer.parseInt(fields[1]));
        treeMap.put(topNBean, topNBean.getName());
        // Cap the container: once it exceeds TOP_N, drop the entry that sorts last (the smallest score).
        if (treeMap.size() > TOP_N) {
            treeMap.pollLastEntry();
        }
    }

    /**
     * Emits the retained records in sorted order after all input has been processed.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (TopNBean bean : treeMap.keySet()) {
            context.write(bean, NullWritable.get());
        }
    }
}
Reducer
/**
 * Reducer that merges the candidates received from the mappers, keeps the
 * five that sort first, and writes them out in {@link #cleanup}.
 */
public class TopNReducer extends Reducer<TopNBean, NullWritable, TopNBean, NullWritable> {
    // Sorted by TopNBean.compareTo; holds at most five entries between calls.
    TreeMap<TopNBean, String> treeMap = new TreeMap<>();

    /**
     * Copies the incoming key into the sorted container once per value and
     * trims the container back to five entries.
     *
     * @param key     the bean for the current group
     * @param values  the values grouped under the key
     * @param context task context (unused here; output is written in cleanup)
     */
    @Override
    protected void reduce(TopNBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        for (NullWritable ignored : values) {
            // A fresh bean must be created inside the loop rather than storing `key` itself.
            TopNBean copy = new TopNBean();
            copy.setNum(key.getNum());
            copy.setName(key.getName());
            treeMap.put(copy, copy.getName());
            // Keep at most five entries: evict the one that sorts last (smallest score).
            if (treeMap.size() > 5) {
                treeMap.pollLastEntry();
            }
        }
    }

    /**
     * Writes the retained beans, in sorted order, once all groups have been reduced.
     */
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (TopNBean bean : treeMap.keySet()) {
            context.write(bean, NullWritable.get());
        }
    }
}
输出结果如下:
lvbu 100
zhangfei 99
machao 98
zhaoyun 98
dianwei 97