Find the 30 keywords with the highest search frequency across all records.
The job runs in two main steps: first, several mappers each process a portion of the data and count the keywords in their own input splits; these partial counts are then aggregated in the reducer to produce the final frequency statistics, as illustrated by the sketch below.
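As a rough illustration of this flow, the following minimal standalone sketch (outside Hadoop, with hypothetical partial counts) shows how the per-mapper counts of one keyword are summed at the reduce stage:
import java.util.Arrays;
import java.util.List;

public class DataFlowSketch {
    public static void main(String[] args) {
        // Hypothetical partial counts for one keyword, each produced by a different
        // map task that counted its own input split locally.
        List<Long> partialCounts = Arrays.asList(3L, 5L, 2L);
        // After the shuffle groups these values under the same keyword,
        // the reducer simply sums them to obtain the global frequency.
        long total = 0;
        for (long c : partialCounts) {
            total += c;
        }
        System.out.println("keyword\t" + total); // keyword   10
    }
}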
CountWordMapper
The Mapper processes one input split at a time and stores each keyword and its count in a HashMap keyed by the query keyword; counting locally like this reduces the amount of intermediate data sent to the reducer. When the map task finishes, it emits a list of <Text, LongWritable> pairs holding the keywords and counts for its split, which the reducer then aggregates.
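To make the mapper's input and output concrete, here is a minimal standalone sketch. The records are hypothetical: it only assumes tab-separated lines with six fields and the query keyword in the third field, which is the layout the mapper code below expects.
import java.util.HashMap;
import java.util.Map;

public class MapperSketch {
    public static void main(String[] args) {
        // Hypothetical log records: 6 tab-separated fields, keyword at index 2.
        // All field values are placeholders.
        String[] lines = {
            "00:00:01\tu0001\texample keyword\t1\t1\thttp://example.com/a",
            "00:00:02\tu0002\texample keyword\t2\t1\thttp://example.com/b",
            "00:00:03\tu0003\tanother keyword\t1\t1\thttp://example.com/c"
        };
        // In-mapper combining: count keywords locally in a HashMap, then emit
        // one (keyword, count) pair per distinct keyword when the split is done.
        Map<String, Long> counts = new HashMap<>();
        for (String line : lines) {
            String[] fields = line.split("\t");
            if (fields.length != 6) continue;       // skip malformed records
            counts.merge(fields[2], 1L, Long::sum); // keyword is the third field
        }
        counts.forEach((k, v) -> System.out.println(k + "\t" + v));
        // example keyword   2
        // another keyword   1
    }
}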
CountWordReducer
The Reducer aggregates the keywords and counts produced by all mappers; identical keywords coming from different mappers are merged here, yielding each keyword's total frequency. To obtain the Top-K entries, the reducer maintains a min-heap of size K: whenever a keyword's total frequency has been computed, a Pair<String, Long> is inserted into the heap, which is ordered by the Long frequency value. When the heap grows beyond K elements, the top element (the current minimum) is removed, so the heap always holds the K most frequent keywords seen so far. After all keys are processed, the reducer sorts the heap contents by frequency and writes them out. Note that this yields a global Top-K only when the job runs with a single reduce task (set in the driver below); with several reducers, each would produce only its own local Top-K.
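The heap logic itself can be illustrated outside Hadoop. The following minimal sketch uses hypothetical frequency totals and java.util's SimpleEntry in place of the Pair class used in the reducer:
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Map.Entry;
import java.util.PriorityQueue;

public class TopKSketch {
    public static void main(String[] args) {
        final int K = 3; // the real job uses K = 30
        // Hypothetical (keyword, total frequency) pairs, in the order the reducer
        // would finish computing them.
        String[] keywords = {"kw0", "kw1", "kw2", "kw3", "kw4", "kw5"};
        long[] totals = {7, 42, 3, 19, 25, 11};
        // Min-heap ordered by frequency: the least frequent of the current Top-K is on top.
        PriorityQueue<Entry<String, Long>> minHeap =
                new PriorityQueue<>((a, b) -> Long.compare(a.getValue(), b.getValue()));
        for (int i = 0; i < totals.length; i++) {
            minHeap.add(new SimpleEntry<>(keywords[i], totals[i]));
            if (minHeap.size() > K) {
                minHeap.poll(); // evict the least frequent entry, keeping only the Top-K
            }
        }
        // Drain the heap and sort descending for output, as the reducer's cleanup() does.
        List<Entry<String, Long>> topK = new ArrayList<>(minHeap);
        topK.sort((a, b) -> Long.compare(b.getValue(), a.getValue()));
        for (Entry<String, Long> e : topK) {
            System.out.println(e.getKey() + "\t" + e.getValue()); // kw1 42, kw4 25, kw3 19
        }
    }
}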
Complete code for the keyword frequency count:
Main.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class Main {
    public static void main(String[] args) throws Exception {
        countWords(); // 30 most frequently searched keywords
        countUrls();  // 10 most visited URLs and their share of visits
    }

    private static void countWords() throws Exception {
        String inputDir = "./data/sogou.full.utf8"; // input
        String outputDir = "./result/words";        // output
        Configuration conf = new Configuration();

        // Remove any previous output directory; the job fails if it already exists.
        FileSystem fs = FileSystem.get(conf);
        Path outputPath = new Path(outputDir);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf, "CountWords");
        job.setJarByClass(Main.class);
        job.setMapperClass(CountWordMapper.class);
        job.setReducerClass(CountWordReducer.class);
        // A single reduce task is required so the Top-K is global, not per-reducer.
        job.setNumReduceTasks(1);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.setInputPaths(job, new Path(inputDir));
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, outputPath);
        job.waitForCompletion(true);
    }
}
CountWordMapper.java
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CountWordMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Local hash table holding each keyword and its count for this input split.
    private final Map<String, Long> map = new HashMap<>();

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the record into fields on '\t'.
        String[] fields = value.toString().split("\t");
        if (fields.length != 6) {
            return; // skip malformed records
        }
        String keyWord = fields[2]; // the query keyword is the third field
        // Increment the keyword's count, starting from 0 if it has not been seen yet.
        long count = map.getOrDefault(keyWord, 0L);
        map.put(keyWord, count + 1);
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit this split's keyword counts to the reducer once the split is processed.
        for (Map.Entry<String, Long> entry : map.entrySet()) {
            context.write(new Text(entry.getKey()), new LongWritable(entry.getValue()));
        }
    }
}
CountWordReducer.java
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import javafx.util.Pair; // any Pair class with getKey()/getValue() would work here

public class CountWordReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    private static final int K = 30; // keep the K most frequent keywords
    // Min-heap of capacity K, ordered by frequency, so the least frequent
    // of the current Top-K can be evicted in O(log K).
    private final PriorityQueue<Pair<String, Long>> minHeap =
            new PriorityQueue<>((p1, p2) -> Long.compare(p1.getValue(), p2.getValue()));

    // Called once per keyword with the partial counts collected from all mappers.
    @Override
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0;
        for (LongWritable count : values) {
            // Sum this keyword's partial counts from every mapper.
            total += count.get();
        }
        minHeap.add(new Pair<>(key.toString(), total)); // insert the keyword's total frequency
        if (minHeap.size() > K) {
            minHeap.poll(); // evict the smallest element, keeping only the Top-K
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Drain the heap into a list so it can be sorted.
        List<Pair<String, Long>> list = new ArrayList<>(minHeap);
        // Sort the Top-K keywords by frequency, descending.
        list.sort((p1, p2) -> Long.compare(p2.getValue(), p1.getValue()));
        // Reducer output: the Top-K keywords ordered by search frequency.
        for (Pair<String, Long> t : list) {
            context.write(new Text(t.getKey()), new LongWritable(t.getValue()));
        }
    }
}