上一篇博客写了如何统计单词出现的个数,也写了如何统计单词首字母出现的个数。但在统计首字母时,很多单词是重复的。如何在对单词去重之后,再统计单词首字母出现的次数呢?本文在统计去重后(不同)单词的首字母个数的同时,也统计全部单词的首字母个数。例如:
initials total unique
a 100 20
思路是首先对单词进行分区,这个分区可以不写,默认对键值进行分区。分区之后相同的单词会被分到一起,例如输入是
a head hadoop end hive student end and am a hadoop
分组之后是
a a
head
hadoop hadoop
hive
student
end end
and
am
在第一个job的reduce中统计每个单词出现了多少次
第二个job用第一个job的输出作为mapper的输入。此时的数据如下(为便于阅读,这里省略了代码中实际写在次数前面的逗号分隔符):
a 2
head 1
hadoop 2
hive 1
student 1
end 2
and 1
am 1
在第二个job中对单词首字母进行分区,将相同首字母的单词分到一起。代码可参考对单词进行分区的写法,不同的是:先取出单词的首字母,用首字母的hashCode对分区数取余,并且在分组比较时只比较单词的首字母。
分区之后的结果如下
a 2
and 1
am 1
end 2
head 1
hadoop 2
hive 1
student 1
此时在reducer中,同一首字母下的每条记录对应一个不同的单词,因此不同单词的个数就是values的条数(每条记1);单词出现的总次数则是把每个单词后面的数字累加起来。
如果还需要统计单词出现次数的分布,比如出现一次的单词有多少个,出现两次、三次的单词各有多少个……
可以使用一个一维数组,初始化为零,以a[次数-1]++的形式,把每个单词按其出现次数计入数组对应的桶内(下面的代码把出现10次及以上的单词都计入最后一个桶)。
代码如下:
MapReduce默认会在mapper的输出写入磁盘之前,按照键对其进行分区和排序,这一过程称作shuffle。如果想自己指定按什么进行分区和分组,需要自己定义Partitioner和WritableComparator。
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitioning and grouping helpers for the first job, which brings all
 * occurrences of the same word together.
 */
public class GroupByWord extends Configured {

    /**
     * Assigns a word to a reduce partition based on the hash code of the whole key.
     */
    public static class KeyPartitioner extends Partitioner<Text, NullWritable> {
        /**
         * @param key   the word emitted by the mapper
         * @param value unused placeholder value
         * @param i     the number of partitions
         * @return a partition index in the range {@code [0, i)}
         */
        @Override
        public int getPartition(Text key, NullWritable value, int i) {
            // Clear the sign bit instead of calling Math.abs: Math.abs(Integer.MIN_VALUE)
            // is still negative and would yield an invalid (negative) partition index.
            return (key.hashCode() & Integer.MAX_VALUE) % i;
        }
    }

    /**
     * Groups reducer input by comparing the complete word.
     */
    public static class GroupComparator extends WritableComparator {
        protected GroupComparator() {
            // 'true' asks the base class to create key instances for deserialized comparison.
            super(Text.class, true);
        }

        /**
         * Compares two keys as {@link Text} values.
         *
         * @return negative, zero or positive following {@link Text#compareTo}
         */
        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            Text left = (Text) a;
            Text right = (Text) b;
            return left.compareTo(right);
        }
    }
}
job1 mapper
package wordcount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper of the first job: splits each input line on runs of spaces and emits
 * every word as a key with a {@link NullWritable} value.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    /** Reused output key, so no new object is allocated per word. */
    private final Text outKey = new Text();

    /**
     * @param key     input key supplied by the input format (not used)
     * @param value   one line of input text
     * @param context sink for the emitted {@code (word, null)} pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        for (String word : value.toString().split("[ ]+")) {
            // A blank line or a line starting with spaces makes split() return an
            // empty first token; emitting it would count "" as a word.
            if (word.isEmpty()) {
                continue;
            }
            outKey.set(word);
            context.write(outKey, NullWritable.get());
        }
    }
}
job1 reducer
package wordcount;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer of the first job: counts how many times each word was emitted and
 * writes the word with the value {@code ",<count>"}.
 */
public class WordCountReducer extends Reducer<Text, NullWritable, Text, Text> {
    /** Reused output value. */
    private final Text outValue = new Text();

    /**
     * @param key     the word
     * @param value   one placeholder element per occurrence of the word
     * @param context sink for the {@code (word, ",<count>")} pair
     */
    @Override
    protected void reduce(Text key, Iterable<NullWritable> value, Context context)
            throws IOException, InterruptedException {
        // long rather than int so that a very frequent word cannot overflow the counter.
        long count = 0;
        for (NullWritable ignored : value) {
            // The elements carry no data; only their number matters. The per-element
            // debug print of the key was removed because it ran once per occurrence.
            count++;
        }
        // The leading comma is part of the emitted value and must be kept as-is.
        outValue.set("," + count);
        context.write(key, outValue);
    }
}
job2 mapper
package UniqueWordCount;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Mapper of the second job: parses lines of the form {@code <word>,<count>}
 * and emits {@code (word, count)}.
 */
public class UniqueWordCountMapper extends Mapper<LongWritable, Text, Text, Text> {
    private static final String COUNTER_GROUP = "UniqueWordCount";
    private static final String MALFORMED_LINES = "MALFORMED_INPUT_LINES";

    /** Reused output key. */
    private final Text outKey = new Text();
    /** Reused output value. */
    private final Text outValue = new Text();

    /**
     * @param key     input key supplied by the input format (not used)
     * @param value   one input line, expected to end in {@code ,<count>}
     * @param context sink for the emitted {@code (word, count)} pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Split on the LAST comma: the count is always the final field, while the word
        // itself may contain commas (e.g. "end,"), which broke the previous split(",").
        int separator = line.lastIndexOf(',');
        if (separator < 0) {
            context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
            return;
        }
        // trim() drops any whitespace that surrounds the fields, such as a key/value
        // separator written between the word and the comma.
        String word = line.substring(0, separator).trim();
        String count = line.substring(separator + 1).trim();
        if (word.isEmpty() || !isDigits(count)) {
            // Skip records the reducer could not parse, but keep them visible in the counters.
            context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
            return;
        }
        outKey.set(word);
        outValue.set(count);
        context.write(outKey, outValue);
    }

    /** Returns true when {@code text} is non-empty and consists only of ASCII digits. */
    private static boolean isDigits(String text) {
        if (text.isEmpty()) {
            return false;
        }
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c < '0' || c > '9') {
                return false;
            }
        }
        return true;
    }
}
package UniqueWordCount;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitioning and grouping helpers for the second job, which brings together
 * all words that start with the same character.
 */
public class GroupByInitials extends Configured {

    /**
     * Returns the first character of {@code text} as a string; a string of
     * length zero or one is returned unchanged.
     */
    private static String firstCharOf(String text) {
        return text.length() > 1 ? text.substring(0, 1) : text;
    }

    /**
     * Chooses the partition from the hash code of the key's first character.
     */
    public static class KeyPartitioner extends Partitioner<Text, Text> {
        /**
         * @param key   the word
         * @param value the value paired with the word (not used)
         * @param i     the number of partitions
         * @return the partition index derived from the word's first character
         */
        @Override
        public int getPartition(Text key, Text value, int i) {
            int hash = firstCharOf(key.toString()).hashCode();
            return Math.abs(hash) % i;
        }
    }

    /**
     * Treats two keys as belonging to the same group when their first
     * characters are equal.
     */
    public static class GroupComparator extends WritableComparator {
        protected GroupComparator() {
            super(Text.class, true);
        }

        /**
         * Compares only the first character of each key's string form.
         */
        @SuppressWarnings("rawtypes")
        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            String leftInitial = firstCharOf(a.toString());
            String rightInitial = firstCharOf(b.toString());
            return leftInitial.compareTo(rightInitial);
        }
    }
}
package UniqueWordCount;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer of the second job. For one group of words it emits the first
 * character of the key together with
 * {@code "<total> <unique> 1-><n1> 2-><n2> ... 10-><n10> "}, where
 * {@code total} is the sum of all counts, {@code unique} is the number of
 * values seen, and the histogram says how many words occurred exactly k times
 * (the last bucket collects every count of 10 or more).
 */
public class UniqueWordCountReducer extends Reducer<Text, Text, Text, Text> {
    /** Number of histogram buckets; the last one is the overflow bucket. */
    private static final int BUCKETS = 10;

    /** Reused output key. */
    private final Text outKey = new Text();
    /** Reused output value. */
    private final Text outValue = new Text();

    /**
     * @param key     a word of the current group
     * @param value   the occurrence counts of the words in the group, as decimal text
     * @param context sink for the {@code (initial, summary)} pair
     * @throws IOException if a value is not a positive integer
     */
    @Override
    protected void reduce(Text key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        int[] fre = new int[BUCKETS];
        // long totals so that large inputs cannot overflow.
        long count = 0;
        long uniqueCount = 0;
        for (Text v : value) {
            // Parse once per value instead of three times.
            int occurrences = parseOccurrences(key, v);
            count += occurrences;
            uniqueCount++;
            // Counts of BUCKETS or more all land in the last bucket.
            fre[Math.min(occurrences, BUCKETS) - 1]++;
        }

        StringBuilder freValue = new StringBuilder(" ");
        for (int i = 0; i < BUCKETS; i++) {
            freValue.append(i + 1).append("->").append(fre[i]).append(' ');
        }
        outValue.set(count + " " + uniqueCount + freValue);

        String word = key.toString();
        // Only the first character is emitted; an empty key is passed through unchanged.
        outKey.set(word.isEmpty() ? word : word.substring(0, 1));
        context.write(outKey, outValue);
    }

    /**
     * Parses one occurrence count, rejecting anything that is not a positive
     * integer so that the histogram index can never be negative.
     */
    private static int parseOccurrences(Text key, Text raw) throws IOException {
        String text = raw.toString().trim();
        int occurrences;
        try {
            occurrences = Integer.parseInt(text);
        } catch (NumberFormatException e) {
            throw new IOException(
                    "Count for key '" + key + "' is not an integer: '" + text + "'", e);
        }
        if (occurrences < 1) {
            throw new IOException(
                    "Count for key '" + key + "' must be positive but was " + occurrences);
        }
        return occurrences;
    }
}
import UniqueWordCount.GroupByInitials;
import UniqueWordCount.UniqueWordCountMapper;
import UniqueWordCount.UniqueWordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import wordcount.GroupByWord;
import wordcount.WordCountMapper;
import wordcount.WordCountReducer;
import java.io.IOException;
/**
 * Driver that chains two MapReduce jobs: the first counts the occurrences of
 * every word, the second reads that output and aggregates it per initial.
 */
public class WordJob extends Configured implements Tool {
    /** Container memory, in MB, requested for map and reduce tasks. */
    private static final int TASK_MEMORY_MB = 4096;

    /**
     * Runs both jobs in sequence.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path of the
     *             first job (and input of the second), {@code args[2]} output
     *             path of the second job
     * @return 0 when both jobs succeed, 1 when a job fails, 2 on a usage error
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 3) {
            System.err.println(
                    "Usage: WordJob <input> <word count output> <initials output>");
            return 2;
        }
        // Input and output paths of the two jobs.
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        Path uniqueOut = new Path(args[2]);

        if (getConf() == null) {
            // Allows run() to be called directly, without ToolRunner having set a configuration.
            setConf(new Configuration());
        }
        // Apply the settings to the configuration the jobs are actually created from.
        // Previously they were set on a separate Configuration object that was never
        // handed to any job, so they had no effect.
        applyTaskSettings(getConf());

        // The second job consumes the first job's output, so it only runs after success.
        if (!step1(in, out)) {
            return 1;
        }
        return step2(out, uniqueOut) ? 0 : 1;
    }

    /** Writes the task-level settings into {@code conf}. */
    private static void applyTaskSettings(Configuration conf) {
        String heapOption = String.format("-Xmx%sm", (int) (0.8 * TASK_MEMORY_MB));
        conf.set("hbase.client.keyvalue.maxsize", "-1");
        conf.setInt("mapreduce.job.jvm.numtasks", -1);
        conf.set("mapreduce.map.memory.mb", String.valueOf(TASK_MEMORY_MB));
        conf.set("mapreduce.map.java.opts", heapOption);
        conf.set("mapreduce.reduce.memory.mb", String.valueOf(TASK_MEMORY_MB));
        conf.set("mapreduce.reduce.java.opts", heapOption);
    }

    /**
     * Configures and runs the word count job.
     *
     * @return true if the job completed successfully
     */
    private boolean step1(Path in, Path out)
            throws IOException, InterruptedException, ClassNotFoundException {
        Job job = Job.getInstance(getConf(), "word count job");
        job.setJarByClass(WordJob.class);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setPartitionerClass(GroupByWord.KeyPartitioner.class);
        job.setGroupingComparatorClass(GroupByWord.GroupComparator.class);
        // The map output value type differs from the final output value type.
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        return job.waitForCompletion(true);
    }

    /**
     * Configures and runs the per-initial aggregation job on the first job's output.
     *
     * @return true if the job completed successfully
     */
    private boolean step2(Path out, Path uniqueOut)
            throws IOException, InterruptedException, ClassNotFoundException {
        Job uniqueWordCountJob = Job.getInstance(getConf(), "unique word count job");
        uniqueWordCountJob.setJarByClass(WordJob.class);
        FileInputFormat.setInputPaths(uniqueWordCountJob, out);
        FileOutputFormat.setOutputPath(uniqueWordCountJob, uniqueOut);
        uniqueWordCountJob.setMapperClass(UniqueWordCountMapper.class);
        uniqueWordCountJob.setReducerClass(UniqueWordCountReducer.class);
        uniqueWordCountJob.setPartitionerClass(GroupByInitials.KeyPartitioner.class);
        uniqueWordCountJob.setGroupingComparatorClass(GroupByInitials.GroupComparator.class);
        uniqueWordCountJob.setOutputKeyClass(Text.class);
        uniqueWordCountJob.setOutputValueClass(Text.class);
        return uniqueWordCountJob.waitForCompletion(true);
    }

    /**
     * Command-line entry point; the process exit code is the value returned by
     * {@link #run(String[])}.
     *
     * @param args input path, first-job output path, second-job output path
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Propagate the result so that shell scripts and schedulers can detect failures.
        System.exit(ToolRunner.run(new WordJob(), args));
    }
}