MapReduce功能实现系列:
MapReduce功能实现一—Hbase和Hdfs之间数据相互转换
MapReduce功能实现二—排序
MapReduce功能实现三—Top N
MapReduce功能实现四—小综合(从hbase中读取数据统计并在hdfs中降序输出Top 3)
MapReduce功能实现五—去重(Distinct)、计数(Count)
MapReduce功能实现六—最大值(Max)、求和(Sum)、平均值(Avg)
MapReduce功能实现七—小综合(多个job串行处理计算平均值)
MapReduce功能实现八—分区(Partition)
MapReduce功能实现九—Pv、Uv
MapReduce功能实现十—倒排索引(Inverted Index)
MapReduce功能实现十一—join
一、去重
类似于db中的select distinct(x) from table,去重处理甚至比WordCount还要简单:
[hadoop@h71 q1]$ vi hello.txt
hello world
hello hadoop
hello hive
hello hadoop
hello world
hello world
[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/dedup_in
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/dedup_in
java代码:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Dedup {
public static class RemoveDupMapper extends Mapper<Object, Text, Text, NullWritable> {
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
context.write(value, NullWritable.get());
}
}
public static class RemoveDupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
public void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("mapred.jar","Dedup.jar"); //去掉这行也能运行,目前还不知道这行有什么用
String[] ioArgs=new String[]{"dedup_in","dedup_out"};
String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: Data Deduplication <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "Data Deduplication");
job.setJarByClass(Dedup.class);
//设置Map、Combine和Reduce处理类
job.setMapperClass(RemoveDupMapper.class);
job.setCombinerClass(RemoveDupReducer.class);
job.setReducerClass(RemoveDupReducer.class);
//设置输出类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//设置输入和输出目录
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
在Linux中运行代码:
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac Dedup.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar Dedup*class
[hadoop@h71 q1]$ hadoop jar xx.jar Dedup
查看结果:
[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/dedup_out/part-r-00000
hello hadoop
hello hive
hello world
二、计数器的使用
[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/mapinput
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/mapinput
java代码:
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* mapreduce中计数器的使用
*/
/**
 * Demonstrates a user-defined MapReduce counter: counts occurrences of the
 * word "hello" and also increments a custom counter ("Sensitive"/"hello")
 * once per input line that contains the word.
 */
public class WordCountApp {
    private static final String INPUT_PATH = "hdfs://h71:9000/user/hadoop/mapinput";
    private static final String OUTPUT_PATH = "hdfs://h71:9000/user/hadoop/mapoutput";

    public static void main(String[] args) throws IOException, URISyntaxException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapred.jar", "wcapp.jar");
        // Delete any stale output directory so the job does not abort on an existing path.
        final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
        fileSystem.delete(new Path(OUTPUT_PATH), true);
        // Job.getInstance replaces the deprecated new Job(Configuration, String) constructor.
        final Job job = Job.getInstance(conf, WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        // Propagate job success/failure through the process exit code
        // (the original discarded waitForCompletion's result).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Tokenizes each line; emits ("hello", 1) for every "hello" token and
     * bumps the custom counter once per line that contains the word.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            final String line = value.toString();
            final StringTokenizer tokenizer = new StringTokenizer(line);
            final Counter counter = context.getCounter("Sensitive", "hello");
            // Note: increments once per LINE containing "hello", not once per occurrence.
            if (line.contains("hello")) {
                counter.increment(1L);
            }
            while (tokenizer.hasMoreTokens()) {
                String target = tokenizer.nextToken();
                if (target.equals("hello")) { // only the word "hello" is counted in the output
                    context.write(new Text(target), new LongWritable(1));
                }
            }
        }
    }

    /** Sums the per-token counts for each key. */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long times = 0L;
            // Iterate the Iterable once; the original called values.iterator()
            // on every loop pass, which with a standard Iterable restarts
            // iteration and never terminates — it only worked because Hadoop
            // happens to hand back the same iterator instance each time.
            for (LongWritable value : values) {
                times += value.get();
            }
            context.write(key, new LongWritable(times));
        }
    }
}
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac WordCountApp.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar WordCountApp*class
[hadoop@h71 q1]$ hadoop jar xx.jar WordCountApp
在控制台打印的信息中你会看到:
Sensitive
hello=6
[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/mapoutput/part-r-00000
hello 6
三、记录计数(Count)
[hadoop@h71 q1]$ vi ceshi.txt
2
8
8
3
2
3
5
3
0
2
7
[hadoop@h71 q1]$ hadoop fs -put ceshi.txt /input
这个跟WordCount略有不同,类似于Select Count(*) from table的效果,代码也超级简单,直接拿WordCount改一改就行了:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class RowCount {
public static class RowCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private final static Text countKey = new Text("count");
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
context.write(countKey, one);
}
}
public static class RowCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: RowCount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(RowCount.class);
job.setMapperClass(RowCountMapper.class);
job.setCombinerClass(RowCountReducer.class);
job.setReducerClass(RowCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac RowCount.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar RowCount*class
[hadoop@h71 q1]$ hadoop jar xx.jar RowCount /input/ceshi.txt /output
[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
count 11
注:如果只想输出一个数字,不需要"count"这个key,可以改进一下:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
public class RowCount2 {
public static class RowCount2Mapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
public long count = 0;
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
count += 1;
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new LongWritable(count), NullWritable.get());
}
}
public static class RowCount2Reducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
public long count = 0;
public void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
count += key.get();
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new LongWritable(count), NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: FindMax <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "RowCount2");
job.setJarByClass(RowCount2.class);
job.setMapperClass(RowCount2Mapper.class);
job.setCombinerClass(RowCount2Reducer.class);
job.setReducerClass(RowCount2Reducer.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(NullWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
11
这样输出结果就只有一个数字11了。
注意:这里context.write(xxx)只能写在cleanup方法中,该方法在Mapper和Reducer接口中都有,在map方法及reduce方法执行完后,会触发cleanup方法,大家可以尝试下,把context.write(xxx)写在map和reduce方法中试试看,结果会出现多行记录,而不是预期的仅1个数字。