MapReduce Function Implementation, Part 5: Deduplication (Distinct) and Counting (Count)

The MapReduce Function Implementation series:
      Part 1: Converting Data Between HBase and HDFS
      Part 2: Sorting
      Part 3: Top N
      Part 4: A Small Exercise (Read Data from HBase, Aggregate It, and Output the Top 3 to HDFS in Descending Order)
      Part 5: Deduplication (Distinct) and Counting (Count)
      Part 6: Maximum (Max), Sum, Average (Avg)
      Part 7: A Small Exercise (Chaining Multiple Jobs to Compute an Average)
      Part 8: Partitioning (Partition)
      Part 9: PV and UV
      Part 10: Inverted Index (Inverted Index)
      Part 11: Join
 

1. Deduplication

Analogous to SELECT DISTINCT(x) FROM table in a database, deduplication is even simpler than WordCount: the mapper emits each whole line as the key (with a NullWritable value), and the reducer writes each key once.

[hadoop@h71 q1]$ vi hello.txt
hello world
hello hadoop
hello hive
hello hadoop
hello world
hello world
[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/dedup_in
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/dedup_in

Java code:

import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
 
public class Dedup {
 
    public static class RemoveDupMapper extends Mapper<Object, Text, Text, NullWritable> {
        // emit the whole line as the key; duplicate lines collapse into one key group in the shuffle
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }
 
    public static class RemoveDupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        // each distinct line arrives exactly once as a key; write it out once
        public void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
 
    public static void main(String[] args) throws Exception{
        Configuration conf = new Configuration();
        conf.set("mapred.jar","Dedup.jar");   //去掉这行也能运行,目前还不知道这行有什么用
        String[] ioArgs=new String[]{"dedup_in","dedup_out"};
        String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
        if (otherArgs.length != 2) {
        	System.err.println("Usage: Data Deduplication <in> <out>");
        	System.exit(2);
        }
        
        Job job = Job.getInstance(conf, "Data Deduplication");
        job.setJarByClass(Dedup.class);
 
        // set the Mapper, Combiner and Reducer classes; the Reducer doubles as the
        // Combiner here, which is safe because its input and output types match
        job.setMapperClass(RemoveDupMapper.class);
        job.setCombinerClass(RemoveDupReducer.class);
        job.setReducerClass(RemoveDupReducer.class);
 
        // set the output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
 
        // set the input and output directories
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Compile and run it on Linux:

[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac Dedup.java 
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar Dedup*class
[hadoop@h71 q1]$ hadoop jar xx.jar Dedup

Check the result:

[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/dedup_out/part-r-00000
hello hadoop
hello hive
hello world
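
Conceptually the job relies on the shuffle: every identical line becomes a single key group, and the reducer writes each group's key exactly once. For intuition, here is a minimal local sketch of the same idea in plain Java (an illustration only, assuming the hello.txt from above fits in memory):

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.TreeSet;

public class LocalDedup {
    public static void main(String[] args) throws IOException {
        // a TreeSet keeps one sorted copy of each line -- locally, this is
        // what shuffle + reduce accomplish at cluster scale
        List<String> lines = Files.readAllLines(Paths.get("hello.txt"), Charset.forName("UTF-8"));
        TreeSet<String> unique = new TreeSet<String>(lines);
        for (String line : unique) {
            System.out.println(line);
        }
    }
}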

2. Using Counters

[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/mapinput
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/mapinput

Java code:

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
/**
 * Using counters in MapReduce
 */
public class WordCountApp {
 
	private static final String INPUT_PATH = "hdfs://h71:9000/user/hadoop/mapinput";
	private static final String OUTPUT_PATH = "hdfs://h71:9000/user/hadoop/mapoutput";
	
	public static void main(String[] args) throws IOException, URISyntaxException, 
	      ClassNotFoundException, InterruptedException {
		Configuration conf = new Configuration();
		conf.set("mapred.jar","wcapp.jar");		
 
		final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
		fileSystem.delete(new Path(OUTPUT_PATH), true);
		
		final Job job = new Job(conf, WordCountApp.class.getSimpleName());
		job.setJarByClass(WordCountApp.class);
		
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);
		
		job.setReducerClass(MyReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);
		FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
		
		job.waitForCompletion(true);
	}
 
	public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			final String line = value.toString();
			StringTokenizer tokenizer = new StringTokenizer(line);
			final Counter counter = context.getCounter("Sensitive", "hello");
			if (line.contains("hello")) {
				counter.increment(1L);   // bump the counter for every line that contains "hello"
			}
			while (tokenizer.hasMoreTokens()) {
				String target = tokenizer.nextToken();
				if (target.equals("hello")) {   // only emit counts for the word "hello"
					context.write(new Text(target), new LongWritable(1));
				}
			}
		}
	}
	
	public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
		@Override
		protected void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			long times = 0L;
			for (LongWritable count : values) {   // sum the 1s emitted for this word
				times += count.get();
			}
			context.write(key, new LongWritable(times));
		}
	}
}
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac WordCountApp.java 
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar WordCountApp*class
[hadoop@h71 q1]$ hadoop jar xx.jar WordCountApp

In the console output of the job you will see:

        Sensitive
                hello=6
[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/mapoutput/part-r-00000
hello   6
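
The counter can also be read programmatically once the job finishes, instead of scraping the console. A minimal sketch, assuming the fragment replaces the job.waitForCompletion(true) call at the end of main():

		boolean ok = job.waitForCompletion(true);
		// look up the custom counter by the same group and name used in the mapper
		Counter helloCounter = job.getCounters().findCounter("Sensitive", "hello");
		System.out.println("lines containing hello = " + helloCounter.getValue());
		System.exit(ok ? 0 : 1);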

3. Record Count (Count)

[hadoop@h71 q1]$ vi ceshi.txt
2
8
8
3
2
3
5
3
0
2
7
[hadoop@h71 q1]$ hadoop fs -put ceshi.txt /input

This one differs slightly from WordCount: it produces the effect of SELECT COUNT(*) FROM table. The code is just as simple; take WordCount and tweak it:

import java.io.IOException;
 
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
 
public class RowCount {
	
    public static class RowCountMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private final static Text countKey = new Text("count");
        // every input record contributes a 1 under the single key "count"
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
                context.write(countKey, one);
        }
    }
 
    public static class RowCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
 
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: RowCount <in> [<in>...] <out>");
            System.exit(2);
        }
 
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(RowCount.class);
        job.setMapperClass(RowCountMapper.class);
        job.setCombinerClass(RowCountReducer.class);
        job.setReducerClass(RowCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job,
                new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac RowCount.java 
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar RowCount*class
[hadoop@h71 q1]$ hadoop jar xx.jar RowCount /input/ceshi.txt /output

[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
count   11
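
As an aside, the framework already counts input records in its built-in counters, so the same total is available without any custom key. A hedged sketch, assuming these lines are added to main() after the job has completed:

        // TaskCounter.MAP_INPUT_RECORDS is maintained by the framework for every job
        long rows = job.getCounters()
                .findCounter(org.apache.hadoop.mapreduce.TaskCounter.MAP_INPUT_RECORDS)
                .getValue();
        System.out.println("rows = " + rows);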

Note: if you want to output only a single number, without the "count" key, you can refine the job as follows:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
 
import java.io.IOException;
 
public class RowCount2 {
 
    public static class RowCount2Mapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
        public long count = 0;
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            count += 1;   // just tally the records; nothing is emitted per record
        }
        // cleanup() runs once after the last map() call; emit this task's subtotal
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(count), NullWritable.get());
        }
    }
 
    public static class RowCount2Reducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
        public long count = 0;
        public void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            count += key.get();   // the subtotals arrive as keys; add them up
        }
        // cleanup() runs once after the last reduce() call; emit the grand total
        protected void cleanup(Context context) throws IOException, InterruptedException {
            context.write(new LongWritable(count), NullWritable.get());
        }
    }
 
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length < 2) {
            System.err.println("Usage: FindMax <in> [<in>...] <out>");
            System.exit(2);
        }
 
        Job job = Job.getInstance(conf, "RowCount2");
        job.setJarByClass(RowCount2.class);
        job.setMapperClass(RowCount2Mapper.class);
        job.setCombinerClass(RowCount2Reducer.class);
        job.setReducerClass(RowCount2Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);
 
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job,
                new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
11

This way the output is just the single number 11.

Note: the context.write(...) calls here must live in the cleanup method. Both Mapper and Reducer define cleanup(), and it is invoked once after all of a task's map() or reduce() calls have finished. Try moving context.write(...) into map() and reduce() instead: you will get many output rows rather than the single expected number.
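
For reference, the lifecycle that makes this work is visible in Mapper.run(), which drives each task roughly as follows (simplified from the Hadoop source; Reducer.run() is analogous):

	public void run(Context context) throws IOException, InterruptedException {
		setup(context);
		while (context.nextKeyValue()) {
			map(context.getCurrentKey(), context.getCurrentValue(), context);
		}
		cleanup(context);   // runs exactly once per task, after the last map() call
	}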
