MapReduce功能实现系列:
MapReduce功能实现一—Hbase和Hdfs之间数据相互转换
MapReduce功能实现二—排序
MapReduce功能实现三—Top N
MapReduce功能实现四—小综合(从hbase中读取数据统计并在hdfs中降序输出Top 3)
MapReduce功能实现五—去重(Distinct)、计数(Count)
MapReduce功能实现六—最大值(Max)、求和(Sum)、平均值(Avg)
MapReduce功能实现七—小综合(多个job串行处理计算平均值)
MapReduce功能实现八—分区(Partition)
MapReduce功能实现九—Pv、Uv
MapReduce功能实现十—倒排索引(Inverted Index)
MapReduce功能实现十一—join
一、去重
类似于db中的select distinct(x) from table,去重处理甚至比WordCount还要简单:
[hadoop@h71 q1]$ vi hello.txt
hello world
hello hadoop
hello hive
hello hadoop
hello world
hello world
[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/dedup_in
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/dedup_in
java代码:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class Dedup {
public static class RemoveDupMapper extends Mapper<Object, Text, Text, NullWritable> {
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
context.write(value, NullWritable.get());
}
}
public static class RemoveDupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
public void reduce(Text key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("mapred.jar","Dedup.jar"); //去掉这行也能运行,目前还不知道这行有什么用
String[] ioArgs=new String[]{"dedup_in","dedup_out"};
String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: Data Deduplication <in> <out>");
System.exit(2);
}
Job job = new Job(conf, "Data Deduplication");
job.setJarByClass(Dedup.class);
//设置Map、Combine和Reduce处理类
job.setMapperClass(RemoveDupMapper.class);
job.setCombinerClass(RemoveDupReducer.class);
job.setReducerClass(RemoveDupReducer.class);
//设置输出类型
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//设置输入和输出目录
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
在Linux中运行代码:
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac Dedup.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar Dedup*class
[hadoop@h71 q1]$ hadoop jar xx.jar Dedup
查看结果:
[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/dedup_out/part-r-00000
hello hadoop
hello hive
hello world
二、计数器的使用
[hadoop@h71 q1]$ hadoop fs -mkdir /user/hadoop/mapinput
[hadoop@h71 q1]$ hadoop fs -put hello.txt /user/hadoop/mapinput
java代码:
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* mapreduce中计数器的使用
*/
/**
 * Demonstrates a user-defined MapReduce counter: counts occurrences of the
 * word "hello" and also increments a custom counter ("Sensitive"/"hello")
 * once per input line that contains the word.
 */
public class WordCountApp {
    private static final String INPUT_PATH = "hdfs://h71:9000/user/hadoop/mapinput";
    private static final String OUTPUT_PATH = "hdfs://h71:9000/user/hadoop/mapoutput";

    public static void main(String[] args) throws IOException, URISyntaxException,
            ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        conf.set("mapred.jar", "wcapp.jar");
        // Delete any stale output directory so the job does not abort on an existing path.
        final FileSystem fileSystem = FileSystem.get(new URI(OUTPUT_PATH), conf);
        fileSystem.delete(new Path(OUTPUT_PATH), true);
        // Job.getInstance replaces the deprecated new Job(Configuration, String) constructor.
        final Job job = Job.getInstance(conf, WordCountApp.class.getSimpleName());
        job.setJarByClass(WordCountApp.class);
        FileInputFormat.setInputPaths(job, INPUT_PATH);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        // Propagate job success/failure through the process exit code
        // (the original discarded waitForCompletion's result).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /**
     * Tokenizes each line; emits ("hello", 1) for every "hello" token and
     * bumps the custom counter once per line that contains the word.
     */
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            final String line = value.toString();
            final StringTokenizer tokenizer = new StringTokenizer(line);
            final Counter counter = context.getCounter("Sensitive", "hello");
            // Note: increments once per LINE containing "hello", not once per occurrence.
            if (line.contains("hello")) {
                counter.increment(1L);
            }
            while (tokenizer.hasMoreTokens()) {
                String target = tokenizer.nextToken();
                if (target.equals("hello")) { // only the word "hello" is counted in the output
                    context.write(new Text(target), new LongWritable(1));
                }
            }
        }
    }

    /** Sums the per-token counts for each key. */
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long times = 0L;
            // Iterate the Iterable once; the original called values.iterator()
            // on every loop pass, which with a standard Iterable restarts
            // iteration and never terminates — it only worked because Hadoop
            // happens to hand back the same iterator instance each time.
            for (LongWritable value : values) {
                times += value.get();
            }
            context.write(key, new LongWritable(times));
        }
    }
}
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac WordCountApp.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar WordCountApp*class
[hadoop@h71 q1]$ hadoop jar xx.jar WordCountApp
在控制台打印的信息中你会看到:
Sensitive
hello=6
[hadoop@h71 q1]$ hadoop fs -cat /user/hadoop/mapoutput/part-r-00000
hello 6
三、记录计数(Count)
[hadoop@h71 q1]$ vi ceshi.txt
2
8
8
3
2
3
5
3
0
2
7
[hadoop@h71 q1]$ hadoop fs -put ceshi.txt /input
这个跟WordCount略有不同,类似于Select Count(*) from table的效果,代码也超级简单,直接拿WordCount改一改就行了:
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class RowCount {
public static class RowCountMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private final static Text countKey = new Text("count");
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
context.write(countKey, one);
}
}
public static class RowCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: RowCount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count");
job.setJarByClass(RowCount.class);
job.setMapperClass(RowCountMapper.class);
job.setCombinerClass(RowCountReducer.class);
job.setReducerClass(RowCountReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/javac RowCount.java
[hadoop@h71 q1]$ /usr/jdk1.7.0_25/bin/jar cvf xx.jar RowCount*class
[hadoop@h71 q1]$ hadoop jar xx.jar RowCount /input/ceshi.txt /output
[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
count 11
注:如果只想输出一个数字,不需要"count"这个key,可以改进一下:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import java.io.IOException;
public class RowCount2 {
public static class RowCount2Mapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {
public long count = 0;
public void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
count += 1;
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new LongWritable(count), NullWritable.get());
}
}
public static class RowCount2Reducer extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {
public long count = 0;
public void reduce(LongWritable key, Iterable<NullWritable> values, Context context)
throws IOException, InterruptedException {
count += key.get();
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new LongWritable(count), NullWritable.get());
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: FindMax <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "RowCount2");
job.setJarByClass(RowCount2.class);
job.setMapperClass(RowCount2Mapper.class);
job.setCombinerClass(RowCount2Reducer.class);
job.setReducerClass(RowCount2Reducer.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(NullWritable.class);
for (int i = 0; i < otherArgs.length - 1; ++i) {
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
FileOutputFormat.setOutputPath(job,
new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
[hadoop@h71 q1]$ hadoop fs -cat /output/part-r-00000
11
这样输出结果就只有一个数字11了。
注意:这里context.write(xxx)只能写在cleanup方法中,该方法在Mapper和Reducer接口中都有,在map方法及reduce方法执行完后,会触发cleanup方法,大家可以尝试下,把context.write(xxx)写在map和reduce方法中试试看,结果会出现多行记录,而不是预期的仅1个数字。