Thanks to the original author for writing this up so clearly.
http://blog.csdn.net/sn_zzy/article/details/43446027
Group By principle
Map phase
Combine the columns being grouped on into a single composite key.
Reduce phase
Count (or otherwise aggregate) the values that arrive under each composite key.
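As a minimal map-side sketch of that idea (the tab separator, the column positions and the class name are assumptions made for this illustration, not part of the original post), the columns being grouped on are concatenated into one composite key and emitted with a count of 1; a summing reducer such as the IntSumReducer in the full listing further down then yields the per-group counts.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class GroupByMapperSketch extends Mapper<Object, Text, Text, IntWritable> {
    private static final IntWritable one = new IntWritable(1);
    private final Text groupKey = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // assumed: tab-separated input, GROUP BY on the first two columns
        String[] cols = value.toString().split("\t");
        if (cols.length < 2) {
            return;
        }
        // combine the GROUP BY columns into a single composite key
        groupKey.set(cols[0] + "\u0001" + cols[1]);
        context.write(groupKey, one);
    }
}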
Distinct principle
select dealid, count(distinct uid) num from order group by dealid;
Map phase
Emit dealid+uid together as one composite key, and partition the shuffle on dealid only.
Reduce phase
Split the composite key apart again, keep dealid as the key, and count the distinct uids that arrive for each dealid.
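Here is a minimal sketch of that composite-key approach in MapReduce terms (the field positions, the \u0001 separator and the class names are assumptions made for this example): the mapper emits dealid+uid as one key, a custom partitioner routes rows to reducers by dealid only, and the reducer, which receives keys in sorted order, counts how many distinct uids arrive for each dealid.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

public class DistinctCountSketch {
    // Map: emit "dealid\u0001uid" as one composite key (input assumed tab-separated).
    public static class DistinctMapper extends Mapper<Object, Text, Text, NullWritable> {
        private final Text composite = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 2) {
                return;
            }
            composite.set(fields[0] + "\u0001" + fields[1]);
            context.write(composite, NullWritable.get());
        }
    }

    // Partition on dealid only, so every uid of a given dealid reaches the same reducer.
    public static class DealIdPartitioner extends Partitioner<Text, NullWritable> {
        @Override
        public int getPartition(Text key, NullWritable value, int numPartitions) {
            String dealId = key.toString().split("\u0001")[0];
            return (dealId.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    // The framework groups identical composite keys, so each reduce() call stands for
    // exactly one distinct (dealid, uid) pair; keys arrive sorted, so pairs for the
    // same dealid are adjacent and can be counted with a running total.
    public static class DistinctReducer extends Reducer<Text, NullWritable, Text, IntWritable> {
        private String currentDealId = null;
        private int distinctUids = 0;

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            String dealId = key.toString().split("\u0001")[0];
            if (currentDealId != null && !currentDealId.equals(dealId)) {
                context.write(new Text(currentDealId), new IntWritable(distinctUids));
                distinctUids = 0;
            }
            currentDealId = dealId;
            distinctUids++;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            if (currentDealId != null) {
                context.write(new Text(currentDealId), new IntWritable(distinctUids));
            }
        }
    }
}

In the driver one would also call job.setPartitionerClass(DealIdPartitioner.class) so the dealid-only routing takes effect; because the framework has already merged identical composite keys, each reduce() call represents exactly one distinct (dealid, uid) pair.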
package mapreducelearn;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class GroupByWordCount {
/**
* WritableComparable: classes that implement WritableComparable can be compared
* with each other, and every class used as a map/reduce key should implement it.
* (MapReduceBase, OutputCollector and Reporter belong to the old mapred API and
* are not used with the new-API Mapper/Reducer classes in this example.)
*
*/
public static class TokenizerMapper extends
Mapper<Object, Text, Text, IntWritable> {
/**
* LongWritable, IntWritable and Text are Hadoop wrapper classes for Java data
* types. They implement WritableComparable, so they can be serialized for data
* exchange in a distributed environment; think of them as stand-ins for long,
* int and String.
*/
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
* The map method turns a single input key/value pair into zero or more
* intermediate key/value pairs; the output types do not have to match the
* input types. In the old mapred API the pairs were emitted through
* OutputCollector.collect(k, v); with the new API used here they are emitted
* through the Context passed to map.
*/
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
/**
* Sample input:
*   c++ java hello
*   world java hello
*   you me too
* In the map phase each line arrives as the value, with its byte offset as the key:
*   0  c++ java hello
*   16 world java hello
*   34 you me too
*/
/**
* The loop below tokenizes each line and produces key/value pairs such as
*   c++ 1, java 1, hello 1, world 1, you 1, me 1, too 1
* (pre-aggregated per map() call by the HashMap); these pairs become the
* input of the reduce phase.
*/
// String[] str = value.toString().split("#");
// Pre-aggregate counts within this map() call. Use String keys: reusing the
// mutable Text field "word" as a HashMap key would corrupt the map, because
// every entry would end up sharing one object.
HashMap<String, Integer> hashmap = new HashMap<String, Integer>();
StringTokenizer itr = new StringTokenizer(value.toString());
// System.out.println("value: " + value.toString());
// System.out.println("key: " + key.toString());
while (itr.hasMoreTokens()) {
String token = itr.nextToken();
if (hashmap.containsKey(token)) {
hashmap.put(token, hashmap.get(token) + 1);
} else {
hashmap.put(token, 1);
}
}
for (String tokenKey : hashmap.keySet()) {
word.set(tokenKey);
context.write(word, new IntWritable(hashmap.get(tokenKey)));
}
}
}
static class UserAndPostWritable implements Writable{
/**
* Example of a custom Writable (not used by this word-count job):
* type "U" marks a user record, "P" marks a post record.
*/
private String type;
private String data;
public UserAndPostWritable()
{
}
public UserAndPostWritable(String type, String data)
{
super();
this.type = type;
this.data = data;
}
public String getType()
{
return type;
}
public void setType(String type)
{
this.type = type;
}
public String getData()
{
return data;
}
public void setData(String data)
{
this.data = data;
}
@Override
public void readFields(DataInput input) throws IOException
{
type = input.readUTF();
data = input.readUTF();
}
@Override
public void write(DataOutput output) throws IOException
{
output.writeUTF(type);
output.writeUTF(data);
}
}
public static class IntSumReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
/**
* Before reduce runs, the framework groups the map output by key, so the
* reducer receives data of the form:
* (c++ [1]) (java [1,1]) (hello [1,1]) (world [1]) (you [1]) (me [1]) (too [1])
*
*/
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
/**
* The reduce method below sums the counts for each key and writes results
* such as: c++ 1 hello 2 java 2 me 1 too 1 world 1 you 1
*
*/
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
/**
* The job configuration describes the map-reduce work to the Hadoop framework.
* (The old mapred API used JobConf and constructors such as JobConf() or
* JobConf(Configuration conf); the new API used here configures everything
* through Configuration and Job.)
*/
Configuration conf = new Configuration();
// System.setProperty("hadoop.home.dir",
// "D:/linux/hadoop-2.6.4/hadoop-2.6.4");
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count"); // create the job and give it a name
job.setJarByClass(GroupByWordCount.class);
job.setMapperClass(TokenizerMapper.class); // set the Mapper class for the job
job.setCombinerClass(IntSumReducer.class); // set the Combiner class (reuses the reducer)
job.setReducerClass(IntSumReducer.class); // set the Reducer class
job.setOutputKeyClass(Text.class); // set the output key type
job.setOutputValueClass(IntWritable.class); // set the output value type
for (int i = 0; i < otherArgs.length - 1; ++i) {
// every argument except the last is an input path
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// the last argument is the output path
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
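Assuming the class above is packaged into a jar (the jar name and paths below are placeholders), the job is submitted with one or more input directories followed by the output directory:

hadoop jar groupby-wordcount.jar mapreducelearn.GroupByWordCount /user/demo/input /user/demo/output

Summing is associative and commutative, so reusing IntSumReducer as the combiner is safe here; together with the in-map HashMap aggregation it cuts down the amount of data shuffled from map to reduce.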