Thanks to the original author for writing this up so clearly.
http://blog.csdn.net/sn_zzy/article/details/43446027
Group By principle
Map phase
Combine the columns being grouped on into a single composite key.
Reduce phase
Count (or otherwise aggregate) the values that arrive under each composite key.
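As a minimal map-side sketch of that idea (the tab separator, the column positions and the class name are assumptions made for this illustration, not part of the original post), the columns being grouped on are concatenated into one composite key and emitted with a count of 1; a summing reducer such as the IntSumReducer in the full listing further down then yields the per-group counts.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class GroupByMapperSketch extends Mapper<Object, Text, Text, IntWritable> {
    private static final IntWritable one = new IntWritable(1);
    private final Text groupKey = new Text();

    @Override
    protected void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        // assumed: tab-separated input, GROUP BY on the first two columns
        String[] cols = value.toString().split("\t");
        if (cols.length < 2) {
            return;
        }
        // combine the GROUP BY columns into a single composite key
        groupKey.set(cols[0] + "\u0001" + cols[1]);
        context.write(groupKey, one);
    }
}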
Distinct principle
select dealid, count(distinct uid) num from order group by dealid;
Map phase
Emit dealid+uid together as one composite key, and partition the shuffle on dealid only.
Reduce phase
Split the composite key apart again, keep dealid as the key, and count the distinct uids that arrive for each dealid.
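Here is a minimal sketch of that composite-key approach in MapReduce terms (the field positions, the \u0001 separator and the class names are assumptions made for this example): the mapper emits dealid+uid as one key, a custom partitioner routes rows to reducers by dealid only, and the reducer, which receives keys in sorted order, counts how many distinct uids arrive for each dealid.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

public class DistinctCountSketch {
    // Map: emit "dealid\u0001uid" as one composite key (input assumed tab-separated).
    public static class DistinctMapper extends Mapper<Object, Text, Text, NullWritable> {
        private final Text composite = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length < 2) {
                return;
            }
            composite.set(fields[0] + "\u0001" + fields[1]);
            context.write(composite, NullWritable.get());
        }
    }

    // Partition on dealid only, so every uid of a given dealid reaches the same reducer.
    public static class DealIdPartitioner extends Partitioner<Text, NullWritable> {
        @Override
        public int getPartition(Text key, NullWritable value, int numPartitions) {
            String dealId = key.toString().split("\u0001")[0];
            return (dealId.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    // The framework groups identical composite keys, so each reduce() call stands for
    // exactly one distinct (dealid, uid) pair; keys arrive sorted, so pairs for the
    // same dealid are adjacent and can be counted with a running total.
    public static class DistinctReducer extends Reducer<Text, NullWritable, Text, IntWritable> {
        private String currentDealId = null;
        private int distinctUids = 0;

        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            String dealId = key.toString().split("\u0001")[0];
            if (currentDealId != null && !currentDealId.equals(dealId)) {
                context.write(new Text(currentDealId), new IntWritable(distinctUids));
                distinctUids = 0;
            }
            currentDealId = dealId;
            distinctUids++;
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            if (currentDealId != null) {
                context.write(new Text(currentDealId), new IntWritable(distinctUids));
            }
        }
    }
}

In the driver one would also call job.setPartitionerClass(DealIdPartitioner.class) so the dealid-only routing takes effect; because the framework has already merged identical composite keys, each reduce() call represents exactly one distinct (dealid, uid) pair.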
package mapreducelearn;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.HashMap;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class GroupByWordCount {
/**
* WritableComparable: classes that implement WritableComparable can be compared
* with each other, and every class used as a map/reduce key should implement it.
* (MapReduceBase, OutputCollector and Reporter belong to the old mapred API and
* are not used with the new-API Mapper/Reducer classes in this example.)
*
*/
public static class TokenizerMapper extends
Mapper<Object, Text, Text, IntWritable> {
/**
* LongWritable, IntWritable and Text are Hadoop wrapper classes for Java data
* types. They implement WritableComparable, so they can be serialized for data
* exchange in a distributed environment; think of them as stand-ins for long,
* int and String.
*/
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
* The map method turns a single input key/value pair into zero or more
* intermediate key/value pairs; the output types do not have to match the
* input types. In the old mapred API the pairs were emitted through
* OutputCollector.collect(k, v); with the new API used here they are emitted
* through the Context passed to map.
*/
public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
/**
* Sample input:
*   c++ java hello
*   world java hello
*   you me too
* In the map phase each line arrives as the value, with its byte offset as the key:
*   0  c++ java hello
*   16 world java hello
*   34 you me too
*/
/**
* The loop below tokenizes each line and produces key/value pairs such as
*   c++ 1, java 1, hello 1, world 1, you 1, me 1, too 1
* (pre-aggregated per map() call by the HashMap); these pairs become the
* input of the reduce phase.
*/
// String[] str = value.toString().split("#");
// Pre-aggregate counts within this map() call. Use String keys: reusing the
// mutable Text field "word" as a HashMap key would corrupt the map, because
// every entry would end up sharing one object.
HashMap<String, Integer> hashmap = new HashMap<String, Integer>();
StringTokenizer itr = new StringTokenizer(value.toString());
// System.out.println("value: " + value.toString());
// System.out.println("key: " + key.toString());
while (itr.hasMoreTokens()) {
String token = itr.nextToken();
if (hashmap.containsKey(token)) {
hashmap.put(token, hashmap.get(token) + 1);
} else {
hashmap.put(token, 1);
}
}
for (String tokenKey : hashmap.keySet()) {
word.set(tokenKey);
context.write(word, new IntWritable(hashmap.get(tokenKey)));
}
}
}
static class UserAndPostWritable implements Writable{
/**
* Example of a custom Writable (not used by this word-count job):
* type "U" marks a user record, "P" marks a post record.
*/
private String type;
private String data;
public UserAndPostWritable()
{
}
public UserAndPostWritable(String type, String data)
{
super();
this.type = type;
this.data = data;
}
public String getType()
{
return type;
}
public void setType(String type)
{
this.type = type;
}
public String getData()
{
return data;
}
public void setData(String data)
{
this.data = data;
}
@Override
public void readFields(DataInput input) throws IOException
{
type = input.readUTF();
data = input.readUTF();
}
@Override
public void write(DataOutput output) throws IOException
{
output.writeUTF(type);
output.writeUTF(data);
}
}
public static class IntSumReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
/**
* Before reduce runs, the framework groups the map output by key, so the
* reducer receives data of the form:
* (c++ [1]) (java [1,1]) (hello [1,1]) (world [1]) (you [1]) (me [1]) (too [1])
*
*/
public void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
/**
* The reduce method below sums the counts for each key and writes results
* such as: c++ 1 hello 2 java 2 me 1 too 1 world 1 you 1
*
*/
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public static void main(String[] args) throws Exception {
/**
* The job configuration describes the map-reduce work to the Hadoop framework.
* (The old mapred API used JobConf and constructors such as JobConf() or
* JobConf(Configuration conf); the new API used here configures everything
* through Configuration and Job.)
*/
Configuration conf = new Configuration();
// System.setProperty("hadoop.home.dir",
// "D:/linux/hadoop-2.6.4/hadoop-2.6.4");
String[] otherArgs = new GenericOptionsParser(conf, args)
.getRemainingArgs();
if (otherArgs.length < 2) {
System.err.println("Usage: wordcount <in> [<in>...] <out>");
System.exit(2);
}
Job job = Job.getInstance(conf, "word count"); // create the job and give it a name
job.setJarByClass(GroupByWordCount.class);
job.setMapperClass(TokenizerMapper.class); // set the Mapper class for the job
job.setCombinerClass(IntSumReducer.class); // set the Combiner class (reuses the reducer)
job.setReducerClass(IntSumReducer.class); // set the Reducer class
job.setOutputKeyClass(Text.class); // set the output key type
job.setOutputValueClass(IntWritable.class); // set the output value type
for (int i = 0; i < otherArgs.length - 1; ++i) {
// every argument except the last is an input path
FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
}
// the last argument is the output path
FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
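Assuming the class above is packaged into a jar (the jar name and paths below are placeholders), the job is submitted with one or more input directories followed by the output directory:

hadoop jar groupby-wordcount.jar mapreducelearn.GroupByWordCount /user/demo/input /user/demo/output

Summing is associative and commutative, so reusing IntSumReducer as the combiner is safe here; together with the in-map HashMap aggregation it cuts down the amount of data shuffled from map to reduce.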