MapReduce以及第一个案例wordcount

最新推荐文章于 2023-10-02 01:04:25 发布

我要变成万人迷

最新推荐文章于 2023-10-02 01:04:25 发布

阅读量182

点赞数

文章标签： mapreduce

本文链接：https://blog.csdn.net/qq_43668119/article/details/105146278

版权

MapReduce
处理海量数据怎么计算的问题
Mapreduce是一种思想，也是一种计算框架
MapReduce给我们的规范：（框架）
1.思想的规范：
map：映射 key-value
把数据划分为最小的数据单元。
什么是最小？按照实际的业务逻辑划分最小单位。
reduce：合并
按key相同的，value值合并。
合并的两种方案：1.每一组分别合并
2.整体进行合并。
2.代码的规范
Mapper类
把数据划分为最小的数据单元，并以键值对的形式输出的具体代码实现。
Reducer
按Key相同，value值合并。以键值对展示的具体代码实现
Driver
一般是主方法，用来指定本次作业所使用的Mapper和Reducer，以及它们各自的配置。
比如输入输出文件的路径、输入输出的泛型等。
wordCount 案例：
思路：
1.必须有原文件（很多行数据，每一行都有很多单词用空格分开）
2.每一行都会执行一次map操作
（对每一行数据都进行split分割，形成单词的数组）
注意:所有行执行完操作之后才能执行reduce操作
3.把分割出来的数组循环，在循环体内把每个单词以键值对（key，1）向下传递
4.reduce接收的数据样式是（key，[1，1，1，…]），即同一个key对应的所有value组成的序列。
5.对序列里的数据求和。
6.把结果写出到文件中。
代码：

     package mapReduce.wordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
/*
Mapper<LongWritable,Text,Text,IntWritable>
注意：所有的泛型一定是可以序列化的（与java的序列化原理一样，表现形式不同）
文件作为mapper的输入，最小的数据单元以键值对的形式作为输出。
LongWritable：文件的偏移量
       偏移量是指每一行数据的开头相对于文件起始位置的字节数（从0开始计数），如第一行有10个字符，
       再加上1个换行符，那么第二行的偏移量就是11。
Text：文件每行的实际内容。
Text：输出时候的实际内容（这里指的是单词）
IntWritable：每个单词对应的数字。
 */

/*
Mapper中的方法：
 setup：任务开始时，最先开始执行的程序，仅执行一次，相当于初始化一些内容，一般不会重写
 map：每来一个键值对，都需要执行一次map方法，划分最小数据单元的操作就在这里执行。一般需要重写。
 cleanup：任务结束时，最后执行的程序，仅执行一次，相当于做一些收尾的工作，一般不重写
 run:涉及一些核心内容，没有涉及特殊情况不重写
 就像经典的理论：打开冰箱，将每一头猪依次放进去。最后关上冰箱。
 */
/**
 * Mapper of the word-count job.
 *
 * <p>Each call to {@link #map} receives one line of the input file and emits a
 * {@code (word, 1)} pair for every word found on that line.
 */
public class WordCount extends Mapper<LongWritable,Text,Text,IntWritable>{
    // Output objects are reused across map() calls so that no new Writable is
    // allocated per word; context.write() is called before the next set().
    final Text outkey = new Text();
    final IntWritable outvalue = new IntWritable(1);

    /**
     * Splits one input line into words and writes {@code (word, 1)} for each of them.
     *
     * @param key     input key of the record (not used by this mapper)
     * @param value   the text of the line
     * @param context job context the output pairs are written to
     * @throws IOException          if writing an output pair fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Split on runs of whitespace rather than a single space, so repeated
        // separators do not produce empty "words".
        String[] words = line.split("\\s+");
        for (String word : words) {
            // A blank line or leading whitespace still yields one empty token; skip it
            // so that the empty string is never counted as a word.
            if (word.isEmpty()) {
                continue;
            }
            outkey.set(word);
            context.write(outkey, outvalue);
        }
    }
}

package mapReduce.wordCount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer of the word-count job: adds up all counts received for one word and
 * writes a single {@code (word, total)} pair.
 */
public class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
    /**
     * Sums the values grouped under {@code key} and writes the total.
     *
     * @param key     the word
     * @param values  the counts emitted for that word
     * @param context job context the result is written to
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable count : values) {
            total = total + count.get();
        }
        IntWritable result = new IntWritable(total);
        context.write(key, result);
    }
}

package mapReduce.wordCount;


import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.FileInputStream;
import java.io.IOException;


/**
 * Driver of the word-count job: configures the job (output types, mapper/reducer
 * classes, input/output paths) and submits it.
 */
public class WordCountDriver {
    /** Input path used when no program arguments are given. */
    private static final String DEFAULT_INPUT = "file:///C:/WordCount/int.txt";
    /** Output path used when fewer than two program arguments are given. */
    private static final String DEFAULT_OUTPUT = "file:///C:/WordCount/out.txt";

    /**
     * Configures and runs the job, then exits with status 0 on success and 1 on failure.
     *
     * @param args optional: {@code args[0]} is the input path and {@code args[1]} the
     *             output path; missing arguments fall back to the built-in defaults
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        // Create the job instance.
        Job job = Job.getInstance();

        // 4 output types: map output key/value and final output key/value.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 3 classes: the jar-locating class, the mapper and the reducer.
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(WordCount.class);
        job.setReducerClass(WordCountReducer.class);

        // 2 paths: input and output.
        FileInputFormat.setInputPaths(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        // 1 submission: wait for the job and report its outcome through the exit
        // status, so a failed job is not mistaken for a successful one.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

代码中的问题：
1.输入输出路径：
可以像上面代码WordCountDriver中一样，将路径写死，但是当我们打jar包放在linux中执行的时候，这样并不方便。所以，可以采用参数的方式传入路径：打开 Run → Edit Configurations → Program arguments，在其中填写输入路径和输出路径。注意：路径之间用空格隔开。
在这里插入图片描述

改了之后输入输出路径的代码：

  // Take the paths from the program arguments instead of hard-coding them:
  // args[0] is used as the input path and args[1] as the output path.
  FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

2.输出路径已存在的问题
MapReduce要求输出路径在作业运行之前不存在，否则作业会抛出FileAlreadyExistsException而失败。每次运行前都手动删除很麻烦，所以如果上面代码中的输出路径已经存在，可以在提交作业之前用代码直接删除。

 // If the output path already exists, delete it before the job is submitted.
        // Build the output path from the second program argument.
        Path outputpath = new Path(args[1]);
        // Obtain the file system that the output path's URI refers to.
        // Configuration here must be org.apache.hadoop.conf.Configuration.
        FileSystem fs = FileSystem.get(new URI(outputpath.toString()),new Configuration());
        // Check whether that file system already contains the output path.
        if(fs.exists(outputpath)){
            // It does: delete it recursively.
            fs.delete(outputpath,true);
        }

3.控制台信息的显示问题：
在resource下放log4j.properties文件。
4.对driverClass的改写，并且采用内部类的形式改写代码：

package mapReduce.wordCount;

import mapReduce.wordCount.common.SubmitJobUtil;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import javax.security.auth.login.Configuration;
import java.io.IOException;
import java.net.URI;

/**
 * Word-count job packaged as a single class: the mapper and reducer are public
 * static nested classes, and {@link #main} hands this class to
 * {@code SubmitJobUtil.submitJob}, which performs the job setup and submission.
 */
public class WordCountApp {
    /** Emits {@code (word, 1)} for every word of every input line. */
    public static class wcMapper extends Mapper<LongWritable,Text,Text,IntWritable>{
        // Reused for every output pair instead of allocating new objects per word.
        private final Text outKey = new Text();
        private final IntWritable one = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // Split on runs of whitespace so repeated separators do not create empty tokens.
            for (String word : line.split("\\s+")) {
                // Blank lines and leading whitespace still produce one empty token; skip it.
                if (word.isEmpty()) {
                    continue;
                }
                outKey.set(word);
                context.write(outKey, one);
            }
        }
    }

    /** Sums the counts received for one word and writes {@code (word, total)}. */
    public static class wcReducer extends Reducer<Text,IntWritable,Text,IntWritable>{
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable value : values) {
                sum += value.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    /**
     * Submits the job.
     *
     * <p>The job configuration (jar class, mapper/reducer classes, output types,
     * input/output paths, removal of an existing output path) is not written out
     * here; it is delegated to {@code SubmitJobUtil.submitJob}.
     *
     * @param args program arguments, passed through unchanged to {@code SubmitJobUtil.submitJob}
     */
    public static void main(String[] args) throws Exception {
        SubmitJobUtil.submitJob(WordCountApp.class,args);
    }
}

package mapReduce.wordCount.common;

import mapReduce.wordCount.WordCountApp;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.net.URI;

/**
 * Helper that configures and submits a MapReduce job whose mapper and reducer are
 * declared as public nested classes of a driver class.
 */
public class SubmitJobUtil {

    /**
     * Builds a job from {@code driverClass} and runs it to completion.
     *
     * <p>Every public nested class of {@code driverClass} that is a {@code Mapper} or a
     * {@code Reducer} is registered on the job; its output key/value classes are read
     * from the third and fourth type arguments of its generic superclass. Other nested
     * classes are ignored. An existing output path is deleted before submission.
     *
     * @param driverClass class containing the nested mapper/reducer; also used to locate the jar
     * @param args        {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws IllegalArgumentException if fewer than two arguments are given, or a nested
     *                                  mapper/reducer does not declare its type arguments directly
     * @throws IllegalStateException    if the job finishes unsuccessfully
     * @throws Exception                any error raised by Hadoop while configuring or running the job
     */
    public static void submitJob(Class<?> driverClass, String[] args) throws Exception {
        if (args == null || args.length < 2) {
            throw new IllegalArgumentException("Expected arguments: <input path> <output path>");
        }

        Job job = Job.getInstance();
        job.setJarByClass(driverClass);

        for (Class<?> nested : driverClass.getClasses()) {
            // Decide what the nested class is BEFORE inspecting its generics, so that
            // unrelated nested classes are skipped instead of failing a cast.
            if (Mapper.class.isAssignableFrom(nested)) {
                Type[] types = superclassTypeArguments(nested);
                job.setMapperClass(nested.asSubclass(Mapper.class));
                job.setMapOutputKeyClass(rawClass(types[2]));
                job.setMapOutputValueClass(rawClass(types[3]));
            } else if (Reducer.class.isAssignableFrom(nested)) {
                Type[] types = superclassTypeArguments(nested);
                job.setReducerClass(nested.asSubclass(Reducer.class));
                job.setOutputKeyClass(rawClass(types[2]));
                job.setOutputValueClass(rawClass(types[3]));
            }
        }

        Path inputPath = new Path(args[0]);
        Path outpath = new Path(args[1]);

        // Resolve the file system from the path itself using the job's configuration,
        // then remove a pre-existing output path.
        FileSystem fs = outpath.getFileSystem(job.getConfiguration());
        if (fs.exists(outpath)) {
            fs.delete(outpath, true);
        }

        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outpath);

        // Surface a failed job to the caller instead of returning as if it succeeded.
        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Job failed: " + job.getJobName());
        }
    }

    /** Returns the type arguments of {@code taskClass}'s generic superclass (at least four). */
    private static Type[] superclassTypeArguments(Class<?> taskClass) {
        Type superType = taskClass.getGenericSuperclass();
        if (superType instanceof ParameterizedType) {
            Type[] typeArgs = ((ParameterizedType) superType).getActualTypeArguments();
            if (typeArgs.length >= 4) {
                return typeArgs;
            }
        }
        throw new IllegalArgumentException(taskClass.getName()
                + " must directly extend Mapper or Reducer with four explicit type arguments");
    }

    /** Converts a reflected type argument to the class object the job setters expect. */
    private static Class<?> rawClass(Type type) {
        if (type instanceof Class) {
            return (Class<?>) type;
        }
        if (type instanceof ParameterizedType) {
            // e.g. a parameterized output type: use its raw class.
            return rawClass(((ParameterizedType) type).getRawType());
        }
        throw new IllegalArgumentException("Cannot resolve a concrete class for type " + type);
    }
}

如果在 job.setMapOutputKeyClass(Class.forName (outKeyType.getTypeName()));中显示没有getTypeName（）这个方法，请把jdk改成1.8版本试试。
在这里插入图片描述