I. Supplementary Notes
1. How to implement custom serialization
1) Implement the Writable interface  2) Implement the WritableComparable interface
If the custom data type is used as a Key, it must be sortable, so it needs to implement the WritableComparable interface; if it is only used as a Value, implementing Writable is enough.
In the phone-flow example, the phone number is the Key and PhoneFlowWritable is the Value.
In an MR program, only sortable data types can be used as a Key, because Keys have to be sorted.
Many of the outputs on the Map side are ordered. Ordered by what? By the Key.
public interface WritableComparable<T> extends Writable, Comparable<T> {
}
package com.bigdata.day12;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * @Author laoyan
 * @Description TODO
 * @Date 2022/8/2 9:39
 * @Version 1.0
 *
 * This data type can be used as a Key because it can be sorted; of course it can also be used as a Value.
 * When defining a custom type, when do you implement Writable,
 * and when do you implement WritableComparable?
 * It simply depends on whether the custom type will be used as a Key.
 */
public class AdminWritable implements WritableComparable<AdminWritable> {
private String name;
private int age;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getAge() {
return age;
}
public void setAge(int age) {
this.age = age;
}
@Override
public int compareTo(AdminWritable admin) {
// sort by age in ascending order
return this.age - admin.getAge();
}
@Override
public void write(DataOutput out) throws IOException {
// serialize the fields in a fixed order
out.writeUTF(name);
out.writeInt(age);
}
@Override
public void readFields(DataInput in) throws IOException {
this.name = in.readUTF();
this.age = in.readInt();
}
}
2. Optimizing a piece of code
Problem encountered: the first version of the phone-flow job allocated a brand-new Text and PhoneFlowWritable object for every input record, which creates a lot of unnecessary garbage on large inputs.
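A minimal sketch of the unoptimized map() being described, assuming it lives in the same PhoneFlowMapper class shown below:

// Hypothetical "before" version: a new Text and a new PhoneFlowWritable are allocated on every call
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] arr = value.toString().split("\\s+");
    Text outKey = new Text(arr[1]);                        // new object per record
    PhoneFlowWritable outValue = new PhoneFlowWritable();  // new object per record
    outValue.setPhone(arr[1]);
    outValue.setUpFlow(Integer.parseInt(arr[arr.length - 3]));
    outValue.setDownFlow(Integer.parseInt(arr[arr.length - 2]));
    context.write(outKey, outValue);
}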

Solution:
Simply reuse a single new Text() and a single new PhoneFlowWritable() object across all map() calls.
The optimized code:
package com.bigdata.phoneflow;
import com.bigdata.WordCountMapper;
import com.bigdata.WordCountPartitioner;
import com.bigdata.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Iterator;
/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/1 14:33
* @Version 1.0
*/
class PhoneFlowMapper extends Mapper<LongWritable, Text,Text,PhoneFlowWritable> {
// setup() runs only once per task, so it is the usual place for initialization work
Text text = null;
PhoneFlowWritable phoneFlowWritable = null;
@Override
protected void setup(Mapper<LongWritable, Text, Text, PhoneFlowWritable>.Context context) throws IOException, InterruptedException {
text = new Text();
phoneFlowWritable = new PhoneFlowWritable();
}
// Turn each input line into: phone number --> PhoneFlowWritable object
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, PhoneFlowWritable>.Context context) throws IOException, InterruptedException {
//1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
String line = value.toString();
String[] arr = line.split("\\s+");// \s matches one whitespace character, + means one or more occurrences
String phone = arr[1];
int upFlow = Integer.parseInt(arr[arr.length-3]);
int downFlow = Integer.parseInt(arr[arr.length-2]);
text.set(phone);
phoneFlowWritable.setPhone(phone);
phoneFlowWritable.setUpFlow(upFlow);
phoneFlowWritable.setDownFlow(downFlow);
context.write(text ,phoneFlowWritable);
}
}
// Input: phone number --> PhoneFlowWritable traffic records; Output: phone number --> aggregated result
class PhoneFlowReducer extends Reducer<Text,PhoneFlowWritable,Text,Text> {
Text value = null;
@Override
protected void setup(Reducer<Text, PhoneFlowWritable, Text, Text>.Context context) throws IOException, InterruptedException {
value =new Text();
}
// reduce groups identical keys together and collects their values into one iterator
@Override
protected void reduce(Text key, Iterable<PhoneFlowWritable> values, Reducer<Text, PhoneFlowWritable, Text, Text>.Context context) throws IOException, InterruptedException {
int upFlowNum = 0;
int downFlowNum = 0;
Iterator<PhoneFlowWritable> iterator = values.iterator();
while(iterator.hasNext()){
PhoneFlowWritable phoneFlowWritable = iterator.next();
upFlowNum += phoneFlowWritable.getUpFlow();
downFlowNum += phoneFlowWritable.getDownFlow();
}
StringBuffer sb = new StringBuffer();
sb.append("Phone "+key+" traffic summary: ");
sb.append("upstream traffic: "+upFlowNum);
sb.append(", downstream traffic: "+downFlowNum);
sb.append(", total traffic: "+(upFlowNum + downFlowNum));
value.set(sb.toString());
context.write(key,value);
}
}
public class PhoneFlowDriver {
public static void main(String[] args) throws Exception{
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "PhoneFlowStatistics");
// Mapper settings
job.setMapperClass(PhoneFlowMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(PhoneFlowWritable.class);
// Reducer settings
job.setReducerClass(PhoneFlowReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// Input path of the data to be processed, and the output path for the results
FileInputFormat.setInputPaths(job,new Path("mr01/phoneFlow/input"));
// The output folder must not exist beforehand, otherwise the job fails
FileOutputFormat.setOutputPath(job,new Path("mr01/phoneFlow/output"));
// Wait for the job to finish
boolean b = job.waitForCompletion(true);
// Exit the JVM here with an appropriate status code
System.exit(b ? 0:-1);
}
}
The boolean parameter verbose of waitForCompletion: true means the job's progress is printed while it runs; false means the call just waits for the job to finish without printing any log output.
II. Finding the Maximum Temperature
0188010010999992000010100004+70930-008670FM-12+0009ENJA V0202101N002110021019N0025001N1+00101+00031098181ADDAA106004191AY181061AY251061GF108991081061002501999999MA1999999098061MD1510071+9999MW1501REMSYN088AAXX 01004 01001 11325 82104 10010 20003 39806 49818 55007 60041 75085 886// 333 91119;
Requirement: find the maximum temperature of each year.
Notes on the data format (0-based character positions):
1. Characters [15,18] of each line hold the year.
2. Character 87 holds the sign of the temperature.
3. Characters [88,91] hold the temperature; 9999 means the reading is invalid.
4. Character 92 is the quality code; the values 0, 1, 4, 5 and 9 mean the temperature is valid.
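Applying these offsets to the sample record above gives (matching the substring calls in the Mapper below):

line.substring(15, 19) -> "2000"   // the year
line.substring(87, 92) -> "+0010"  // signed temperature, parsed as 10
line.substring(92, 93) -> "1"      // quality code, valid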
Approach
Map side: Key = year, Value = temperature.
Reduce side: what does the reducer receive? key [3243, 3434, 34343, 34343]
Find the maximum of the values in the iterator and write it out to disk.
package com.bigdata.temp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class TempMaxMapper extends Mapper<LongWritable, Text,Text, IntWritable>{
Text year = null;
IntWritable temp = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
year = new Text();
temp = new IntWritable();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// desired output: year -> temperature
String line = value.toString();
// extract the year
String _year = line.substring(15, 19);
// extract the temperature together with its sign
int _temp = Integer.parseInt(line.substring(87, 92));
String validateCode = line.substring(92, 93);// extract the quality code
if(_temp == 9999 || validateCode.matches("[^01459]")){
return ; // skip invalid records
}
year.set(_year);
temp.set(_temp);
context.write(year,temp);
}
}
class TempMaxReducer extends Reducer<Text, IntWritable,Text, Text>{
Text text = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
text = new Text();
}
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
// e.g. 1901 [xxx,xxx,xxx]
int maxTemp = Integer.MIN_VALUE; // any concrete starting value could be wrong, so start from the smallest int
for (IntWritable temp:values) {
// keep the larger of the two values
maxTemp = Integer.max(maxTemp,temp.get());
}
text.set("The maximum temperature of this year is "+maxTemp);
context.write(key,text);
}
}
public class TempMaxDriver {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "MaxTemperature");
// specify the map side
job.setMapperClass(TempMaxMapper.class);
// map output: (year, temperature)
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// specify the reduce side
job.setReducerClass(TempMaxReducer.class);
// reduce output: (year, summary text)
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr03/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr03/output"));
// true prints the running log to the console, false just runs silently
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
III. TopN Case
For each user, find the five movies they rated highest.
1. There is a data format called JSON (from JavaScript)
JSON: JavaScript Object Notation
H5: used to build pages
Java: used to write back-end code
H5 pages and Java back ends have to exchange data, and that data can be sent in several formats, such as XML and JSON.
2. How JSON is written
JSON syntax is derived from JavaScript object-literal syntax:
Data comes in name/value pairs: "name":"laoyan"
Pairs are separated by commas: "name":"laoyan","age":30
Curly braces hold objects: {"name":"laoyan","age":30}
Square brackets hold arrays: [{"name":"laoyan","age":30},{"name":"laoyan2","age":10}]

rating.json
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}
3. Importing the library
Because we want to convert the JSON data into Java objects, we use Jackson.
<!-- import the Jackson libraries -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.9.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>2.9.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.9.5</version>
</dependency>
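As a quick sanity check that the dependency works, a minimal sketch of parsing one line of rating.json with Jackson might look like this (Rating here is just a hypothetical throw-away POJO, not the Writable used later):

import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonParseDemo {
    // plain POJO whose public fields match the keys in rating.json
    public static class Rating {
        public String movie;
        public String rate;
        public String timeStamp;
        public String uid;
    }

    public static void main(String[] args) throws Exception {
        String json = "{\"movie\":\"1193\",\"rate\":\"5\",\"timeStamp\":\"978300760\",\"uid\":\"1\"}";
        ObjectMapper objectMapper = new ObjectMapper();
        // readValue maps each JSON key onto the field with the same name
        Rating rating = objectMapper.readValue(json, Rating.class);
        System.out.println(rating.movie + " was rated " + rating.rate + " by user " + rating.uid);
    }
}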
A small Maven tip: declare the version number once as a property, then reference it where needed:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.bigdata</groupId>
<artifactId>HadoopDay11</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Set the packaging type to jar.
Three values are possible here:
jar: a plain Java project, packaged as a jar
war: a web project, packaged as a war
pom: the parent module of a Maven multi-module (parent/child) project
-->
<packaging>jar</packaging>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<!-- declare a property (variable) -->
<jackson.version>2.9.5</jackson.version>
<hadoop.version>3.3.1</hadoop.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>ch.qos.logback</groupId>
<artifactId>logback-classic</artifactId>
<version>1.0.6</version>
</dependency>
<!-- import the Jackson libraries -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
<version>${jackson.version}</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>${jackson.version}</version>
</dependency>
</dependencies>
</project>
4. Write an entity class that maps the JSON data
package com.bigdata.day12.topn;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/2 14:26
* @Version 1.0
*/
public class RatingWritable implements Writable {
private String movie;
private int rate;
private String timeStamp;
private String uid;
public RatingWritable() {
}
public RatingWritable(String movie, int rate, String timeStamp, String uid) {
this.movie = movie;
this.rate = rate;
this.timeStamp = timeStamp;
this.uid = uid;
}
public String getMovie() {
return movie;
}
public void setMovie(String movie) {
this.movie = movie;
}
public int getRate() {
return rate;
}
public void setRate(int rate) {
this.rate = rate;
}
public String getTimeStamp() {
return timeStamp;
}
public void setTimeStamp(String timeStamp) {
this.timeStamp = timeStamp;
}
public String getUid() {
return uid;
}
public void setUid(String uid) {
this.uid = uid;
}
@Override
public String toString() {
return "RatingWritable{" +
"movie='" + movie + '\'' +
", rate=" + rate +
", timeStamp='" + timeStamp + '\'' +
", uid='" + uid + '\'' +
'}';
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(movie);
out.writeInt(rate);
out.writeUTF(timeStamp);
out.writeUTF(uid);
}
@Override
public void readFields(DataInput in) throws IOException {
movie = in.readUTF();// ctrl + d
rate = in.readInt();
timeStamp = in.readUTF();
uid = in.readUTF();
}
}


For the TopN job, the entity above is adjusted slightly and renamed MovieWritable, with uid stored as an int so the mapper can emit it as an IntWritable key.
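A plausible sketch of MovieWritable, assuming it otherwise mirrors RatingWritable (the exact class is not reproduced in the text):

import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class MovieWritable implements Writable {
    private String movie;
    private int rate;
    private String timeStamp;
    private int uid;          // int, so the mapper can emit new IntWritable(getUid())

    public MovieWritable() {
    }
    public String getMovie() { return movie; }
    public void setMovie(String movie) { this.movie = movie; }
    public int getRate() { return rate; }
    public void setRate(int rate) { this.rate = rate; }
    public String getTimeStamp() { return timeStamp; }
    public void setTimeStamp(String timeStamp) { this.timeStamp = timeStamp; }
    public int getUid() { return uid; }
    public void setUid(int uid) { this.uid = uid; }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(movie);
        out.writeInt(rate);
        out.writeUTF(timeStamp);
        out.writeInt(uid);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        movie = in.readUTF();
        rate = in.readInt();
        timeStamp = in.readUTF();
        uid = in.readInt();
    }

    @Override
    public String toString() {
        return "MovieWritable{movie='" + movie + "', rate=" + rate + ", uid=" + uid + "}";
    }
}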

The complete code:
package com.bigdata.topN;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.fasterxml.jackson.databind.ObjectMapper; // use the Jackson (com.fasterxml) artifacts declared in the pom above
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
class TopNMapper extends Mapper<LongWritable, Text, IntWritable,MovieWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
/**
* Read one line, which is one JSON record; parse it and extract the uid, the movie and the rating.
*/
String jsonStr = value.toString();
// How do we parse JSON? With a library such as Jackson, Fastjson or Gson.
ObjectMapper objectMapper = new ObjectMapper();
MovieWritable movie = objectMapper.readValue(jsonStr, MovieWritable.class);
System.out.println("mapper-side record: "+movie);
context.write(new IntWritable(movie.getUid()),movie);
}
}
class TopNReducer extends Reducer<IntWritable,MovieWritable,Text, NullWritable>{
@Override
protected void reduce(IntWritable key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
// key = uid, values = all the movies this user rated
// sort them and keep the top five
List<MovieWritable> list = new ArrayList<MovieWritable>();
for (MovieWritable movie:values) {
// Do not add 'movie' directly here: Hadoop reuses the same value object, so the list would end up full of duplicates of the last record
// list.add(movie);
MovieWritable rate = new MovieWritable();
rate.setRate(movie.getRate());
rate.setMovie(movie.getMovie());
rate.setTimeStamp(movie.getTimeStamp());
rate.setUid(movie.getUid());
list.add(rate);
}
System.out.println(list);
// sort by rating in descending order
Collections.sort(list, new Comparator<MovieWritable>() {
@Override
public int compare(MovieWritable m1, MovieWritable m2) {
return m2.getRate() - m1.getRate();
}
});
System.out.println(list);
// take the first five (or fewer, if the user rated fewer movies)
int length = Math.min(5,list.size());
// StringBuffer vs StringBuilder: StringBuffer is thread-safe
StringBuffer sb =new StringBuffer("The five favourite movies of user "+key.get()+" are:\n");
for (int i = 0; i < length; i++) {
MovieWritable movie = list.get(i);
sb.append(movie.getMovie()+", score: "+movie.getRate()+"\n");
}
context.write(new Text(sb.toString()),NullWritable.get());
}
}
public class TopNDriver {
public static void main(String[] args) throws Exception{
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "MovieTopN");
// specify the map side
job.setMapperClass(TopNMapper.class);
// map output: (uid, MovieWritable)
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(MovieWritable.class);
job.setReducerClass(TopNReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr05/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr05/output"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
Result:

IV. Yarn
1. Concept
The three core pieces of Hadoop: HDFS, MapReduce and Yarn.
Yarn behaves much like an operating system for the cluster.
Yarn is the environment MapReduce runs in; it manages the things a program needs in order to run (memory, CPU, bandwidth and other resources).

Yarn was born inside Hadoop, but it has grown into a general-purpose resource-management platform that engines other than MapReduce can also run on.
2. The components of Yarn


Yarn really consists of two main daemons, plus two runtime concepts. You must be clear about all of them:
1. ResourceManager (the "boss"): exactly one.
It manages the whole Yarn platform and contains the resource scheduler.
2. NodeManager (the supervisor on each machine): many.
It takes orders from the ResourceManager and acts as the housekeeper of its own machine.
3. Container
Each NodeManager hosts one or more containers. A container is a bundle of resources (CPU, memory, disk, ...), conceptually similar to a small virtual machine.
4. AppMaster (the "project manager")
Every submitted MapReduce job gets its own AppMaster, whose main role is to drive the execution of that whole job.
3. How to configure and set up Yarn
In the folder /opt/installs/hadoop/etc/hadoop:
mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
This sets yarn as the execution framework for MapReduce.
yarn-site.xml
<!-- run the ResourceManager on the first server -->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>bigdata01</value>
</property>
<!-- configure Yarn's shuffle auxiliary service -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
Check that the following user settings are present in hadoop-env.sh:
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
Continue configuring: to avoid AppMaster classpath errors, add the following to
yarn-site.xml
<property>
<name>yarn.application.classpath</name>
<value>/opt/installs/hadoop/etc/hadoop:/opt/installs/hadoop/share/hadoop/common/lib/*:/opt/installs/hadoop/share/hadoop/common/*:/opt/installs/hadoop/share/hadoop/hdfs:/opt/installs/hadoop/share/hadoop/hdfs/lib/*:/opt/installs/hadoop/share/hadoop/hdfs/*:/opt/installs/hadoop/share/hadoop/mapreduce/*:/opt/installs/hadoop/share/hadoop/yarn:/opt/installs/hadoop/share/hadoop/yarn/lib/*:/opt/installs/hadoop/share/hadoop/yarn/*</value>
</property>
Getting the value for the classpath (see the command below):
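The value is presumably obtained by running the hadoop classpath command on one of the nodes and pasting its output into the property above:

hadoop classpath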

Distribute mapred-site.xml and yarn-site.xml to the other two machines:
cd /opt/installs/hadoop/etc/hadoop/
xsync.sh mapred-site.xml yarn-site.xml
Starting and stopping the Yarn platform:
Start: start-yarn.sh
Stop: stop-yarn.sh

You can also check it through the web UI.
The address uses the IP of the machine running the ResourceManager:
http://192.168.233.128:8088

4. About the start and stop commands (see the summary below)
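A short recap of the usual commands, assuming a standard Hadoop 3.x installation like the one configured above:

start-dfs.sh / stop-dfs.sh      # start / stop HDFS (NameNode, DataNodes, SecondaryNameNode)
start-yarn.sh / stop-yarn.sh    # start / stop Yarn (ResourceManager, NodeManagers)
start-all.sh / stop-all.sh      # start / stop both HDFS and Yarn
hdfs --daemon start namenode    # start or stop a single HDFS daemon on the current machine
yarn --daemon stop nodemanager  # start or stop a single Yarn daemon on the current machine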

5. Running a WordCount job on Yarn
Upload a wc.txt file to HDFS, then run the computation on the Yarn platform.

Data:
hadoop spark hello hadoop
spark hello flink world
scala python python scala
Run the WordCount example that ships with Hadoop:
hadoop jar /opt/installs/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar wordcount /home/wc.txt /home/output

We can also watch the job's progress in the Yarn web UI.

This example runs on Yarn: the input data must be on HDFS and Yarn must be running. Although the jar sits on the local machine, it is uploaded to HDFS as part of job submission.
V. The three run/development modes of a MapReduce job
1. Local mode
The data is local, the code is local, and the MR job runs with your own machine's resources.

Both the input and output paths are local paths, and the resources consumed are local as well.
2. Local mode 2
The data is on HDFS, the code is local, and the MR job still runs with your own machine's resources.
System.setProperty("HADOOP_USER_NAME","root");
Configuration configuration = new Configuration();
configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
In this mode the input and output paths refer to paths on HDFS.
3. Yarn mode
The data is on HDFS and the code runs on Yarn.
System.setProperty("HADOOP_USER_NAME","root");
Configuration configuration = new Configuration();
configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
configuration.set("mapreduce.framework.name","yarn");
// enable cross-platform job submission
configuration.set("mapreduce.app-submission.cross-platform", "true");
Case study: run our own WordCount on Yarn.
Modify the code as follows:
package com.bigdata.day12.workcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/1 10:02
* @Version 1.0
*/
public class WordCountDriver2 {
public static void main(String[] args) throws Exception {
System.setProperty("HADOOP_USER_NAME","root");
Configuration configuration = new Configuration();
// Use HDFS as the file system (not the local one)
configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
// Run the job on the Yarn platform instead of locally
configuration.set("mapreduce.framework.name","yarn");
// To submit from a Windows client while the MapTasks and ReduceTasks run on the Linux cluster, this cross-platform option must be enabled
configuration.set("mapreduce.app-submission.cross-platform", "true");
Job job = Job.getInstance(configuration, "laoyan WordCount on yarn");
// Mapper settings
// do not forget this line
job.setJarByClass(WordCountDriver2.class);
job.setMapperClass(WordCountMapper2.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// specify which Partitioner class to use
job.setPartitionerClass(WordCountPartitioner2.class);
// the number of reducers also matters, because each reducer produces one result file
// Reducer settings
job.setReducerClass(WordCountReducer2.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// Input path of the data to be processed, and the output path for the results
FileInputFormat.setInputPaths(job,new Path(args[0]));
// The output folder must not exist beforehand, otherwise the job fails
FileOutputFormat.setOutputPath(job,new Path(args[1]));
// Wait for the job to finish
boolean b = job.waitForCompletion(true);
// Exit the JVM here with an appropriate status code
System.exit(b ? 0:-1);
}
}
Package the program into a jar:


Upload the built jar to the Linux server and run it:
hadoop jar WC.jar com.bigdata.day12.workcount.WordCountDriver2 /input /oottpp2
Keep the jar name short.
com.bigdata.day12.workcount.WordCountDriver2 is the fully qualified name of the class containing the main method.
/input is the input path on HDFS.
/oottpp2 is the output path on HDFS for the statistics.



If an error appears when browsing the files from the web UI:

remember to add the following to hdfs-site.xml and restart the cluster:
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
Then add the host mappings to the local Windows hosts file,
i.e. configure the hosts file of the system the browser runs on.
Windows:
Append the mappings between the Hadoop cluster nodes' IPs and hostnames to the end of C:\Windows\System32\drivers\etc\hosts.
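For example (hypothetical addresses: bigdata01 is the hostname used earlier in yarn-site.xml; the other two entries are placeholders for the remaining nodes):

192.168.233.128 bigdata01
192.168.233.129 bigdata02
192.168.233.130 bigdata03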

VI. Exercises
1. Data cleaning
# What data cleaning means
The raw data files that reach a big-data platform usually contain a lot of invalid and incomplete records, so the very first step is to clean them and produce data whose content and format fit the downstream processing.
# Requirement
From the raw phone-traffic data, remove the records whose phone number is "null" and the records that are incomplete.
Data format:
# Source data
id  phone  mac  ip  up_packets  down_packets  up_flow  down_flow  http_status
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157995052 13826544109 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0
1363157995052 null 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 240 0 200
1363157991076 13926435659 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 null null null
# Expected result (remove records whose phone number is invalid or whose upload/download traffic is missing, and keep only the phone number, upload traffic and download traffic)
13726230503 2481 24681
13826544101 264 0
13926435656 132 1512
13926251106 240 0
13726230503 2481 24681
13826544101 264 0
13926435656 132 1512
13926251106 240 0
Coding hints:
# Key point:
The reduce phase can be removed from the MapReduce pipeline entirely; the map output is then written directly to the HDFS output files as the final result.
# How to code it
1. Remove the reducer-related settings from the job: the reducer class and the reducer output key/value types.
2. Manually set the number of reduce tasks to 0:
job.setNumReduceTasks(0);// disable the reducer
package com.bigdata;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class CleanLogMapper extends Mapper<LongWritable, Text,Text,Text>{
Text text = null;
Text text2 = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
text = new Text();
text2 = new Text();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split("\\s+");
String phone = split[1];
String upFlow = split[split.length-3];
String downFlow = split[split.length-2];
if(phone.equals("null") || upFlow.equals("null") || downFlow.equals("null") || split.length != 9){
return ;
}
text.set(phone);
text2.set(upFlow+" "+downFlow);
context.write(text,text2);
}
}
public class CleanLogDriver {
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
conf.set("mapreduce.framework.name","local");
Job job = Job.getInstance(conf, "CleanLogData");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
// Why this line: we did not write a reducer, but a default one would still run, so the number of reduce tasks must be set to 0
job.setNumReduceTasks(0);
job.setMapperClass(CleanLogMapper.class);
// Usually optional, but required when running on Yarn: point it at the driver class
job.setJarByClass(CleanLogDriver.class);
// addInputPath works just as well as the setInputPaths used earlier
FileInputFormat.addInputPath(job,new Path("../WordCount/mr06/input"));
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr06/output"));
// true: keep printing the running log so the progress can be followed
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : -1);
}
}
2. Custom partitioning case
Case: student score analysis
# Sort the student scores in descending order within each subject, and output each subject's scores separately.
The data is as follows:
# Custom partition
Partition the data below (科目 values: 语文 = Chinese, 数学 = Math, 英语 = English):
Name  Subject  Score
张三 语文 10
李四 数学 30
王五 语文 20
赵6 英语 40
张三 数学 50
李四 语文 10
张三 英语 70
李四 英语 80
王五 英语 45
王五 数学 10
赵6 数学 10
赵6 语文 100
package com.bigdata.score;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
class ScorePartitonerMapper extends Mapper<LongWritable, Text, Text,Student>{
Text text = null; // the subject
// First group records of the same subject together, then sort by score and output in descending order
Student student = null;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// Scores must be sorted in descending order, and the default IntWritable sorts ascending, so a custom type is used
text = new Text();
student = new Student();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// This skips the header line (byte offset 0); to skip the second line as well you would also need to know the length of the first line
if(key.get() == 0){
return ;
}
String line = value.toString();
String[] split = line.split("\\s+");
String subject = split[1];
String name = split[0];
int score = Integer.valueOf(split[2]);
text.set(subject);
student.setName(name);
student.setScore(score);
student.setSubject(subject);
context.write(text,student);
}
}
class ScorePartitoner extends Partitioner<Text,Student>{
@Override
public int getPartition(Text subjectText, Student student, int i) {
String subjectName = subjectText.toString();
switch (subjectName) {
case "语文":
return 0;
case "数学":
return 1;
case "英语":
return 2;
default:
return 3;
}
}
}
class Student implements WritableComparable<Student>{
private String name;
private int score;
private String subject;
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public int getScore() {
return score;
}
public void setScore(int score) {
this.score = score;
}
public String getSubject() {
return subject;
}
public void setSubject(String subject) {
this.subject = subject;
}
@Override
public int compareTo(Student o) {
return o.getScore() -this.score;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(name);
out.writeInt(score);
out.writeUTF(subject);
}
@Override
public void readFields(DataInput in) throws IOException {
name = in.readUTF();
score = in.readInt();
subject = in.readUTF();
}
@Override
public String toString() {
return name + " " +subject + " " + score ;
}
}
class ScoreReduer extends Reducer<Text,Student,Text, NullWritable>{
@Override
protected void reduce(Text key, Iterable<Student> values, Context context) throws IOException, InterruptedException {
List<Student> list = new ArrayList<>();
for (Student stu:values) {
Student student = new Student();
student.setSubject(stu.getSubject());
student.setScore(stu.getScore());
student.setName(stu.getName());
list.add(student);
}
Collections.sort(list);
for (Student s:list) {
Text text = new Text(s.toString());
context.write(text,NullWritable.get());
}
}
}
public class ScorePartitionerDriver {
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
conf.set("mapreduce.framework.name","local");
Job job = Job.getInstance(conf, "ScorePartition");
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Student.class);
job.setMapperClass(ScorePartitonerMapper.class);
job.setPartitionerClass(ScorePartitoner.class);
job.setReducerClass(ScoreReduer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// The partitioner produces three partitions for 语文/数学/英语, so run three reduce tasks (one output file per reducer)
job.setNumReduceTasks(3);
// Usually optional, but required when running on Yarn: point it at the driver class
job.setJarByClass(ScorePartitionerDriver.class);
// addInputPath works just as well as the setInputPaths used earlier
FileInputFormat.addInputPath(job,new Path("../WordCount/mr07/input"));
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr07/output"));
// true: keep printing the running log so the progress can be followed
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : -1);
}
}
3. Sorting
Default sorting:
Case: sort the Douyu streamer log data by audience count in ascending order
# Case data
user id  audience count
团团 300
小黑 200
哦吼 400
卢本伟 100
八戒 250
悟空 100
唐僧 100
# Expected result
卢本伟 100
悟空 100
唐僧 100
小黑 200
八戒 250
团团 300
哦吼 400
Hints:
● Default sorting rules
1. The default sort calls compareTo on the mapper's output key to compare records; that defines the sort order.
2. The default order is ascending.
So for this exercise, which field should be the Key and which the Value?
Reference answer:
package com.bigdata.paixu01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable(visitNum),new Text(name));
}
}
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper.class);
// map output: (audience count, name)
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(Text.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
Case: what about sorting the Douyu streamer log data by audience count in descending order?
# Custom sorting
# Case data
团团 300
小黑 200
哦吼 400
卢本伟 100
八戒 250
悟空 100
唐僧 100
# Expected result
哦吼 400
团团 300
八戒 250
小黑 200
卢本伟 100
悟空 100
唐僧 100
Key code hints:
# 1. Define a custom type for the mapper's output key that implements the WritableComparable interface.
# 2. Implement the compareTo method.
# 3. Fill in the write and readFields serialization methods.
package com.bigdata.paixu01;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class IntWritable2 implements WritableComparable<IntWritable2> {
private int value;
public IntWritable2() {}
public IntWritable2(int value) { set(value); }
/** Set the value of this IntWritable. */
public void set(int value) { this.value = value; }
/** Return the value of this IntWritable. */
public int get() { return value; }
// This method is called for you by the framework: wherever sorting happens, compareTo is what gets invoked.
// Basic Java: the same thing happens with sorting utilities such as Collections.sort(list)
@Override
public int compareTo(IntWritable2 o) {
int thisValue = this.value;
int thatValue = o.get();
return (thisValue > thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(value);
}
@Override
public void readFields(DataInput in) throws IOException {
value = in.readInt();
}
@Override
public String toString() {
return value+"";
}
}
Test code:
package com.bigdata.paixu01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable(visitNum),new Text(name));
}
}
class PaiXuMapper2 extends Mapper<LongWritable, Text, IntWritable2,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable2(visitNum),new Text(name));
}
}
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper2.class);
// map output: (IntWritable2 audience count, name)
job.setMapOutputKeyClass(IntWritable2.class);
job.setMapOutputValueClass(Text.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output2"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
public class WatcherWritable implements WritableComparable<WatcherWritable> {
private int watcher;
/**
 * Called whenever sorting happens.
 */
public int compareTo(WatcherWritable o) {
// Variant 2: descending
return o.watcher - this.watcher;
// Variant 1: descending
/*
if(this.watcher>o.watcher){
return -1;
}else if (this.watcher == o.watcher){
return 0;
}else{
return 1;
}*/
}
...
// constructor, getters/setters, serialization methods and toString omitted
}
A further refinement:
Our output puts the number first and the name second, which is slightly different from what was asked for.
Idea: the output format of the file is determined by the Key and the Value.
We can make the Key carry all of the output and emit nothing for the Value by using NullWritable in its place.
package com.bigdata.paixu01;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable(visitNum),new Text(name));
}
}
class PaiXuMapper2 extends Mapper<LongWritable, Text, IntWritable2,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
String name = split[0];
context.write(new IntWritable2(visitNum,name), NullWritable.get());
}
}
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper2.class);
// map output: (IntWritable2 key carrying count and name, NullWritable)
job.setMapOutputKeyClass(IntWritable2.class);
job.setMapOutputValueClass(NullWritable.class);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output2"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
package com.bigdata.paixu01;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class IntWritable2 implements WritableComparable<IntWritable2> {
private int value;
private String name;
public IntWritable2() {}
public IntWritable2(int value,String name) { set(value,name); }
/** Set the value of this IntWritable. */
public void set(int value,String name ) {
this.value = value;
this.name = name;
}
/** Return the value of this IntWritable. */
public int get() { return value; }
// This method is called for you by the framework: wherever sorting happens, compareTo is what gets invoked.
// Basic Java: the same thing happens with sorting utilities such as Collections.sort(list)
@Override
public int compareTo(IntWritable2 o) {
int thisValue = this.value;
int thatValue = o.get();
return (thisValue > thatValue ? -1 : (thisValue==thatValue ? 0 : 1));
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(value);
out.writeUTF(name);
}
@Override
public void readFields(DataInput in) throws IOException {
value = in.readInt();
name = in.readUTF();
}
@Override
public String toString() {
return name+" "+value;
}
}
Case: sort the streamer data by audience count in descending order; when audience counts are equal, sort by streaming duration in descending order
# Case data
user id  audience count  streaming duration
团团 300 1000
小黑 200 2000
哦吼 400 7000
卢本伟 100 6000
八戒 250 5000
悟空 100 4000
唐僧 100 3000
# Expected result
哦吼 400 7000
团团 300 1000
八戒 250 5000
小黑 200 2000
卢本伟 100 6000
悟空 100 4000
唐僧 100 3000
Key code:
public class PlayWritable implements WritableComparable<PlayWritable> {
private int viewer;
private int length;
/**
 * Sort by viewer in descending order; when viewer is equal, sort by length in descending order.
 * @param o
 * @return
 */
public int compareTo(PlayWritable o) {
if(this.viewer != o.viewer){
return o.viewer - this.viewer;
}else{
return o.length - this.length;
}
}
// constructor, getters/setters, serialization methods and toString omitted
...
}
Rules of thumb:
1) Sorting is always done on the Key.
2) A Mapper can be used on its own; if there is only a Mapper, set the number of reducers to 0.
3) Whether it comes from the mapper or the reducer, each output line is the Key's output followed by the Value's output, each formatted by that class's toString method. If you only want to output the key, make the value NullWritable; if you only want to output the value, make the key NullWritable.
4) If you use a partitioner, its type parameters must match the mapper's output types.
Example code:
package com.bigdata.paixu01;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
class PaiXuMapper3 extends Mapper<LongWritable, Text, IntWritable2,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String s = value.toString();
String[] split = s.split("\\s+");
int visitNum = Integer.valueOf(split[1]);
int shiChang = Integer.valueOf(split[2]);
String name = split[0];
context.write(new IntWritable2(visitNum,name,shiChang), NullWritable.get());
}
}
package com.bigdata.paixu01;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class IntWritable2 implements WritableComparable<IntWritable2> {
private int value;
private String name;
private int shiChang;
public IntWritable2() {}
public IntWritable2(int value,String name) { set(value,name); }
public IntWritable2(int value,String name,int shiChang) { set(value,name,shiChang); }
// method overloading
public void set(int value) {
this.value = value;
}
/** Set the value of this IntWritable. */
public void set(int value,String name ) {
this.value = value;
this.name = name;
}
public void set(int value,String name,int shiChang ) {
this.value = value;
this.name = name;
this.shiChang = shiChang;
}
/** Return the value of this IntWritable. */
public int get() { return value; }
// This method is called for you by the framework: wherever sorting happens, compareTo is what gets invoked.
// Basic Java: the same thing happens with sorting utilities such as Collections.sort(list)
@Override
public int compareTo(IntWritable2 o) {
int thisValue = this.value;
int thatValue = o.get();
if(thisValue != thatValue){
return thatValue - thisValue;
}else {
return o.shiChang - this.shiChang;
}
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(value);
out.writeUTF(name);
out.writeInt(shiChang);
}
@Override
public void readFields(DataInput in) throws IOException {
value = in.readInt();
name = in.readUTF();
shiChang = in.readInt();
}
@Override
public String toString() {
return name+" "+value+" "+shiChang;
}
}
Driver:
public class PaiXu01 {
public static void main(String[] args) throws Exception {
Configuration configuration = new Configuration();
// Use the local file system instead of HDFS
configuration.set("fs.defaultFS","file:///");
// Use local resources (CPU, memory, etc.); the job could also be run on YARN
configuration.set("mapreduce.framework.name","local");
Job job = Job.getInstance(configuration, "Sort01");
// specify the map side
job.setMapperClass(PaiXuMapper3.class);
// map output: (IntWritable2 key carrying count, name and duration, NullWritable)
job.setMapOutputKeyClass(IntWritable2.class);
job.setMapOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
// an absolute path would also work here
FileInputFormat.setInputPaths(job,"../WordCount/mr08/input/");
FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr08/output3"));
boolean result = job.waitForCompletion(true);
// a true result means the job succeeded, so exit normally; otherwise exit abnormally
System.exit(result?0:-1);
}
}
VII. How to pass arguments to the main method
Learning driven by questions:
Once you join a company, how do you keep learning on your own? Learn whatever technology your company is actually using; don't chase a huge new technology you don't need yet.


package com.bigdata;
public class TestMain {
/**
* args is a String array; who can assign values to it?
* @param args
*/
public static void main(String[] args) {
System.out.println("Printing arguments - start");
for (String str:args) {
System.out.println(str);
}
System.out.println("Printing arguments - end");
}
}
Question: if this code is not running inside IDEA, how do you pass the arguments?
Any Java code can be packaged into a jar. How do you run a class that sits inside a jar?
java -cp xxxx.jar <fully-qualified class name> <arguments, if any, appended at the end>
java -cp hello.jar com.bigdata.TestMain 10 20 30
(With java -jar the main class is taken from the jar's manifest, so only the arguments follow the jar name.)
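Assuming the jar was built from the class above, running that command should print something like:

Printing arguments - start
10
20
30
Printing arguments - end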