05-hadoop03之MapReduce

一、补充

1、如何实现自定义序列化

1)实现Writable 接口 2) 实现WritableComparable接口

如果自定义的数据类型,是当做Key值,因为Key值必须能排序才行,所以需要实现WritableComparable接口,当做Value值,直接实现Writable接口即可。

手机号码Key PhoneFlowWritable 是Value

在MR程序中,只有能排序的数据类型才能当做Key。 因为Key需要排序。

Map端很多地方输出的结果都是有序的,什么有序?--》 key值有序。

public interface WritableComparable<T> extends Writable, Comparable<T> {
}
package com.bigdata.day12;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @Author laoyan
 * @Description TODO
 * @Date 2022/8/2 9:39
 * @Version 1.0
 *
 *  这种数据类型,可以当做Key值,因为它具有排序的功能,当然也可以当做Value

     自定义数据类型的时候什么情况下实现 Writable?
     什么情况下实现WritableComparable?
     就看你自定义的类型当不当做Key.
 */
public class AdminWritable implements WritableComparable<AdminWritable> {

    private String name;
    private int age;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    /**
     * Orders admins by age, ascending.
     * Uses Integer.compare instead of (this.age - admin.getAge()):
     * subtraction can overflow for extreme values and silently invert
     * the ordering, which would corrupt the MapReduce sort.
     */
    @Override
    public int compareTo(AdminWritable admin) {
        return Integer.compare(this.age, admin.getAge());
    }

    /** Serializes the fields; the order here defines the wire format. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(age);
    }

    /** Deserializes the fields in the exact order write() produced them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.age = in.readInt();
    }
}

2、优化一段代码

遇到的问题:

解决方案:

就是重复利用new Text() new PhoneFlowWritable() 对象即可。

优化过之后的代码

package com.bigdata.phoneflow;

import com.bigdata.WordCountMapper;
import com.bigdata.WordCountPartitioner;
import com.bigdata.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Iterator;

/**
 * @Author laoyan
 * @Description TODO
 * @Date 2022/8/1 14:33
 * @Version 1.0
 */
class PhoneFlowMapper extends Mapper<LongWritable, Text,Text,PhoneFlowWritable> {

    // Output key/value objects are created once in setup() and refilled for
    // every record, avoiding one allocation per input line.
    Text text = null;
    PhoneFlowWritable phoneFlowWritable = null;

    /** Runs once per task, before the first map() call. */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, PhoneFlowWritable>.Context context) throws IOException, InterruptedException {
        text = new Text();
        phoneFlowWritable = new PhoneFlowWritable();
    }

    /**
     * Turns one log line into (phone number, PhoneFlowWritable).
     * Sample line: 1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY ... 2481 24681 200
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, PhoneFlowWritable>.Context context) throws IOException, InterruptedException {

        // Split on one-or-more whitespace characters.
        String[] fields = value.toString().split("\\s+");
        String phone = fields[1];
        // Upload and download counts sit third- and second-from-last on each line.
        int upFlow = Integer.parseInt(fields[fields.length - 3]);
        int downFlow = Integer.parseInt(fields[fields.length - 2]);

        text.set(phone);
        phoneFlowWritable.setPhone(phone);
        phoneFlowWritable.setUpFlow(upFlow);
        phoneFlowWritable.setDownFlow(downFlow);
        context.write(text, phoneFlowWritable);
    }
}
//   手机号 --> 流量数据PhoneFlowWritable      手机号码 --> 统计的结果
//   手机号 --> 流量数据PhoneFlowWritable      手机号码 --> 统计的结果
class PhoneFlowReducer extends Reducer<Text,PhoneFlowWritable,Text,Text> {

    // Reused output value, allocated once per task in setup().
    Text value = null;

    @Override
    protected void setup(Reducer<Text, PhoneFlowWritable, Text, Text>.Context context) throws IOException, InterruptedException {
        value = new Text();
    }

    /**
     * Sums the upload/download flow of all records for one phone number
     * and writes a human-readable summary line.
     */
    @Override
    protected void reduce(Text key, Iterable<PhoneFlowWritable> values, Reducer<Text, PhoneFlowWritable, Text, Text>.Context context) throws IOException, InterruptedException {

        int upFlowNum = 0;
        int downFlowNum = 0;
        // NOTE(review): Hadoop typically reuses the value object while
        // iterating, so the primitive fields are read out immediately.
        for (PhoneFlowWritable flow : values) {
            upFlowNum += flow.getUpFlow();
            downFlowNum += flow.getDownFlow();
        }
        // StringBuilder instead of StringBuffer: this buffer is confined to
        // one thread, so StringBuffer's synchronization is pure overhead.
        StringBuilder sb = new StringBuilder();
        sb.append("手机号"+key+"流量统计:");
        sb.append("上行流量是:"+upFlowNum);
        sb.append("下行流量是:"+downFlowNum);
        sb.append("总的流量是:"+(upFlowNum + downFlowNum));

        value.set(sb.toString());
        context.write(key, value);
    }
}



public class PhoneFlowDriver {

    public static void main(String[] args) throws Exception{

        // Run against the local file system with local CPU/memory — no HDFS, no YARN.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf, "手机流量统计");

        // Map side: phone number -> PhoneFlowWritable.
        job.setMapperClass(PhoneFlowMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(PhoneFlowWritable.class);

        // Reduce side: phone number -> summary text.
        job.setReducerClass(PhoneFlowReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input data and output directory; the output dir must NOT exist yet,
        // otherwise Hadoop aborts the job.
        FileInputFormat.setInputPaths(job, new Path("mr01/phoneFlow/input"));
        FileOutputFormat.setOutputPath(job, new Path("mr01/phoneFlow/output"));

        // Block until the job finishes; true = print progress logs.
        boolean success = job.waitForCompletion(true);
        // Exit 0 only on success so shell scripts can detect failure.
        System.exit(success ? 0 : -1);
    }
}

waitForCompletion方法的boolean参数verbose为true表明要打印运行进度,为false就只是等待job运行结束,不打印运行日志。

二、统计最高温度

0188010010999992000010100004+70930-008670FM-12+0009ENJA V0202101N002110021019N0025001N1+00101+00031098181ADDAA106004191AY181061AY251061GF108991081061002501999999MA1999999098061MD1510071+9999MW1501REMSYN088AAXX  01004 01001 11325 82104 10010 20003 39806 49818 55007 60041 75085 886// 333 91119;
    

需求:求每一年的最高温度
数据格式的说明:
1、每一行的 【15,18】年份
2、87位指的是 温度的正负
3、【88,91】 指的是温度,如果温度是9999表示无效
4、92位是校验位,如果是0,1,4,5,9 这几个值表示温度有效。

思路

Map 端:   Key值是年份   温度
Reduce端: 到reduce长什么样子呢?  key  [3243,3434,34343,34343]
   根据集合中的数据,求最大值,写出到磁盘即可。
package com.bigdata.temp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

class TempMaxMapper extends Mapper<LongWritable, Text,Text, IntWritable>{

    // Reused output objects, created once per task in setup().
    Text year = null;
    IntWritable temp = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        year = new Text();
        temp = new IntWritable();
    }

    /**
     * Extracts (year, temperature) from one fixed-width weather record.
     * Records whose temperature is 9999 (missing) or whose quality code is
     * not one of 0,1,4,5,9 are dropped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // Chars [15,19): the four-digit year.
        String yearStr = line.substring(15, 19);
        // Chars [87,92): sign plus four temperature digits.
        int temperature = Integer.parseInt(line.substring(87, 92));
        // Char 92: the quality/validation code.
        String quality = line.substring(92, 93);

        if (temperature == 9999 || quality.matches("[^01459]")) {
            return; // invalid reading — skip this record
        }
        year.set(yearStr);
        temp.set(temperature);
        context.write(year, temp);
    }
}

class TempMaxReducer extends Reducer<Text, IntWritable,Text, Text>{

    // Reused output value, allocated once per task.
    Text text = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        text = new Text();
    }

    /** Finds the maximum temperature among all readings of one year. */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Start below any possible reading so the first value always wins.
        int maxTemp = Integer.MIN_VALUE;
        for (IntWritable reading : values) {
            if (reading.get() > maxTemp) {
                maxTemp = reading.get();
            }
        }

        text.set("这一年的最高温是" + maxTemp);
        context.write(key, text);
    }
}


public class TempMaxDriver {

    public static void main(String[] args) throws Exception {
        // Local file system and local runner — nothing touches HDFS or YARN.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf, "统计最高温");

        // Map side emits (year, temperature).
        job.setMapperClass(TempMaxMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce side emits (year, summary text).
        job.setReducerClass(TempMaxReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Relative paths are fine here; absolute paths work equally well.
        FileInputFormat.setInputPaths(job, "../WordCount/mr03/input/");
        FileOutputFormat.setOutputPath(job, new Path("../WordCount/mr03/output"));

        // true = stream progress logs to the console while waiting.
        boolean result = job.waitForCompletion(true);

        // 0 on success, non-zero otherwise, for calling scripts.
        System.exit(result ? 0 : -1);
    }
}

三、求TopN案例

求每个人,他打分最高的五部电影。

1、有一种数据的结构,叫JSON (JavaScript)

JSON: JavaScript Object Notation(JavaScript 对象标记法)

H5: 编写页面的

Java:编写后端代码的

H5页面和Java之间是要传递数据的,传递的数据可以有很多种形式,xml和JSON。

2、如何表示

JSON 语法衍生于 JavaScript 对象标记法语法:

数据在名称/值对中   "name":"老闫"
数据由逗号分隔      "name":"老闫","age":30
花括号容纳对象     {"name":"老闫","age":30}
方括号容纳数组     [{"name":"老闫","age":30},{"name":"老闫2","age":10}]

image.png

rating.json

{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
{"movie":"2355","rate":"5","timeStamp":"978824291","uid":"1"}
{"movie":"1197","rate":"3","timeStamp":"978302268","uid":"1"}
{"movie":"1287","rate":"5","timeStamp":"978302039","uid":"1"}
{"movie":"2804","rate":"5","timeStamp":"978300719","uid":"1"}
{"movie":"594","rate":"4","timeStamp":"978302268","uid":"1"}

3、导入工具包
因为我们想将json数据转换为Java 对象 -- Jackson

  <!--导入jackson工具包-->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.9.5</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
            <version>2.9.5</version>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.9.5</version>
        </dependency>

maven的一个小用法,先声明版本号,再使用版本号:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  
  <groupId>com.bigdata</groupId>
  <artifactId>HadoopDay11</artifactId>
  <version>1.0-SNAPSHOT</version>
  <!--设置打包方式,为jar
       此处可以设置三个值:
       jar   纯java项目,打包jar
       war   web项目,打包war
       pom   maven的父子工程,父工程就是pom
    -->
  <packaging>jar</packaging>
  
  <properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <!--声明一个变量-->
    <jackson.version>2.9.5</jackson.version>
    <hadoop.version>3.3.1</hadoop.version>
  </properties>
  
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>${hadoop.version}</version>
    </dependency>
    
    <dependency>
      <groupId>ch.qos.logback</groupId>
      <artifactId>logback-classic</artifactId>
      <version>1.0.6</version>
    </dependency>
    
    <!--导入jackson工具包-->
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-core</artifactId>
      <version>${jackson.version}</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-annotations</artifactId>
      <version>${jackson.version}</version>
    </dependency>
    <dependency>
      <groupId>com.fasterxml.jackson.core</groupId>
      <artifactId>jackson-databind</artifactId>
      <version>${jackson.version}</version>
    </dependency>
  </dependencies>
  
</project>

4、编写一个实体,用于映射json数据
 

package com.bigdata.day12.topn;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
* @Author laoyan
* @Description TODO
* @Date 2022/8/2 14:26
* @Version 1.0
*/
public class RatingWritable implements Writable {

    private String movie;
    private int rate;
    private String timeStamp;
    private String uid;

    /** No-arg constructor required by Hadoop's serialization machinery. */
    public RatingWritable() {
    }

    public RatingWritable(String movie, int rate, String timeStamp, String uid) {
        this.movie = movie;
        this.rate = rate;
        this.timeStamp = timeStamp;
        this.uid = uid;
    }

    public String getMovie() {
        return movie;
    }

    public void setMovie(String movie) {
        this.movie = movie;
    }

    public int getRate() {
        return rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public String getTimeStamp() {
        return timeStamp;
    }

    public void setTimeStamp(String timeStamp) {
        this.timeStamp = timeStamp;
    }

    public String getUid() {
        return uid;
    }

    public void setUid(String uid) {
        this.uid = uid;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("RatingWritable{");
        sb.append("movie='").append(movie).append('\'');
        sb.append(", rate=").append(rate);
        sb.append(", timeStamp='").append(timeStamp).append('\'');
        sb.append(", uid='").append(uid).append('\'');
        sb.append('}');
        return sb.toString();
    }

    /** The field order here defines the wire format; readFields must mirror it. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(movie);
        out.writeInt(rate);
        out.writeUTF(timeStamp);
        out.writeUTF(uid);
    }

    /** Reads the fields back in exactly the order write() produced them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        movie = in.readUTF();
        rate = in.readInt();
        timeStamp = in.readUTF();
        uid = in.readUTF();
    }
}

修改成如下代码即可:

image.png

完整代码展示:

package com.bigdata.topN;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.codehaus.jackson.map.ObjectMapper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

class TopNMapper extends Mapper<LongWritable, Text, IntWritable,MovieWritable>{

    // Created ONCE per task and reused for every record. The original built a
    // new ObjectMapper (expensive) and a new IntWritable for each input line,
    // contradicting the object-reuse lesson applied elsewhere in this project.
    private final ObjectMapper objectMapper = new ObjectMapper();
    private final IntWritable outKey = new IntWritable();

    /**
     * Parses one JSON rating line (e.g. {"movie":"1193","rate":"5",...})
     * into a MovieWritable and emits (uid, movie).
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String jsonStr = value.toString();
        MovieWritable movie = objectMapper.readValue(jsonStr, MovieWritable.class);
        System.out.println("mapper端数据:"+movie);
        outKey.set(movie.getUid());
        context.write(outKey, movie);
    }
}

class TopNReducer extends Reducer<IntWritable,MovieWritable,Text, NullWritable>{

    // Reused output key, allocated once per task instead of per reduce() call.
    private final Text outKey = new Text();

    /**
     * For one user (key = uid), sorts all rated movies by rating descending
     * and writes the top five as a single text block.
     */
    @Override
    protected void reduce(IntWritable key, Iterable<MovieWritable> values, Context context) throws IOException, InterruptedException {
        // Hadoop reuses the same MovieWritable instance while iterating, so
        // each element must be deep-copied before being stored in the list
        // (adding the iterated object directly would duplicate one record).
        List<MovieWritable> list = new ArrayList<MovieWritable>();
        for (MovieWritable movie : values) {
            MovieWritable copy = new MovieWritable();
            copy.setRate(movie.getRate());
            copy.setMovie(movie.getMovie());
            copy.setTimeStamp(movie.getTimeStamp());
            copy.setUid(movie.getUid());
            list.add(copy);
        }
        System.out.println(list);

        // Highest rating first. Integer.compare avoids the overflow risk of
        // subtraction-based comparators.
        Collections.sort(list, new Comparator<MovieWritable>() {
            @Override
            public int compare(MovieWritable m1, MovieWritable m2) {
                return Integer.compare(m2.getRate(), m1.getRate());
            }
        });

        System.out.println(list);

        // Take at most five entries. StringBuilder replaces StringBuffer:
        // this buffer is confined to one thread, so synchronization is waste.
        int length = Math.min(5, list.size());
        StringBuilder sb = new StringBuilder(key.get() + "最喜欢的五部的电影是:\n");
        for (int i = 0; i < length; i++) {
            MovieWritable movie = list.get(i);
            sb.append(movie.getMovie()+",分数为:"+movie.getRate()+"\n");
        }
        outKey.set(sb.toString());
        context.write(outKey, NullWritable.get());
    }
}
public class TopNDriver {

    public static void main(String[] args) throws Exception{

        // Run locally: local file system plus local CPU/memory (no YARN).
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        conf.set("mapreduce.framework.name", "local");

        Job job = Job.getInstance(conf, "电影排名");

        // Map side emits (uid, MovieWritable).
        job.setMapperClass(TopNMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(MovieWritable.class);

        // Reduce side emits one text block per user, with no value payload.
        job.setReducerClass(TopNReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Relative paths; absolute paths work just as well.
        FileInputFormat.setInputPaths(job, "../WordCount/mr05/input/");
        FileOutputFormat.setOutputPath(job, new Path("../WordCount/mr05/output"));

        boolean ok = job.waitForCompletion(true);

        // Exit 0 only when the job succeeded, so callers can detect failure.
        System.exit(ok ? 0 : -1);
    }
}

 结果展示:

image.png

 

四、Yarn

1、概念

Hadoop三大件:HDFS、MapReduce、Yarn

Yarn其实就是一个类似于操作系统一样的东西。

Yarn是MapReduce运行的环境,Yarn可以管理程序运行所需要的东西(内存,CPU,带宽等资源)

image.png

Yarn诞生于Hadoop,但是现在已经脱离了Hadoop,变成了一个独立的软件,系统。 

2、Yarn的组成部分

我们的Yarn,其实有两大部分组成: 

        必须清楚

1、ResourceManager (BOSS):  1个
 他用来管理整个的Yarn平台,里面有一个资源调度器。
2、NodeManager (各个机器上的主管)  多个
   听从我们的ResouceManager的调遣。是每一台电脑的管家。
3、Container(容器)
   每一个NodeManager中,有一个或者多个这样的容器。是包含了一些资源的封装(CPU,内存,硬盘等),类似于我们熟悉的虚拟机。
4、AppMaster (项目经理)
   每一个MapReduce任务启动提交后,会有一个对应的AppMaster。这个主要作用是负责整个job任务的运行。
3、Yarn如何进行配置和搭建
/opt/installs/hadoop/etc/hadoop 文件夹下:

mapred-site.xml

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>

指定mapreduce运行平台为yarn

yarn-site.xml

<!--指定resourceManager启动的主机为第一台服务器-->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>bigdata01</value>
    </property>


    <!--配置yarn的shuffle服务-->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value> 
    </property>

检查hadoop-env.sh 中是否配置了权限:

export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root

继续配置:为了防止报AppMaster的错误,需要如下配置

yarn-site.xml

<property>
        <name>yarn.application.classpath</name>
        <value>/opt/installs/hadoop/etc/hadoop:/opt/installs/hadoop/share/hadoop/common/lib/*:/opt/installs/hadoop/share/hadoop/common/*:/opt/installs/hadoop/share/hadoop/hdfs:/opt/installs/hadoop/share/hadoop/hdfs/lib/*:/opt/installs/hadoop/share/hadoop/hdfs/*:/opt/installs/hadoop/share/hadoop/mapreduce/*:/opt/installs/hadoop/share/hadoop/yarn:/opt/installs/hadoop/share/hadoop/yarn/lib/*:/opt/installs/hadoop/share/hadoop/yarn/*</value> 
    </property>

获取classpath的值:

分发mapred-site.xml & yarn-site.xml 到另外两台电脑上。
 

cd /opt/installs/hadoop/etc/hadoop/

xsync.sh mapred-site.xml yarn-site.xml

启动和停止yarn平台:
 

启动: start-yarn.sh
停止: stop-yarn.sh

也可以使用web访问一下:

跟ResourceManager电脑的IP保持一致

http://192.168.233.128:8088

image.png

4、关于启动和停止的命令 

5、使用yarn平台进行wordCount计算

将一个wc.txt 上传至hdfs平台,然后通过yarn平台进行计算

数据

hadoop spark hello hadoop
spark hello flink world
scala python python scala

运行hadoop自带的wordCount:

hadoop jar /opt/installs/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.3.1.jar wordcount /home/wc.txt /home/output

我们还可以在Yarn平台上查看运行的情况:

image.png

以上这个案例使用的就是yarn运行,数据必须在hdfs上,yarn也必须启动。代码虽然在本地,但是也会上传到hdfs的。

 

五、MapReduce任务有三种运行开发模式

1、local模式

数据在本地,代码也在本地,使用本机的电脑的资源运行我们的MR

输入和输出路径指的都是本地路径,运行时耗费的资源也是本地资源。

2、local模式2

数据在hdfs上,代码在本地,使用本机的电脑的资源运行我们的MR

  System.setProperty("HADOOP_USER_NAME","root");    
        Configuration configuration = new Configuration();
        
        configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
        // 使用本地的资源(CPU,内存等), 也可以使用yarn平台跑任务
        configuration.set("mapreduce.framework.name","local");

这个里面的输入和输出路径指的是hdfs上的路径。

3、Yarn模式

数据在hdfs上,代码在yarn上。
 

  System.setProperty("HADOOP_USER_NAME","root");    
        Configuration configuration = new Configuration();
        
        configuration.set("fs.defaultFS","hdfs://192.168.32.128:9820");
        
        configuration.set("mapreduce.framework.name","yarn");

        // 跨平台任务提交打开
        configuration.set("mapreduce.app-submission.cross-platform", "true");	

案例:使用Yarn运行自己编写的WordCount:

修改代码如下:

package com.bigdata.day12.workcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @Author laoyan
 * @Description TODO
 * @Date 2022/8/1 10:02
 * @Version 1.0
 */
public class WordCountDriver2 {
    public static void main(String[] args) throws Exception {

        // Submit as root so HDFS permission checks pass.
        System.setProperty("HADOOP_USER_NAME","root");

        Configuration conf = new Configuration();
        // Data lives on HDFS; the job itself is scheduled by YARN.
        conf.set("fs.defaultFS","hdfs://192.168.32.128:9820");
        conf.set("mapreduce.framework.name","yarn");
        // Needed when the client runs on Windows but the tasks run on Linux.
        conf.set("mapreduce.app-submission.cross-platform", "true");

        Job job = Job.getInstance(conf, "老闫在yarn上运行workCount");

        // Required for YARN: tells Hadoop which jar to ship to the cluster.
        job.setJarByClass(WordCountDriver2.class);

        // Map side.
        job.setMapperClass(WordCountMapper2.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Custom partitioner; each reduce task produces one result file.
        job.setPartitionerClass(WordCountPartitioner2.class);

        // Reduce side.
        job.setReducerClass(WordCountReducer2.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input path and (not-yet-existing) output path come from the CLI,
        // e.g. hadoop jar WC.jar ...WordCountDriver2 /input /oottpp2
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Block until completion; true = print progress logs.
        boolean done = job.waitForCompletion(true);
        System.exit(done ? 0 : -1);
    }
}

将程序打成jar包:

将我们打好的Jar包上传至linux服务器,运行该jar包:

hadoop jar WC.jar com.bigdata.day12.workcount.WordCountDriver2 /input /oottpp2

jar包的名字最好短一点
com.bigdata.day12.workcount.WordCountDriver2   这个是Main方法所在的类的全路径
/input   hdfs上文件的路径
/oottpp2  hdfs上数据的统计的输出路径

如果出现web端查看错误: 

image.png

 

记得hdfs-site.xml中添加如下配置,重启集群:

<property>
    <name>dfs.webhdfs.enabled</name>
    <value>true</value>
</property>
接着在本地windows系统的hosts文件中,添加映射
配置浏览器所在系统的 hosts 文件
windows:
在 C:\Windows\System32\drivers\etc\hosts 末尾增加内容(Hadoop集群中各节点及主机名的映射)

 

六、练习

1、数据清洗

# 数据清洗概念
  通常情况下,大数据平台获得的原始数据文件中,存在大量无效数据和缺失数据,需要在第一时间,对数据进行清洗,获得符合后续处理需求的数据内容和格式
  
# 需求
  对手机流量原始数据,将其中的手机号为"null"和不完整的数据去除

数据格式:

# 源数据
id             手机号        手机mac                 ip地址                  上传    下载  HTTP状态码
1363157985066  13726230503  00-FD-07-A4-72-B8:CMCC  120.196.100.82  24  27  2481  24681  200
1363157995052  13826544101  5C-0E-8B-C7-F1-E0:CMCC  120.197.40.4  4  0  264  0  200
1363157991076  13926435656  20-10-7A-28-CC-0A:CMCC  120.196.100.99  2  4  132  1512  200
1363154400022  13926251106  5C-0E-8B-8B-B1-50:CMCC  120.197.40.4  4  0  240  0  200
1363157985066  13726230503  00-FD-07-A4-72-B8:CMCC  120.196.100.82  24  27  2481  24681  200
1363157995052  13826544101  5C-0E-8B-C7-F1-E0:CMCC  120.197.40.4  4  0  264  0  200
1363157991076  13926435656  20-10-7A-28-CC-0A:CMCC  120.196.100.99  2  4  132  1512  200
1363154400022  13926251106  5C-0E-8B-8B-B1-50:CMCC  120.197.40.4  4  0  240  0  200
1363157995052  13826544109  5C-0E-8B-C7-F1-E0:CMCC  120.197.40.4  4  0
1363157995052  null  5C-0E-8B-C7-F1-E0:CMCC  120.197.40.4  4  0  240  0  200
1363157991076  13926435659  20-10-7A-28-CC-0A:CMCC  120.196.100.99  2  4  null  null  null

# 期望结果【删除其中手机号不符合要求,上传流量缺失和下载流量缺失的数据,并仅保留手机号 上传流量 下载流量。】
13726230503  2481  24681
13826544101  264  0
13926435656  132  1512
13926251106  240  0
13726230503  2481  24681
13826544101  264  0
13926435656  132  1512
13926251106  240  0 

编码提示:

# 重点:
  MapReduce整个流程中可以取消reduce阶段的程序执行,map输出的会直接作为结果输出到HDFS文件中。
  
# 编码实现
1. 删除job中有关reducer的相关设置:reducer类和输出的key value类型。
2. 手动设置reducetask的个数为0
   job.setNumReduceTasks(0);//取消reducer
package com.bigdata;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

class CleanLogMapper extends Mapper<LongWritable, Text,Text,Text>{

    // Reused output objects, created once per task in setup().
    Text text = null;
    Text text2 = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        text = new Text();
        text2 = new Text();
    }

    /**
     * Data-cleaning map: keeps only complete 9-field records whose phone
     * number and up/down flow values are not the literal string "null";
     * emits (phone, "upFlow downFlow").
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] split = line.split("\\s+");
        // Validate the field count BEFORE indexing: the original accessed
        // split[split.length - 3] first, which throws
        // ArrayIndexOutOfBoundsException on lines with fewer than 3 fields
        // (e.g. a blank or truncated line).
        if (split.length != 9) {
            return;
        }
        String phone = split[1];
        String upFlow = split[split.length-3];
        String downFlow = split[split.length-2];
        if (phone.equals("null") || upFlow.equals("null") || downFlow.equals("null")) {
            return;
        }
        text.set(phone);
        text2.set(upFlow+" "+downFlow);
        context.write(text, text2);
    }
}
public class CleanLogDriver {

    public static void main(String[] args) throws  Exception{

        // Local file system + local runner: no HDFS or YARN involved.
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS","file:///");
        conf.set("mapreduce.framework.name","local");

        Job job = Job.getInstance(conf, "清理数据");

        // Map-only job: without this, Hadoop still runs a default reduce phase.
        job.setNumReduceTasks(0);

        job.setMapperClass(CleanLogMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Only strictly required when running on YARN, but harmless locally.
        job.setJarByClass(CleanLogDriver.class);

        // addInputPath appends a path (vs. setInputPaths, which replaces them).
        FileInputFormat.addInputPath(job, new Path("../WordCount/mr06/input"));
        FileOutputFormat.setOutputPath(job, new Path("../WordCount/mr06/output"));

        // true = periodically print job progress while waiting.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : -1);
    }
}

2、自定义分区案例

案例之·学生成绩统计分析

# 将学生成绩,按照各科成绩降序排序,各个科目成绩单独输出。

数据如下:

# 自定义partition
将下面数据分区处理:
人名  科目 成绩
张三  语文  10
李四  数学  30
王五  语文  20
赵6  英语  40
张三  数学  50
李四  语文  10
张三  英语  70
李四  英语  80
王五  英语  45
王五  数学  10
赵6  数学  10
赵6  语文  100
package com.bigdata.score;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

class ScorePartitonerMapper extends Mapper<LongWritable, Text, Text,Student>{

    // Reused output objects: the subject is the key, a Student the value.
    Text text = null;
    Student student = null;

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        text = new Text();
        student = new Student();
    }

    /**
     * Parses one "name subject score" line and emits (subject, Student).
     * The header line is skipped.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // The map key is the byte offset of the line; offset 0 is the header
        // row. (Skipping any other line would require knowing the byte
        // lengths of the lines before it.)
        if (key.get() == 0) {
            return;
        }
        String[] fields = value.toString().split("\\s+");
        String name = fields[0];
        String subject = fields[1];
        int score = Integer.valueOf(fields[2]);

        text.set(subject);
        student.setName(name);
        student.setScore(score);
        student.setSubject(subject);

        context.write(text, student);
    }
}

class ScorePartitoner extends Partitioner<Text,Student>{

    /**
     * Routes records by subject: 语文 -> 0, 数学 -> 1, 英语 -> 2, anything
     * else -> 3. The job must therefore configure at least four reduce tasks.
     */
    @Override
    public int getPartition(Text subjectText, Student student, int numPartitions) {
        String subject = subjectText.toString();
        if ("语文".equals(subject)) {
            return 0;
        }
        if ("数学".equals(subject)) {
            return 1;
        }
        return "英语".equals(subject) ? 2 : 3;
    }
}

/**
 * One student's exam record. Implements WritableComparable so it can
 * serve as a map output key as well as a value; natural order is by
 * score, descending.
 */
class Student implements WritableComparable<Student>{

    private String name;
    private int score;
    private String subject;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getScore() {
        return score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    public String getSubject() {
        return subject;
    }

    public void setSubject(String subject) {
        this.subject = subject;
    }

    @Override
    public int compareTo(Student o) {
        // Descending by score. Integer.compare avoids the int overflow
        // that the subtraction idiom (o.getScore() - this.score) can
        // suffer when the operands have large opposite signs.
        return Integer.compare(o.getScore(), this.score);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Field order here must match readFields exactly.
        out.writeUTF(name);
        out.writeInt(score);
        out.writeUTF(subject);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // Read back in the same order the fields were written.
        this.name = in.readUTF();
        this.score = in.readInt();
        this.subject = in.readUTF();
    }

    @Override
    public String toString() {
        return name + " " +subject + " " + score ;
    }
}

/**
 * Buffers all students of one subject, sorts them (Student.compareTo
 * orders by score, descending) and writes one text line per student.
 */
class ScoreReduer extends Reducer<Text,Student,Text, NullWritable>{

    // Reused output key; safe because context.write serializes the key
    // immediately, so the next set() cannot corrupt earlier output.
    private final Text outKey = new Text();

    @Override
    protected void reduce(Text key, Iterable<Student> values, Context context) throws IOException, InterruptedException {

        // Hadoop reuses the SAME Student instance while iterating over
        // `values`, so each record must be deep-copied before buffering.
        List<Student> list = new ArrayList<>();
        for (Student stu:values) {
            Student copy = new Student();
            copy.setSubject(stu.getSubject());
            copy.setScore(stu.getScore());
            copy.setName(stu.getName());

            list.add(copy);
        }

        Collections.sort(list);

        for (Student s:list) {
            outKey.set(s.toString());
            context.write(outKey,NullWritable.get());
        }

    }
}


/**
 * Driver for the score-partitioning job: groups by subject, routes
 * subjects to fixed partitions, and sorts each subject's students by
 * score (descending) in the reducer.
 */
public class ScorePartitionerDriver {

    public static void main(String[] args) throws  Exception{

        Configuration conf = new Configuration();
        // Run against the local file system and the local MR framework.
        conf.set("fs.defaultFS","file:///");
        conf.set("mapreduce.framework.name","local");
        Job job = Job.getInstance(conf, "成绩分区");
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Student.class);

        job.setMapperClass(ScorePartitonerMapper.class);

        job.setPartitionerClass(ScorePartitoner.class);

        job.setReducerClass(ScoreReduer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // ScorePartitoner can return partition numbers 0..3 (its default
        // branch returns 3), so the job needs 4 reduce tasks. With only 3,
        // any subject other than 语文/数学/英语 would fail with an
        // "Illegal partition" error.
        job.setNumReduceTasks(4);

        // Usually optional; required when running on YARN — identifies the
        // driver class so the correct jar is shipped.
        job.setJarByClass(ScorePartitionerDriver.class);

        // Relative paths are fine when running locally.
        FileInputFormat.addInputPath(job,new Path("../WordCount/mr07/input"));
        FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr07/output"));

        // Blocks until completion, periodically printing job progress.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : -1);

    }
}

3、排序

默认排序:

案例之·斗鱼主播日志数据按照观众人数升序排序

# 案例
用户id  观众人数
团团  300
小黑  200
哦吼  400
卢本伟  100
八戒  250
悟空  100
唐僧  100

# 期望结果
卢本伟  100
悟空  100
唐僧  100
小黑  200
八戒  250
团团  300
哦吼  400

提示:

● 默认排序规则
1. 默认排序调用mapper输出key的compareTo方法比较大小,决定排序规则。 
2. 默认升序。
所以以上这个题目,谁是Key,谁是Value呢?

参考答案:

package com.bigdata.paixu01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{

    // Output key/value reused across map() calls instead of allocating
    // fresh objects for every input line (same optimization as the
    // PhoneFlow example: Writables are serialized on write, so reuse is safe).
    private final IntWritable outKey = new IntWritable();
    private final Text outValue = new Text();

    /**
     * Input line: "name viewers". Emits (viewers, name) so that records
     * are sorted by viewer count via IntWritable's natural ascending order.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] split = value.toString().split("\\s+");
        outKey.set(Integer.parseInt(split[1]));
        outValue.set(split[0]);
        context.write(outKey, outValue);
    }
}
/** Driver: ascending sort of anchors by viewer count (map-side key sort). */
public class PaiXu01 {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        // Work against the local file system instead of HDFS.
        conf.set("fs.defaultFS","file:///");
        // Use local resources (CPU/memory) rather than running on YARN.
        conf.set("mapreduce.framework.name","local");
        Job job = Job.getInstance(conf, "排序01");

        // Mapper emits (viewer count, name); sorting happens on the key.
        job.setMapperClass(PaiXuMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Relative paths work here; absolute paths are also fine.
        FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
        FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output"));

        // true -> job succeeded: exit normally, otherwise non-zero code.
        boolean ok = job.waitForCompletion(true);
        System.exit(ok ? 0 : -1);
    }
}

案例之·斗鱼主播日志数据按照观众人数降序排序?

# 自定义排序
# 案例
团团  300
小黑  200
哦吼  400
卢本伟  100
八戒  250
悟空  100
唐僧  100


# 期望
哦吼  400
团团  300
八戒  250
小黑  200
卢本伟  100
悟空  100
唐僧  100

关键代码提示:

# 1. 需要自定义Mapper输出的key的类型,实现WritableComparable接口
# 2. 实现compareTo方法。
# 3. 补齐write和readField的序列化相关方法 
package com.bigdata.paixu01;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * An IntWritable variant whose natural order is DESCENDING, so the
 * shuffle sorts viewer counts from largest to smallest.
 */
public class IntWritable2 implements WritableComparable<IntWritable2> {

    private int value;

    public IntWritable2() {}

    public IntWritable2(int value) { set(value); }

    /** Set the value of this IntWritable. */
    public void set(int value) { this.value = value; }

    /** Return the value of this IntWritable. */
    public int get() { return value; }

    // Called by the framework wherever sorting happens (analogous to
    // Collections.sort(list) in plain Java).
    @Override
    public int compareTo(IntWritable2 o) {
        // Reversed argument order yields descending order; returns the
        // same -1/0/1 values as the explicit ternary form.
        return Integer.compare(o.get(), this.value);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(value);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        value = in.readInt();
    }

    @Override
    public String toString() {
        return String.valueOf(value);
    }
}

测试代码:
package com.bigdata.paixu01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{

    /** Input "name viewers" -> emits (viewers, name); keys sort ascending by default. */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] parts = value.toString().split("\\s+");
        String anchor = parts[0];
        int viewers = Integer.parseInt(parts[1]);
        context.write(new IntWritable(viewers), new Text(anchor));
    }
}

class PaiXuMapper2 extends Mapper<LongWritable, Text, IntWritable2,Text>{

    /** Same as PaiXuMapper, but the key is IntWritable2 whose compareTo sorts descending. */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] parts = value.toString().split("\\s+");
        String anchor = parts[0];
        int viewers = Integer.parseInt(parts[1]);
        context.write(new IntWritable2(viewers), new Text(anchor));
    }
}
/** Driver: descending sort via the custom IntWritable2 key type. */
public class PaiXu01 {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        // Work against the local file system instead of HDFS.
        conf.set("fs.defaultFS","file:///");
        // Use local resources (CPU/memory) rather than running on YARN.
        conf.set("mapreduce.framework.name","local");
        Job job = Job.getInstance(conf, "排序01");

        // Mapper emits (IntWritable2 viewer count, name).
        job.setMapperClass(PaiXuMapper2.class);
        job.setMapOutputKeyClass(IntWritable2.class);
        job.setMapOutputValueClass(Text.class);

        // Relative paths work here; absolute paths are also fine.
        FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
        FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output2"));

        // true -> job succeeded: exit normally, otherwise non-zero code.
        boolean ok = job.waitForCompletion(true);
        System.exit(ok ? 0 : -1);
    }
}
/**
 * Key type ordering anchors by watcher count, descending.
 * (Illustrative fragment — the remaining members are omitted below.)
 */
public class WatcherWritable implements WritableComparable<WatcherWritable> {
    private int watcher;
    /**
     * Called during the sort phase.
     */
    public int compareTo(WatcherWritable o) {
        // Descending order. Integer.compare avoids the int-overflow risk
        // of the subtraction idiom (o.watcher - this.watcher).
        return Integer.compare(o.watcher, this.watcher);
        // Equivalent explicit form:
        /*
        if(this.watcher>o.watcher){
            return -1;
        }else if (this.watcher == o.watcher){
            return 0;
        }else{
            return 1;
        }*/
    }
    ...
    // Constructors, getters/setters, serialization methods and toString omitted.
}

优化一下:

我们的答案是数字在前,名字在后,跟人家的要求稍微不一样。
思路: 文件的输出格式是由  Key  和Value决定的。
我们可以将key输出的内容多一些,value就不输出了,不输出使用NullWritable代替。
package com.bigdata.paixu01;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

class PaiXuMapper extends Mapper<LongWritable, Text, IntWritable,Text>{

    /** Input "name viewers" -> emits (viewers, name); ascending key sort. */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\s+");
        int viewers = Integer.parseInt(fields[1]);
        context.write(new IntWritable(viewers), new Text(fields[0]));
    }
}

class PaiXuMapper2 extends Mapper<LongWritable, Text, IntWritable2,NullWritable>{

    /** Packs both fields into the key; NullWritable value means only the key is written out. */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\s+");
        String anchor = fields[0];
        int viewers = Integer.parseInt(fields[1]);
        context.write(new IntWritable2(viewers, anchor), NullWritable.get());
    }
}
/** Driver: descending sort with "name viewers" output format (key-only output). */
public class PaiXu01 {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        // Work against the local file system instead of HDFS.
        conf.set("fs.defaultFS","file:///");
        // Use local resources (CPU/memory) rather than running on YARN.
        conf.set("mapreduce.framework.name","local");
        Job job = Job.getInstance(conf, "排序01");

        // Mapper packs everything into the key; value is NullWritable.
        job.setMapperClass(PaiXuMapper2.class);
        job.setMapOutputKeyClass(IntWritable2.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Relative paths work here; absolute paths are also fine.
        FileInputFormat.setInputPaths(job,"../WordCount/mr04/input/");
        FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr04/output2"));

        // true -> job succeeded: exit normally, otherwise non-zero code.
        boolean ok = job.waitForCompletion(true);
        System.exit(ok ? 0 : -1);
    }
}


package com.bigdata.paixu01;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite key carrying the viewer count plus the anchor's name, so
 * the output line can be "name value" while the sort (descending on
 * value) still happens on the key.
 */
public class IntWritable2 implements WritableComparable<IntWritable2> {

    private int value;
    private String name;

    public IntWritable2() {}

    public IntWritable2(int value,String name) { set(value,name); }

    /** Set the value of this IntWritable. */
    public void set(int value,String name ) {
        this.value = value;
        this.name = name;
    }

    /** Return the value of this IntWritable. */
    public int get() { return value; }

    // Called by the framework wherever sorting happens (analogous to
    // Collections.sort(list) in plain Java).
    @Override
    public int compareTo(IntWritable2 o) {
        // Reversed argument order yields descending order; returns the
        // same -1/0/1 values as the explicit ternary form.
        return Integer.compare(o.get(), this.value);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Field order here must match readFields exactly.
        out.writeInt(value);
        out.writeUTF(name);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        value = in.readInt();
        name = in.readUTF();
    }

    @Override
    public String toString() {
        return name+" "+value;
    }
}

案例之·主播数据按照观众人数降序排序,如果观众人数相同,按照直播时长降序

# 案例数据
用户id  观众人数  直播时长
团团  300  1000
小黑  200  2000
哦吼  400  7000
卢本伟  100  6000
八戒  250  5000
悟空  100  4000
唐僧  100  3000

# 期望结果
哦吼  400  7000
团团  300  1000
八戒  250  5000
小黑  200  2000
卢本伟  100  6000
悟空  100  4000
唐僧  100  3000

关键代码:

/**
 * Key type ordering streams by viewer count (descending), breaking ties
 * by stream length (also descending).
 * (Illustrative fragment — the remaining members are omitted below.)
 */
public class PlayWritable implements WritableComparable<PlayWritable> {
    private int viewer;
    private int length;

    /**
     * Descending by viewer; if viewer counts are equal, descending by length.
     * @param o the record to compare against
     * @return negative/zero/positive per the Comparable contract
     */
    public int compareTo(PlayWritable o) {
        // Integer.compare avoids the int-overflow risk of the
        // subtraction idiom (o.viewer - this.viewer); reversed argument
        // order gives descending order on both fields.
        if(this.viewer != o.viewer){
            return Integer.compare(o.viewer, this.viewer);
        }else{
            return Integer.compare(o.length, this.length);
        }
    }


    // Constructors, getters/setters, serialization methods and toString omitted.
    ...
}
规律:
1)  排序是按照Key值排序
2)mapper 是可以单独使用的,假如只有mapper,这个时候调用 setNumReduceTasks(0) 把reduce数量设置为 0
3) 不管是mapper输出还是reducer输出,输出的格式是按照 Key的输出+ Value输出。Key的输出和Value的输出格式是按照这个类的 toString方法格式输出的。假如只想输出key ,就将value设置为 NullWritable,假如只想输出value,Key的类型就是NullWritable。
4)假如你有分区,分区的泛型,跟mapper的输出照应。

实例代码:

package com.bigdata.paixu01;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

class PaiXuMapper3 extends Mapper<LongWritable, Text, IntWritable2,NullWritable>{

    /** Input "name viewers duration"; all three fields go into the composite key. */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] fields = value.toString().split("\\s+");
        String anchor = fields[0];
        int viewers = Integer.parseInt(fields[1]);
        int duration = Integer.parseInt(fields[2]);
        context.write(new IntWritable2(viewers, anchor, duration), NullWritable.get());
    }
}
package com.bigdata.paixu01;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Composite map-output key: viewer count, anchor name and stream
 * duration (shiChang). Natural order: viewer count descending, ties
 * broken by duration descending.
 */
public class IntWritable2 implements WritableComparable<IntWritable2> {

    private int value;
    private String name;
    private int shiChang;

    public IntWritable2() {}

    public IntWritable2(int value,String name) { set(value,name); }

    public IntWritable2(int value,String name,int shiChang) { set(value,name,shiChang); }

    // Overloaded setters for the one-, two- and three-field cases.
    public void set(int value) {
        this.value = value;
    }

    /** Set the value of this IntWritable. */
    public void set(int value,String name ) {
        this.value = value;
        this.name = name;
    }

    public void set(int value,String name,int shiChang ) {
        this.value = value;
        this.name = name;
        this.shiChang = shiChang;
    }

    /** Return the value of this IntWritable. */
    public int get() { return value; }

    // Called by the framework wherever sorting happens (analogous to
    // Collections.sort(list) in plain Java).
    @Override
    public int compareTo(IntWritable2 o) {
        // Integer.compare avoids the int-overflow risk of the
        // subtraction idiom (thatValue - thisValue); reversed argument
        // order gives descending order on both fields.
        if (this.value != o.value) {
            return Integer.compare(o.value, this.value);
        } else {
            return Integer.compare(o.shiChang, this.shiChang);
        }
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Field order here must match readFields exactly.
        out.writeInt(value);
        out.writeUTF(name);
        out.writeInt(shiChang);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        value = in.readInt();
        name = in.readUTF();
        shiChang = in.readInt();
    }

    @Override
    public String toString() {
        return name+" "+value+" "+shiChang;
    }
}

 Driver:

/** Map-only driver: descending sort by viewers, then by stream duration. */
public class PaiXu01 {

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        // Work against the local file system instead of HDFS.
        conf.set("fs.defaultFS","file:///");
        // Use local resources (CPU/memory) rather than running on YARN.
        conf.set("mapreduce.framework.name","local");
        Job job = Job.getInstance(conf, "排序01");

        // Mapper packs everything into the composite key.
        job.setMapperClass(PaiXuMapper3.class);
        job.setMapOutputKeyClass(IntWritable2.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Map-only job: zero reducers, map output goes straight to disk.
        job.setNumReduceTasks(0);

        // Relative paths work here; absolute paths are also fine.
        FileInputFormat.setInputPaths(job,"../WordCount/mr08/input/");
        FileOutputFormat.setOutputPath(job,new Path("../WordCount/mr08/output3"));

        // true -> job succeeded: exit normally, otherwise non-zero code.
        boolean ok = job.waitForCompletion(true);
        System.exit(ok ? 0 : -1);
    }
}

七、Main方法如何传参

以问题驱动学习:

假如你到公司中,如何自己学习?你们公司正在使用的技术是什么就学什么,特别大的技术,不要学。

 

package com.bigdata;

public class TestMain {

    /**
     * Prints every command-line argument on its own line, framed by
     * start/end marker lines. The JVM fills args with whatever follows
     * the class name on the command line.
     * @param args command-line arguments supplied by the JVM
     */
    public static void main(String[] args) {

        System.out.println("参数打印开始");
        for (int i = 0; i < args.length; i++) {
            System.out.println(args[i]);
        }
        System.out.println("参数打印完毕");
    }
}
思考:假如你的这个代码不在idea中如何传参?
任何java代码都可以打成jar包,jar包中的类如何运行?
java -cp  xxxx.jar  某个类的全路径  如果这个类有参数,直接跟在后面
java -cp  hello.jar   com.bigdata.TestMain 10 20 30
(注意:java -jar 只会运行 MANIFEST 里指定的 Main-Class,后面写的类名会被当成普通参数;想手动指定入口类要用 -cp。)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

YuPangZa

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值