MapReduce
A MapReduce program is launched in parallel on many machines: the map tasks run first, and only after all of the map tasks have finished processing their data can the many reduce tasks be started. Coordinating this by hand is not realistic, so an automated scheduling platform is needed. For running distributed computation programs such as MapReduce, Hadoop provides exactly such a platform: YARN.
Installing a YARN cluster
A YARN cluster has two roles:
Master node: ResourceManager, 1 machine
Worker nodes: NodeManager, N machines
The ResourceManager is usually installed on a dedicated machine, while the NodeManagers should be co-located with the HDFS DataNodes.
**System environment file /etc/profile**
#java
export JAVA_HOME=/appdata/jdk
export CLASSPATH=.:$JAVA_HOME/jre/lib/rt.jar:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export PATH=.:$PATH:$JAVA_HOME/bin
#hadoop
export HADOOP_HOME=/appdata/hadoop
export PATH=.:$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib:$HADOOP_COMMON_LIB_NATIVE_DIR"
Edit the configuration file:
yarn-site.xml
<property>
<name>yarn.resourcemanager.hostname</name>
<value>n1</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- memory (in MB) that each NodeManager can allocate -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<!-- number of virtual CPU cores per node -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>2</value>
</property>
Then copy the file to every machine.
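For example, with scp (a sketch; the worker host names n2 and n3 are assumptions, and $HADOOP_HOME is /appdata/hadoop as set in /etc/profile above):
scp $HADOOP_HOME/etc/hadoop/yarn-site.xml n2:$HADOOP_HOME/etc/hadoop/
scp $HADOOP_HOME/etc/hadoop/yarn-site.xml n3:$HADOOP_HOME/etc/hadoop/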
Start the YARN cluster: sbin/start-yarn.sh
Stop it: sbin/stop-yarn.sh
Once startup is complete, you can open the ResourceManager web UI from a browser on the Windows machine:
http://n1:8088
and check whether the ResourceManager has recognized all of the NodeManager nodes.
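You can also check from the command line on the ResourceManager machine; the yarn CLI lists the registered NodeManagers:
yarn node -list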
The three modes of running a MapReduce program
Submitting the program from a local Windows client to run on the YARN cluster
Diagram of the overall execution flow of the wordcount program
Map phase: turn each line of text into <word, 1> key-value pairs.
Reduce phase: aggregate each group of kv pairs that share the same word by summing all of the values.
Note: in a MapReduce program, the input and output data of the map phase and of the reduce phase
must all use types that implement Hadoop's serialization framework,
for example:
String corresponds to Text
Integer corresponds to IntWritable
Long corresponds to LongWritable
**Mapper code**
package com.initialize;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* KEYIN: the type of the key that the map task reads: the starting byte offset of the line (Long)
* VALUEIN: the type of the value that the map task reads: the content of the line (String)
*
* KEYOUT: the key type of the kv results returned by the user-defined map method; in the wordcount logic this is the word (String)
* VALUEOUT: the value type of the kv results returned by the user-defined map method; in the wordcount logic this is an integer (Integer)
*
* However, the data produced by map has to be transferred to reduce, which requires serialization and deserialization. The JDK's native serialization produces rather bloated data, which would make transfers during a MapReduce job inefficient.
* Hadoop therefore designed its own serialization mechanism, so the data types transferred in MapReduce must implement Hadoop's serialization interface.
*
* For the common JDK basic types Long, String, Integer, Float, etc., Hadoop provides wrapper types that implement its serialization interface: LongWritable, Text, IntWritable, FloatWritable.
*
*/
public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split the line into words
String line = value.toString();
String[] words = line.split(" ");
for(String word : words){
context.write(new Text(word), new IntWritable(1));
}
}
}
**Reducer code**
package com.initialize;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
int count = 0;
Iterator<IntWritable> iterator = values.iterator();
while(iterator.hasNext()){
IntWritable value = iterator.next();
count += value.get();
}
context.write(key, new IntWritable(count));
}
}
**Client program**
package com.initialize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;
/**
* Client program used to submit a MapReduce job.
* Responsibilities:
* 1. Package up the parameters this job needs at runtime.
* 2. Interact with YARN to get the MapReduce program started and running.
*/
public class JobSubmitter {
public static void main(String[] args) throws Exception {
//set a JVM system property in code so the Job object knows which user identity to use when accessing HDFS
System.setProperty("HADOOP_USER_NAME", "lys");
Configuration conf = new Configuration();
//the default file system the job will access at runtime
conf.set("fs.defaultFS", "hdfs://n1:9000");
//where the job should be submitted to run
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "n1");
//required when submitting this job from a Windows client: enable cross-platform submission
conf.set("mapreduce.app-submission.cross-platform", "true");
Job job = Job.getInstance(conf);
//1. parameter: the location of the jar
job.setJar("D:/wc.jar");//the programs in this jar are packaged into tasks and run on the YARN cluster
//job.setJarByClass(JobSubmitter.class);
//2. parameters: the Mapper and Reducer implementation classes this job will use
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
//3. parameters: the key/value types of the results produced by the job's Mapper and Reducer classes
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
Path output = new Path("/wordcount/output");
FileSystem fs = FileSystem.get(new URI("hdfs://n1:9000"), conf, "lys");
if(fs.exists(output)){
fs.delete(output, true);
}
//4. parameters: the path of the input data set to process, and the output path for the final results
FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));
FileOutputFormat.setOutputPath(job, output); //note: the output path must not already exist
//5. parameter: the number of reduce tasks to launch
job.setNumReduceTasks(2);
//6. submit the job to YARN
boolean res = job.waitForCompletion(true);
System.exit(res?0:-1);
}
}
The run results can be viewed in the Hadoop HDFS cluster.
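For example, from any machine that can reach the cluster:
hadoop fs -ls /wordcount/output
hadoop fs -cat /wordcount/output/part-r-*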
Submitting the MapReduce program from a node of the YARN cluster
$ java -cp mapreduce24-1.0-SNAPSHOT.jar com.initialize.JobSubmitter2
//with only the application jar on the classpath, this fails with exceptions about classes that cannot be found.
$ hadoop jar mapreduce24-1.0-SNAPSHOT.jar com.initialize.JobSubmitter2
//the hadoop jar command adds the local Hadoop jars to the classpath.
//this runs in local mode: the default configuration (mapred-default.xml) sets mapreduce.framework.name=local.
$ hadoop jar mapreduce24-1.0-SNAPSHOT.jar com.initialize.JobSubmitter2
//after configuring mapreduce.framework.name=yarn in mapred-site.xml, the program is submitted to the cluster.
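For reference, the mapred-site.xml entry that switches job submission to YARN looks like this:
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>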
The WordcountMapper and WordcountReducer classes are exactly the same as the ones shown earlier; only the job-submission client changes:
package com.initialize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* If this job-submission client is started on one of the machines of the Hadoop cluster,
* conf does not need to specify fs.defaultFS or mapreduce.framework.name,
*
* because when the client's main method is launched with hadoop jar xxx.jar com.initialize.JobSubmitter2,
* the hadoop jar command puts the jars and configuration files of the local Hadoop installation on the runtime classpath.
*
* The new Configuration() statement in the client's main method then loads those configuration files from the classpath, so
* fs.defaultFS, mapreduce.framework.name and yarn.resourcemanager.hostname are already configured.
*/
public class JobSubmitter2 {
public static void main(String[] args) throws Exception {
//no default file system specified
//no target specified for where the MapReduce job should run
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter2.class);
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));
FileOutputFormat.setOutputPath(job, new Path("/wordcount/output"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
Run it.
**Screenshot of the run results**
Running in local mode
Local mode is well suited for debugging.
package com.initialize;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitterWindowLocal {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//no default file system specified
//conf.set("fs.defaultFS", "file:///");//default value
//conf.set("mapreduce.framework.name", "local");//default value
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitterWindowLocal.class);
job.setMapperClass(WordcountMapper.class);
job.setReducerClass(WordcountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("D:/input"));
FileOutputFormat.setOutputPath(job, new Path("D:/output"));
job.setNumReduceTasks(3);
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
Run results
Conceptual relationship between the MapReduce programming model and a concrete framework implementation
Examples
Traffic statistics
From the log data, compute the total amount of traffic used by each phone number.
package com.initialize.flow;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* This case demonstrates how a custom data type implements Hadoop's serialization interface.
* 1. The class must keep a no-argument constructor.
* 2. The order in which write() outputs the fields' binary data must match the order in which readFields() reads them back.
*
*/
public class FlowBean implements Writable {
private int upFlow;
private int dFlow;
private String phone;
private int amountFlow;
public FlowBean(){}
public FlowBean(String phone, int upFlow, int dFlow){
this.phone = phone;
this.upFlow = upFlow;
this.dFlow = dFlow;
this.amountFlow = upFlow + dFlow;
}
/**
* Called by Hadoop when it serializes an object of this class.
* @param out
* @throws IOException
*/
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(upFlow);
out.writeUTF(phone);
out.writeInt(dFlow);
out.writeInt(amountFlow);
}
/**
* Called by Hadoop when it deserializes an object of this class.
* @throws IOException
*/
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readInt();
this.phone = in.readUTF();
this.dFlow = in.readInt();
this.amountFlow = in.readInt();
}
@Override
public String toString() {
return "FlowBean{" +
"upFlow=" + upFlow +
", dFlow=" + dFlow +
", phone='" + phone + '\'' +
", amountFlow=" + amountFlow +
'}';
}
public int getUpFlow() {
return upFlow;
}
public void setUpFlow(int upFlow) {
this.upFlow = upFlow;
}
public int getdFlow() {
return dFlow;
}
public void setdFlow(int dFlow) {
this.dFlow = dFlow;
}
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public int getAmountFlow() {
return amountFlow;
}
public void setAmountFlow(int amountFlow) {
this.amountFlow = amountFlow;
}
}
package com.initialize.flow;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] fields = line.split("\t");
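//field 1 of each record is the phone number; the up/down flow columns are taken relative to the end of the line,
//because the URL and site-category columns in the middle are missing for some records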
String phone = fields[1];
int upFlow = Integer.parseInt(fields[fields.length-3]);
int dFlow = Integer.parseInt(fields[fields.length-2]);
context.write(new Text(phone), new FlowBean(phone, upFlow, dFlow));
}
}
package com.initialize.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
/**
* key: a phone number
* values: the flow data from all of the access records produced by that phone number
*
* <135,flowBean1><135,flowBean2><135,flowBean3><135,flowBean4>
*/
@Override
protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
int upSum = 0;
int dSum = 0;
for(FlowBean value:values){
upSum += value.getUpFlow();
dSum += value.getdFlow();
}
context.write(key, new FlowBean(key.toString(), upSum, dSum));
}
}
package com.initialize.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\Desktop\\output"));
job.waitForCompletion(true);
}
}
Input data:
1363157985066 13726230503 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157995052 13826544101 5C-0E-8B-C7-F1-E0:CMCC 120.197.40.4 4 0 264 0 200
1363157991076 13926435656 20-10-7A-28-CC-0A:CMCC 120.196.100.99 2 4 132 1512 200
1363154400022 13926251106 5C-0E-8B-8B-B1-50:CMCC 120.197.40.4 4 0 240 0 200
1363157993044 18211575961 94-71-AC-CD-E6-18:CMCC-EASY 120.196.100.99 iface.qiyi.com 视频网站 15 12 1527 2106 200
1363157995074 84138413 5C-0E-8B-8C-E8-20:7DaysInn 120.197.40.4 122.72.52.12 20 16 4116 1432 200
1363157993055 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
1363157995033 15920133257 5C-0E-8B-C7-BA-20:CMCC 120.197.40.4 sug.so.360.cn 信息安全 20 20 3156 2936 200
1363157983019 13719199419 68-A1-B7-03-07-B1:CMCC-EASY 120.196.100.82 4 0 240 0 200
1363157984041 13660577991 5C-0E-8B-92-5C-20:CMCC-EASY 120.197.40.4 s19.cnzz.com 站点统计 24 9 6960 690 200
1363157973098 15013685858 5C-0E-8B-C7-F7-90:CMCC 120.197.40.4 rank.ie.sogou.com 搜索引擎 28 27 3659 3538 200
1363157986029 15989002119 E8-99-C4-4E-93-E0:CMCC-EASY 120.196.100.99 www.umeng.com 站点统计 3 3 1938 180 200
1363157992093 13560439658 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 15 9 918 4938 200
1363157986041 13480253104 5C-0E-8B-C7-FC-80:CMCC-EASY 120.197.40.4 3 3 180 180 200
1363157984040 13602846565 5C-0E-8B-8B-B6-00:CMCC 120.197.40.4 2052.flash2-http.qq.com 综合门户 15 12 1938 2910 200
1363157995093 13922314466 00-FD-07-A2-EC-BA:CMCC 120.196.100.82 img.qfc.cn 12 12 3008 3720 200
1363157982040 13502468823 5C-0A-5B-6A-0B-D4:CMCC-EASY 120.196.100.99 y0.ifengimg.com 综合门户 57 102 7335 110349 200
1363157986072 18320173382 84-25-DB-4F-10-1A:CMCC-EASY 120.196.100.99 input.shouji.sogou.com 搜索引擎 21 18 9531 2412 200
1363157990043 13925057413 00-1F-64-E1-E6-9A:CMCC 120.196.100.55 t3.baidu.com 搜索引擎 69 63 11058 48243 200
1363157988072 13760778710 00-FD-07-A4-7B-08:CMCC 120.196.100.82 2 2 120 120 200
1363157985066 13726238888 00-FD-07-A4-72-B8:CMCC 120.196.100.82 i02.c.aliimg.com 24 27 2481 24681 200
1363157993055 13560436666 C4-17-FE-BA-DE-D9:CMCC 120.196.100.99 18 15 1116 954 200
Results:
Custom partitioner
package com.initialize.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
static HashMap<String, Integer> codeMap = new HashMap<>();
static{
codeMap.put("135", 0);
codeMap.put("136", 1);
codeMap.put("137", 2);
codeMap.put("138", 3);
codeMap.put("139", 4);
}
@Override
public int getPartition(Text key, FlowBean value, int numPartitions) {
Integer code = codeMap.get(key.toString().substring(0, 3));
return code == null?5:code;
}
}
package com.initialize.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class JobSubmitter {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(FlowCountMapper.class);
job.setReducerClass(FlowCountReducer.class);
//parameter: which partitioner class the map tasks use when partitioning data (if not set, HashPartitioner is the default)
job.setPartitionerClass(ProvincePartitioner.class);
//our ProvincePartitioner can produce 6 different partition numbers, so 6 reduce tasks are needed to receive them
job.setNumReduceTasks(6);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowBean.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\input"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
Result: six partition result files are produced.
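For comparison, the default HashPartitioner mentioned in the comment above simply spreads keys by their hash code. A minimal sketch of the same logic (the class name HashLikePartitioner is invented for illustration):
package com.initialize.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class HashLikePartitioner extends Partitioner<Text, FlowBean> {
    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        //mask off the sign bit so the result is non-negative, then take it modulo the number of reduce tasks
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}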
The DataOutputStream write() and writeUTF() methods
package com.initialize.flow;
import java.io.DataOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
public class DataOutputstreamTest {
public static void main(String[] args) throws Exception {
DataOutputStream dos = new DataOutputStream(new FileOutputStream("D:/a.dat"));
dos.write("我爱你".getBytes("utf-8"));
dos.close();
DataOutputStream dos2 = new DataOutputStream(new FileOutputStream("D:/b.dat"));
dos2.writeUTF("我爱你");
dos2.close();
}
}
**Explanation**: when writeUTF saves the data, it prepends two bytes that record how long the string is.
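As a quick check, the string can be read back with the matching readUTF call. A minimal sketch (the class name DataInputstreamTest is invented; it reads the D:/b.dat file written above):
package com.initialize.flow;
import java.io.DataInputStream;
import java.io.FileInputStream;
public class DataInputstreamTest {
    public static void main(String[] args) throws Exception {
        DataInputStream dis = new DataInputStream(new FileInputStream("D:/b.dat"));
        //readUTF first reads the 2-byte length prefix, then decodes that many bytes back into a String
        String s = dis.readUTF();
        dis.close();
        System.out.println(s);
    }
}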
TreeMap
package com.initialize.flow;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
public class TreeMapTest {
public static void main(String[] args){
TreeMap<FlowBean, String> tm1 = new TreeMap<FlowBean, String>(new Comparator<FlowBean>() {
@Override
public int compare(FlowBean o1, FlowBean o2) {
if(o2.getAmountFlow() - o1.getAmountFlow() == 0){
return o1.getPhone().compareTo(o2.getPhone());
}
return o2.getAmountFlow() -o1.getAmountFlow();
}
});
/*tm1.put("a", 2);
tm1.put("b", 1);
tm1.put("aa", 11);
tm1.put("ab", 1);
*/
FlowBean b1 = new FlowBean("1367788", 500, 300);
FlowBean b2 = new FlowBean("1367766", 400, 200);
FlowBean b3 = new FlowBean("1367755", 600, 400);
FlowBean b4 = new FlowBean("1367744", 300, 500);
tm1.put(b1, null);
tm1.put(b2, null);
tm1.put(b3, null);
tm1.put(b4, null);
Set<Map.Entry<FlowBean, String>> entrySet = tm1.entrySet();
for(Map.Entry<FlowBean, String> entry : entrySet){
System.out.println(entry.getKey() + ", " + entry.getValue());
}
}
}
Result
Page visit counts
**Mapper**
package com.initialize.page.topn;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class PageTopnMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ");
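//assumption: the second space-separated field of each line in request.dat is the requested page/URL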
context.write(new Text(split[1]), new IntWritable(1));
}
}
**Reducer**
package com.initialize.page.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
public class PageTopnReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
TreeMap<PageCount, Object> treeMap = new TreeMap<>();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable value : values){
count += value.get();
}
PageCount pageCount = new PageCount();
pageCount.set(key.toString(), count);
treeMap.put(pageCount, null);
}
/**
* This method is called once per task, after the task has finished processing all of its key groups.
* @param context
* @throws IOException
* @throws InterruptedException
*/
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
//if top.n is not configured, fall back to a default of 5
int topn = conf.getInt("top.n", 5);
Set<Map.Entry<PageCount, Object>> entrySet = treeMap.entrySet();
int i=0;
for(Map.Entry<PageCount, Object> entry :entrySet){
context.write(new Text(entry.getKey().getPage()), new IntWritable(entry.getKey().getCount()));
i++;
if(i==topn) return;
}
}
}
**JobSubmitter**
package com.initialize.page.topn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.util.Properties;
public class JobSubmitter {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
/**
* Parse parameters by loading the *-site.xml files from the classpath
*/
Configuration conf = new Configuration();//by default loads the Hadoop config files from the classpath: core-site.xml, hdfs-site.xml, yarn-site.xml, mapred-site.xml
conf.addResource("xx-oo.xml");
/**
* Set parameters directly in code
*/
//conf.setInt("top.n", 3);
//conf.setInt("top.n", Integer.parseInt(args[0]));
/**
* Read parameters from a properties file
*/
/*Properties props = new Properties();
props.load(JobSubmitter.class.getClassLoader().getResourceAsStream("topn.properties"));
conf.setInt("top.n", Integer.parseInt(props.getProperty("top.n")));*/
Job job = Job.getInstance(conf);
job.setJarByClass(JobSubmitter.class);
job.setMapperClass(PageTopnMapper.class);
job.setReducerClass(PageTopnReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\request.dat"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.waitForCompletion(true);
}
}
**PageCount**
package com.initialize.page.topn;
public class PageCount implements Comparable<PageCount>{
private String page;
private int count;
public void set(String page, int count){
this.page = page;
this.count = count;
}
public String getPage() {
return page;
}
public void setPage(String page) {
this.page = page;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
@Override
public int compareTo(PageCount o) {
return o.getCount() - this.count == 0?this.page.compareTo(o.getPage()):o.getCount()-this.count;
}
}
Location of the related configuration files:
**oo-xx.xml**
<configuration>
<property>
<name>top.n</name>
<value>6</value>
</property>
<property>
<name>mygirlfriend</name>
<value>angelababy</value>
</property>
</configuration>
**topn.properties**
top.n=5
Using the default sorting before reduce
Use the framework's default key sorting (which happens before the reduce stage) to list page visit counts from highest to lowest. This takes two MapReduce jobs.
package com.initialize.page.conut.sort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class PageCount implements WritableComparable<PageCount> {
private String page;
private int count;
public void set(String page, int count){
this.page = page;
this.count = count;
}
@Override
public String toString() {
return this.page + "," + this.count;
}
public String getPage() {
return page;
}
public void setPage(String page) {
this.page = page;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
@Override
public int compareTo(PageCount o) {
return o.getCount()-this.count == 0?this.page.compareTo(o.page):o.getCount()-this.getCount();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.page);
out.writeInt(this.count);
}
@Override
public void readFields(DataInput in) throws IOException {
this.page = in.readUTF();
this.count = in.readInt();
}
}
package com.initialize.page.conut.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PageCountStep1 {
public static class PageCountStep1Mapper extends Mapper<LongWritable, Text, Text, IntWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] split = line.split(" ");
context.write(new Text(split[1]), new IntWritable(1));
}
}
public static class PageCountStep1Reducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int count = 0;
for(IntWritable v : values){
count += v.get();
}
context.write(key, new IntWritable(count));
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PageCountStep1.class);
job.setMapperClass(PageCountStep1Mapper.class);
job.setReducerClass(PageCountStep1Reducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\request.dat"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
job.setNumReduceTasks(3);
job.waitForCompletion(true);
}
}
Result of the first job: three partition files are produced (one per reduce task).
package com.initialize.page.conut.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PageCountStep2 {
public static class PageCountStep2Mapper extends Mapper<LongWritable, Text, PageCount, NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] split = value.toString().split("\t");
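//each line of the step-1 output has the form "page<TAB>count" (the tab is TextOutputFormat's default key/value separator)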
PageCount pageCount = new PageCount();
pageCount.set(split[0], Integer.parseInt(split[1]));
context.write(pageCount, NullWritable.get());
}
}
public static class PageCountStep2Reducer extends Reducer<PageCount, NullWritable, PageCount, NullWritable> {
@Override
protected void reduce(PageCount key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PageCountStep2.class);
job.setMapperClass(PageCountStep2Mapper.class);
job.setReducerClass(PageCountStep2Reducer.class);
job.setMapOutputKeyClass(PageCount.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(PageCount.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputPaths(job, new Path("C:\\Users\\刘元帅\\Desktop\\output"));
FileOutputFormat.setOutputPath(job, new Path("C:\\Users\\刘元帅\\Desktop\\sort_put"));
job.setNumReduceTasks(1);
job.waitForCompletion(true);
}
}
The output of the first run is used as the input of the second run.
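A minimal sketch of a driver that runs the two steps back to back (the class name PageCountDriver is invented; it simply calls the two main methods above, and a more careful version would check each job's result before continuing):
package com.initialize.page.conut.sort;
public class PageCountDriver {
    public static void main(String[] args) throws Exception {
        //step 1: count visits per page and write the intermediate output
        PageCountStep1.main(args);
        //step 2: read the step-1 output and write the final result sorted by count
        PageCountStep2.main(args);
    }
}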
Result of the second run: