MapReduce Study Notes (2): Flow Statistics (2)

1 Write the statistics to different files according to the phone number's home province

  • map: read a line and split it into fields; extract the phone number, the upstream traffic and the downstream traffic; context.write(phone number, bean) (a parsing sketch follows this list)
  • the map output is split into partitions by overriding Partitioner so that numbers from the same home province return the same partition number (an int); in this example there are 5 partitions: 4 known prefixes plus a default
  • one reduce task per partition (5 here); each reducer receives all the records for one number, iterates over them, accumulates the traffic, and writes the result.
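
As a rough illustration of the map step, here is a throwaway sketch (not part of the job; the class name, the sample line and its numbers are made up, and the field layout is assumed from the Mapper in section 2 below):

package provinceflow;

public class MapLineSketch {
    public static void main(String[] args) {
        // Made-up record: the phone number is field 1 and the upstream/downstream
        // byte counts are the 3rd- and 2nd-to-last fields; the remaining fields are
        // collapsed into a single placeholder here.
        String line = "1363157993044\t13712345678\tother-fields\t2481\t24681\t200";
        String[] fields = line.split("\t");

        String phoneNbr = fields[1];                              // "13712345678"
        long upFlow = Long.parseLong(fields[fields.length - 3]);  // 2481
        long dFlow = Long.parseLong(fields[fields.length - 2]);   // 24681

        // The mapper emits (phone number, FlowBean(upFlow, dFlow)); the partitioner
        // then routes the pair by the "137" prefix of the key.
        System.out.println(phoneNbr + "\t" + new FlowBean(upFlow, dFlow));
    }
}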

2 Source code

FlowBean.java:

package provinceflow;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowBean implements Writable {
    private long upFlow;
    private long dFlow;
    private long sumFlow;


    // Deserialization instantiates the bean via reflection, so a no-argument constructor is required
    public FlowBean() {
    }

    public FlowBean(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getdFlow() {
        return dFlow;
    }

    public void setdFlow(long dFlow) {
        this.dFlow = dFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    /*
     * Serialization method
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(dFlow);
        out.writeLong(sumFlow);

    }

    /*
     * Deserialization method
     * Fields must be read back in the same order they were written
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        upFlow = in.readLong();
        dFlow = in.readLong();
        sumFlow=in.readLong();
    }

    @Override
    public String toString() {
        return upFlow + "\t" + dFlow + "\t" + sumFlow;
    }
}
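
Because FlowBean implements Writable by hand, a quick round trip through a byte stream is an easy way to confirm that write() and readFields() stay in step. A minimal sketch (the class name is made up, not part of the job):

package provinceflow;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean(100, 200);

        // serialize: upFlow, dFlow, sumFlow written as three longs
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // deserialize into a bean created through the no-arg constructor,
        // reading the three longs back in the same order
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy);  // prints 100, 200 and 300 separated by tabs
    }
}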

ProvincePartitioner.java:

package provinceflow;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.HashMap;

/*
 * The key/value type parameters match the map output types
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

    public static HashMap<String, Integer> provinceDict = new HashMap<>();

    static {
        provinceDict.put("136", 0);
        provinceDict.put("137", 1);
        provinceDict.put("138", 2);
        provinceDict.put("139", 3);
    }

    @Override
    public int getPartition(Text key, FlowBean flowBean, int i) {
        String prefix = key.toString().substring(0, 3);
        Integer provinceId = provinceDict.get(prefix);

        return provinceId == null ? 4 : provinceId;
    }
}
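
A quick sanity check of the mapping above (a throwaway snippet, not part of the job; the class name and phone numbers are made up): a prefix found in provinceDict returns its partition id, and any other prefix falls through to the default partition 4:

package provinceflow;

import org.apache.hadoop.io.Text;

public class PartitionCheck {
    public static void main(String[] args) {
        ProvincePartitioner partitioner = new ProvincePartitioner();

        // "137" is in provinceDict, so this key goes to partition 1
        System.out.println(partitioner.getPartition(new Text("13712345678"), new FlowBean(1, 1), 5));

        // "150" is not in provinceDict, so this key goes to the default partition 4
        System.out.println(partitioner.getPartition(new Text("15012345678"), new FlowBean(1, 1), 5));
    }
}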

FlowCount.java:

package provinceflow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class FlowCount {

    static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            String line = value.toString();
            String[] fields = line.split("\t");

            // extract the phone number (field 1)
            String phoneNbr = fields[1];
            long upFlow = Long.parseLong(fields[fields.length - 3]);
            long dFlow = Long.parseLong(fields[fields.length - 2]);


            context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));

        }
    }

    static class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
            long sum_upFlow = 0;
            long sum_dFlow = 0;

            // iterate over all the beans and accumulate upstream and downstream traffic separately
            for (FlowBean bean : values) {
                sum_upFlow += bean.getUpFlow();
                sum_dFlow += bean.getdFlow();
            }

            FlowBean resultBean = new FlowBean(sum_upFlow, sum_dFlow);
            context.write(key,resultBean);
        }
    }


    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();

        conf.set("mapreduce.framework.name","yarn");
        conf.set("yarn.resourcemanager.hostname","node1");

        Job job = Job.getInstance(conf);


        //job.setJar("/home/hadoop/wc.jar");
        job.setJarByClass(FlowCount.class);

        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);

        // specify the custom partitioner
        job.setPartitionerClass(ProvincePartitioner.class);

        // the number of reduce tasks matches the number of partitions (5)
        job.setNumReduceTasks(5);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // specify the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);


        // specify the job's raw input directory and its output directory
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));


        // submit the job configuration, together with the jar containing the job's classes, to YARN
        //job.submit();
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }


}

3 Package into a jar, upload to the cluster, and run

[hadoop@node1 ~]$ hadoop jar provinceflow.jar provinceflow.FlowCount /flowsum/input /flowsum/provinceout
18/08/07 03:58:50 INFO client.RMProxy: Connecting to ResourceManager at node1/192.168.154.131:8032
18/08/07 03:58:51 WARN mapreduce.JobResourceUploader: Hadoop command-line option parsing not performed. Implement the Tool interface and execute your application with ToolRunner to remedy this.
18/08/07 03:58:52 INFO input.FileInputFormat: Total input paths to process : 1
18/08/07 03:58:52 INFO mapreduce.JobSubmitter: number of splits:1
18/08/07 03:58:53 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1533604256702_0004
18/08/07 03:58:53 INFO impl.YarnClientImpl: Submitted application application_1533604256702_0004
18/08/07 03:58:53 INFO mapreduce.Job: The url to track the job: http://node1:8088/proxy/application_1533604256702_0004/
18/08/07 03:58:53 INFO mapreduce.Job: Running job: job_1533604256702_0004
18/08/07 03:59:07 INFO mapreduce.Job: Job job_1533604256702_0004 running in uber mode : false
18/08/07 03:59:07 INFO mapreduce.Job:  map 0% reduce 0%
18/08/07 03:59:18 INFO mapreduce.Job:  map 100% reduce 0%
18/08/07 03:59:32 INFO mapreduce.Job:  map 100% reduce 20%
18/08/07 03:59:41 INFO mapreduce.Job:  map 100% reduce 40%
18/08/07 03:59:42 INFO mapreduce.Job:  map 100% reduce 60%
18/08/07 03:59:43 INFO mapreduce.Job:  map 100% reduce 100%
18/08/07 03:59:44 INFO mapreduce.Job: Job job_1533604256702_0004 completed successfully
18/08/07 03:59:44 INFO mapreduce.Job: Counters: 50
    File System Counters
        FILE: Number of bytes read=863
        FILE: Number of bytes written=738605
        FILE: Number of read operations=0
        FILE: Number of large read operations=0
        FILE: Number of write operations=0
        HDFS: Number of bytes read=2295
        HDFS: Number of bytes written=552
        HDFS: Number of read operations=18
        HDFS: Number of large read operations=0
        HDFS: Number of write operations=10
    Job Counters 
        Killed reduce tasks=1
        Launched map tasks=1
        Launched reduce tasks=5
        Data-local map tasks=1
        Total time spent by all maps in occupied slots (ms)=8208
        Total time spent by all reduces in occupied slots (ms)=97223
        Total time spent by all map tasks (ms)=8208
        Total time spent by all reduce tasks (ms)=97223
        Total vcore-milliseconds taken by all map tasks=8208
        Total vcore-milliseconds taken by all reduce tasks=97223
        Total megabyte-milliseconds taken by all map tasks=8404992
        Total megabyte-milliseconds taken by all reduce tasks=99556352
    Map-Reduce Framework
        Map input records=22
        Map output records=22
        Map output bytes=789
        Map output materialized bytes=863
        Input split bytes=105
        Combine input records=0
        Combine output records=0
        Reduce input groups=21
        Reduce shuffle bytes=863
        Reduce input records=22
        Reduce output records=21
        Spilled Records=44
        Shuffled Maps =5
        Failed Shuffles=0
        Merged Map outputs=5
        GC time elapsed (ms)=1197
        CPU time spent (ms)=9890
        Physical memory (bytes) snapshot=706691072
        Virtual memory (bytes) snapshot=12496674816
        Total committed heap usage (bytes)=210841600
    Shuffle Errors
        BAD_ID=0
        CONNECTION=0
        IO_ERROR=0
        WRONG_LENGTH=0
        WRONG_MAP=0
        WRONG_REDUCE=0
    File Input Format Counters 
        Bytes Read=2190
    File Output Format Counters 
        Bytes Written=552
[hadoop@node1 ~]$ hadoop fs -ls /flowsum/provinceout
Found 6 items
-rw-r--r--   2 hadoop supergroup          0 2018-08-07 03:59 /flowsum/provinceout/_SUCCESS
-rw-r--r--   2 hadoop supergroup         55 2018-08-07 03:59 /flowsum/provinceout/part-r-00000
-rw-r--r--   2 hadoop supergroup        102 2018-08-07 03:59 /flowsum/provinceout/part-r-00001
-rw-r--r--   2 hadoop supergroup         24 2018-08-07 03:59 /flowsum/provinceout/part-r-00002
-rw-r--r--   2 hadoop supergroup        105 2018-08-07 03:59 /flowsum/provinceout/part-r-00003
-rw-r--r--   2 hadoop supergroup        266 2018-08-07 03:59 /flowsum/provinceout/part-r-00004
[hadoop@node1 ~]$ hadoop fs -cat /flowsum/provinceout/part-r-00000
13602846565 1938    1938    3876
13660577991 6960    6960    13920
[hadoop@node1 ~]$ hadoop fs -cat /flowsum/provinceout/part-r-00001
13719199419 240 240 480
13726230503 2481    2481    4962
13726238888 2481    2481    4962
13760778710 120 120 240
[hadoop@node1 ~]$ hadoop fs -cat /flowsum/provinceout/part-r-00002
13826544101 264 264 528
[hadoop@node1 ~]$ hadoop fs -cat /flowsum/provinceout/part-r-00003
13922314466 3008    3008    6016
13925057413 11058   11058   22116
13926251106 240 240 480
13926435656 132 132 264
[hadoop@node1 ~]$ hadoop fs -cat /flowsum/provinceout/part-r-00004
13480253104 180 180 360
13502468823 7335    7335    14670
13560436666 1116    1116    2232
13560439658 2034    2034    4068
15013685858 3659    3659    7318
15920133257 3156    3156    6312
15989002119 1938    1938    3876
18211575961 1527    1527    3054
18320173382 9531    9531    19062
84138413    4116    4116    8232
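
Numbers whose first three digits are not in provinceDict (for example 15013685858 and 84138413) all fall through to the default partition, which is why they end up together in part-r-00004.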