一、目的
对于用户访问日志,使用MapReduce进行编程分析,获得每个省份(province_id)的访问量。
二、实现环境
1.CDH 5.7.0
2.CentOS 7.4
3.集群外部Hadoop开发环境部署
参考博客:https://blog.csdn.net/u010886217/article/details/89278390
三、实现代码
1.研究日志格式
2.MapReduce实现统计日志代码
package Hadoop;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class PVMapReduce extends Configured implements Tool{
//map类
//继承Mapper类,<KEYIN, VALUEIN, KEYOUT, VALUEOUT> 输入的key,输入的value,输出的key,输出的value
public static class MyPVMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable>{
IntWritable province_id = new IntWritable();
IntWritable mr_value = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, IntWritable, IntWritable>.Context context)
throws IOException, InterruptedException {
//(pro_id,1)
//获得一整条数据
String line = value.toString();
//将数据进行切割获得一串数组
String [] str = line.split("\t");
//切割之后数组长度小于30,认为这条数据字段缺失,丢弃
if(str.length <= 30){
context.getCounter("我的计数器","长度小于30的数据").increment(1);
return;
}
//取出数组下标为23的字符串 =》 pro_id
String pro_id = str[23];
String url = str[1];
/**
* 其实很多情况下,数据是不完整的,所以在map方法当中,我们要对数据进行清洗
* 做if判断,去掉不符合逻辑的数据
*/
if(StringUtils.isBlank(url)){
context.getCounter("我的计数器","url为空的数据").increment(1);
return;
}
int pro_int = -1;
try{
pro_int = Integer.parseInt(pro_id);
}catch(Exception e){
context.getCounter("我的计数器","数值转换异常的数据").increment(1);
return;
}
//最后要具体查看数据结果,判断是否符合我们的要求(可能还要做更多的过滤)
province_id.set(pro_int);
context.write(province_id, mr_value);
}
}
//combiner
public static class MyPVcombiner extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
IntWritable total = new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,
Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException {
int count = -1;
for (IntWritable intWritable : values) {
count += intWritable.get();
}
total.set(count);
context.write(key, total);
}
}
//reduce类
// reduce类的输入,其实就是map类中map方法的输出 输入key 输入value 输出key 输出value
public static class MyPVReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable>{
IntWritable total = new IntWritable();
@Override
protected void reduce(IntWritable key, Iterable<IntWritable> values,
Reducer<IntWritable, IntWritable, IntWritable, IntWritable>.Context context) throws IOException, InterruptedException {
int count = -1;
for (IntWritable intWritable : values) {
count += intWritable.get();
}
total.set(count);
context.write(key, total);
}
}
//运行类,run方法,在测试的时候使用main函数,调用这个类的run方法来运行
/**
*
* @param args 参数是要接受main方法得到的参数,在run中使用
* @return
* @throws Exception
*/
public int run(String[] args) throws Exception {
//通过调用this的getConf方法得到从外部传入的conf对象
Configuration conf = this.getConf();
Job job = Job.getInstance(conf,this.getClass().getSimpleName());
job.setJarByClass(PVMapReduce.class);
//输入路径
Path inpath = new Path(args[0]);
FileInputFormat.addInputPath(job, inpath);
//输出路径
Path outpath = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outpath);
//执行前先判断输出路径是否存在,存在就将该路径删除
FileSystem fs = outpath.getFileSystem(conf);
if(fs.exists(outpath)){
fs.delete(outpath,true);
}
//设置Map相关参数
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setMapperClass(MyPVMapper.class);
//设置shuffle
job.setCombinerClass(MyPVcombiner.class);
//设置reduce相关参数
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.setReducerClass(MyPVReducer.class);
int isSuccess = job.waitForCompletion(true)?0:1;
return isSuccess;
}
public static void main(String[] args) {
Configuration conf = new Configuration();
args = new String[]{
"hdfs://hadoop01:8020/20150812",
"hdfs://hadoop01:8020/out"
};
try {
int isSucces = ToolRunner.run(conf,new PVMapReduce(), args);
System.out.println("isSuccess"+isSucces);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}