hive、pig对日志处理的对比

1 篇文章 0 订阅
原数据:
每行记录有5部分组成:
1.访问ip
2.访问时间
3.访问资源
4.访问状态
5.本次流量

数据清洗
     hive中的数据需要用mr进行处理,然后再导入表中
     
          public static class MyMapper extends
                Mapper<LongWritable, Text, NullWritable, Text> {
           SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss" , Locale.ENGLISH);
           SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss" );
           Date parse;
           String date;

            @Override
            protected void map(LongWritable key, Text value,
                     Mapper<LongWritable, Text, NullWritable, Text>.Context context)
                      throws IOException, InterruptedException {
                String line = value.toString();
                Pattern p = Pattern
                          . compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - \\[(.*) \\+0800\\] \"[A-Z]{1,4} (.*) HTTP/1.1\" ([0-9]*) ([0-9]*)");
                Matcher m = p.matcher( line);

                 if (! m.find()) {
                      return;
                }

                String ip = m.group(1);
                String time = m.group(2);
                String url = m.group(3);
                String status = m.group(4);
                String traffic = m.group(5);

                 try {
                      parse = simpleDateFormat.parse( time);
                      date = simpleDateFormat2.format( parse);
                } catch (ParseException e) {
                      e.printStackTrace();
                }

                 context.write(NullWritable. get(), new Text(ip + "\t" + date + "\t" + url + "\t" + status + "\t" + traffic ));
           }
     }

     hive创建表并导入数据
          -- External table over the MR-cleaned output; columns are the five
          -- tab-separated fields the mapper emits (ip, date-as-time, url,
          -- status, traffic), partitioned by day.
          -- NOTE(review): "date" is a reserved word in newer Hive versions and
          -- may need backtick quoting there.
          create external table logs(ip string,time string, url string, status string, traffic string) partitioned by (date string) row format delimited fields terminated by '\t' location '/user/15910312725/logout';

                  -- Register one day's cleaned output as a partition.
                  load data inpath "/user/15910312725/logout/2015-03-30" into table logs partition (date='2015-03-30');

     pig可直接导入原始数据,然后进行数据清洗

              logs_tmp = load 'access_2015_03_30.log' using PigStorage(' ') as
               (ip:chararray,one:chararray,two:chararray,time:chararray,timezone:chararray,method:chararray,url:chararray,http:chararray,status:long,data:long);

              logs = foreach A generate ip,time,url,status,data;


浏览量(pv) 
 
定义:页面浏览量即为PV(Page View),是指所有用户浏览页面的总和,用户每打开一个页面就被记录1 次。
计算公式:记录计数

     hive:
          -- PV = total page views for the day: one log row per request, so a
          -- plain row count (ip is never null in cleaned data).
          create table bbs_pv_2015_03_30 as select count(ip) as num from logs where date='2015-03-30';

     pig:
          -- PV: collapse the whole relation into a single group, then count rows.
          logs_grouped = group logs all;

          PV = foreach logs_grouped generate COUNT(logs);


独立IP数
 
定义:一天之内,访问网站的不同独立IP 个数加和。其中同一IP无论访问了几个页面,独立IP 数均为1。
公式:对ip去重求和

     hive:
          -- Distinct-IP (UV) count for the day.
          create table bbs_ip_2015_03_30 as select count(distinct ip) as num from logs where date='2015-03-30'; 

     pig:
          -- Distinct IPs: group by ip, keep only the group key (one row per
          -- distinct ip), then count those rows.
          logs_grouped_ip = group logs by ip;

          logs_distinct_ip = foreach logs_grouped_ip generate group;

          logs_distinct_ip_grouped = group logs_distinct_ip all;

          UV = foreach logs_distinct_ip_grouped generate COUNT(logs_distinct_ip);


注册用户数 

计算公式:对访问member.php?mod=register的url,计数

     hive:
          -- Registrations: count requests whose url contains the register
          -- action. Hive instr() is 1-based, so >0 means "substring present".
          create table bbs_register_2015_03_30 as select count(1) as register from logs where date='2015-03-30' and instr(url,'member.php?mod=register')>0;

     pig:
          split logs into regester if (INDEXOF(url, 'member.php?mod=register')>0), non if (INDEXOF(url, 'member.php?mod=register')<0);
          //OR: regester = filter logs by INDEXOF(url,'member.php?mod=register')>0;

          regester_grouped = group regester all;

          regester_num = foreach regester_grouped generate COUNT(regester);


跳出率=跳出数/PV

计算公式:(1)统计一天内只出现一条访问记录的ip,称为跳出数
                (2)跳出数/PV
     hive:
          -- Bounce count: number of IPs with exactly one request in the day
          -- (inner query: requests per ip; outer: how many ips have count 1).
          create table bbs_jumper_2015_03_30 as select count(a.c) as jumper from (select ip,count(ip) as c from logs where date='2015-03-30' group by ip) as a where a.c=1;

     pig:
          -- Bounce count: per-ip request totals, keep totals equal to 1,
          -- then count how many such ips remain.
          logs_grouped_ip = group logs by ip;

          ip_num = foreach logs_grouped_ip generate COUNT(logs.ip) as num;

          ip_num_1 = filter ip_num by num==1;

          ip_num_1_grouped = group ip_num_1 all;

          jumper = foreach ip_num_1_grouped generate COUNT(ip_num_1);


板块访问量pv

定义:一天之内每个板块的访问量

分析:板块的链接url:/forum-110-1.html,其中110表示板块的ID,1表示这个板块的第一页,只需要计算url中存在/forum-字符串的数据

公式:对板块分组,求每个板块的访问量

     hive:
          -- Per-forum page views: extract the forum id from "/forum-<id>-<page>"
          -- (group 1 of the regex) and count requests per id.
          -- NOTE(review): despite the "_ip_" table name, count(ip) here is a
          -- PV count, not a distinct-IP count -- see the "_pv_" table below.
          create table bbs_band_ip_2015_03_30 as select b.band as band,count(ip) as ip_num from (select regexp_extract(url,'/forum-(\\d*)-(\\d*)',1) as band,ip from logs where date='2015-03-30' and instr(url,'/forum-')>0) as b group by b.band; 

     pig:
          split logs into logs_contain_band if ((REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1))!=''), non if ((REGEX_EXTRACT(url,'forum-([0-9]*)-([0-9]*)',1))=='');
               //logs_contain_band = filter logs by REGEX_EXTRACT(url,'forum-([0-9]*)-([0-9]*)',1)!='';

  band = foreach logs_contain_band generate REGEX_EXTRACT(url,'forum-([0-9]*)-([0-9]*)',1) as b;

          band_grouped = group band by b;

          band_PV = foreach band_grouped generate group,COUNT(R.b);


板块独立IP访问量

定义:一天之内每个板块的独立IP访问量

分析:板块的链接url:/forum-110-1.html,其中110表示板块的ID,1表示这个板块的第一页,只需要计算url中存在/forum-字符串的数据,然后对IP进行去重

公式:对板块分组,IP去重,求每个板块的独立IP访问量

     hive:
          -- Per-forum distinct-IP count.
          -- NOTE(review): despite the "_pv_" table name, count(distinct ip) is
          -- a UV-style metric -- naming is swapped relative to the "_ip_"
          -- table above.
          create table bbs_band_pv_2015_03_30 as select b.band as band,count(distinct ip) as pv_num from (select regexp_extract(url,'/forum-(\\d*)-(\\d*)',1) as band,ip from logs where date='2015-03-30' and instr(url,'/forum-')>0) as b group by b.band; 

     pig:
          band_all_ip = foreach logs_contain_band generate REGEX_EXTRACT(url,'forum-([0-9]*)-([0-9]*)',1) as b,ip;

          band_distinct_ip = distinct band;

          band_distinct_ip_grouped = group band_distinct_ip by b;

          band_UV = foreach band_distinct_ip_grouped generate group,COUNT(band_distinct_ip.b);


汇总表(bbs_info)
日期 log_date
浏览量 pv
注册用户数 reguser
独立ip数 ip
跳出数 jumper

     hive:
          create table bbs_info as select '2015_03_30',a.dist_ip,b.num,c.register,d.jumper from bbs_ip_2015_03_30 a join bbs_pv_2015_03_30 b on 1=1 join bbs_register_2015_03_30 c on 1=1 join bbs_jumper_2015_03_30 d on 1=1;

     pig:
          bbs_info = join PV by '1=1', UV by '1=1', jumper_num by '1=1';


版块访问表(bbs_forum)
日期 log_date
版块 forum
独立ip ip
浏览量 pv

     hive:
          -- Per-forum summary: join the per-forum PV table and the per-forum
          -- distinct-IP table on the forum id.
          create table bbs_forum as select a.band,ip_num,pv_num from bbs_band_ip_2015_03_30 as a join bbs_band_pv_2015_03_30 as b on a.band=b.band;

     pig:
          -- Positional join: $0 is the forum id ("group" key) in both relations.
          bbs_forum = join band_PV by $0, band_UV by $0;
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值