原数据:
-- NOTE(review): this leading fragment is a scrambled duplicate of Pig
-- statements that appear later in this document together with their required
-- upstream aliases. As written here, logs_grouped / logs_grouped_ip / band
-- are not yet defined, and `R` in COUNT(R.b) is defined nowhere in the file
-- (presumably it should be `band`). Verify against the full scripts below
-- before running any of these lines.
PV = foreach logs_grouped generate COUNT(logs);
logs_distinct_ip = foreach logs_grouped_ip generate group;
logs_distinct_ip_grouped = group logs_distinct_ip all;
UV = foreach logs_distinct_ip_grouped generate COUNT(logs_distinct_ip);
ip_num = foreach logs_grouped_ip generate COUNT(logs.ip) as num;
ip_num_1 = filter ip_num by num==1;
ip_num_1_grouped = group ip_num_1 all;
jumper = foreach ip_num_1_grouped generate COUNT(ip_num_1);
band_grouped = group band by b;
band_PV = foreach band_grouped generate group,COUNT(R.b);
band_distinct_ip = distinct band;
数据清洗:每行记录由5部分组成:1.访问IP 2.访问时间 3.访问资源 4.访问状态 5.本次流量
hive中的数据需要用mr进行处理,然后再导入表中
public static class MyMapper extends
load data inpath "/user/15910312725/logout/2015-03-30" into table logs partition (date='2015-03-30');
Mapper<LongWritable, Text, NullWritable, Text> {
SimpleDateFormat simpleDateFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss" , Locale.ENGLISH);
SimpleDateFormat simpleDateFormat2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss" );
Date parse;
String date;
@Override
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, NullWritable, Text>.Context context)
throws IOException, InterruptedException {
String line = value.toString();
Pattern p = Pattern
. compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}) - - \\[(.*) \\+0800\\] \"[A-Z]{1,4} (.*) HTTP/1.1\" ([0-9]*) ([0-9]*)");
Matcher m = p.matcher( line);
if (! m.find()) {
return;
}
String ip = m.group(1);
String time = m.group(2);
String url = m.group(3);
String status = m.group(4);
String traffic = m.group(5);
try {
parse = simpleDateFormat.parse( time);
date = simpleDateFormat2.format( parse);
} catch (ParseException e) {
e.printStackTrace();
}
context.write(NullWritable. get(), new Text(ip + "\t" + date + "\t" + url + "\t" + status + "\t" + traffic ));
}
}
hive创建表并导入数据
-- External, date-partitioned Hive table over the MapReduce-cleaned logs;
-- the five fields match the tab-separated record emitted by MyMapper above.
-- NOTE(review): `time` and `date` are reserved words in some Hive versions
-- and may need backtick-quoting — confirm against the target Hive release.
create external table logs(ip string,time string, url string, status string, traffic string) partitioned by (date string) row format delimited fields terminated by '\t' location '/user/15910312725/logout';
-- Load one day's cleaned output into its matching date partition.
load data inpath "/user/15910312725/logout/2015-03-30" into table logs partition (date='2015-03-30');
pig可直接导入原始数据,然后进行数据清洗
-- Load the raw access log; PigStorage(' ') splits each record on spaces into
-- ten fields (the bracketed timestamp, timezone and quoted request line are
-- themselves space-separated, hence the filler fields one/two/method/http).
logs_tmp = load 'access_2015_03_30.log' using PigStorage(' ') as
(ip:chararray,one:chararray,two:chararray,time:chararray,timezone:chararray,method:chararray,url:chararray,http:chararray,status:long,data:long);
-- Keep only the five fields the statistics below need.
-- FIX: the original projected from undefined alias `A`; the load alias is logs_tmp.
logs = foreach logs_tmp generate ip,time,url,status,data;
浏览量(pv)
定义:页面浏览量即为PV(Page View),是指所有用户浏览页面的总和,用户每打开一个页面就被记录1 次。
计算公式:记录计数
hive:
create table bbs_pv_2015_03_30 as select count(ip) as num from logs where date='2015-03-30';
pig:
-- Collapse all records into a single group, then count the bag: total PV.
logs_grouped = group logs all;
PV = foreach logs_grouped generate COUNT(logs);
独立IP数
定义:一天之内,访问网站的不同独立IP 个数加和。其中同一IP无论访问了几个页面,独立IP 数均为1。
公式:对ip去重求和
hive:
create table bbs_ip_2015_03_30 as select count(distinct ip) as num from logs where date='2015-03-30';
pig:
-- Distinct-ip count: group by ip, keep only the group keys (one row per
-- distinct ip), then collapse and count them.
logs_grouped_ip = group logs by ip;
logs_distinct_ip = foreach logs_grouped_ip generate group;
logs_distinct_ip_grouped = group logs_distinct_ip all;
UV = foreach logs_distinct_ip_grouped generate COUNT(logs_distinct_ip);
注册用户数
计算公式:对访问member.php?mod=register的url,计数
hive:
create table bbs_register_2015_03_30 as select count(1) as register from logs where date='2015-03-30' and instr(url,'member.php?mod=register')>0;
pig:
-- Registration count: split off the requests that hit the register page.
-- FIX: Pig's INDEXOF is 0-based and returns -1 when the substring is absent,
-- so a match at the very start of the url has index 0; the original
-- `>0` / `<0` split silently dropped such rows. Use >=0 on the match branch.
split logs into regester if (INDEXOF(url, 'member.php?mod=register')>=0), non if (INDEXOF(url, 'member.php?mod=register')<0);
-- Equivalent single-relation form (Pig comments use --, not //):
-- regester = filter logs by INDEXOF(url,'member.php?mod=register')>=0;
regester_grouped = group regester all;
regester_num = foreach regester_grouped generate COUNT(regester);
跳出率=跳出数/PV
计算公式:(1)统计一天内只出现一条访问记录的ip,称为跳出数
(2)跳出数/PV
hive:
create table bbs_jumper_2015_03_30 as select count(a.c) as jumper from (select ip,count(ip) as c from logs where date='2015-03-30' group by ip) as a where a.c=1;
pig:
-- Bounce count: ips that produced exactly one log record.
logs_grouped_ip = group logs by ip;
-- FIX: the original filtered `ip_num` without ever defining it in this
-- fragment; compute the per-ip record count first.
ip_num = foreach logs_grouped_ip generate COUNT(logs.ip) as num;
ip_num_1 = filter ip_num by num==1;
ip_num_1_grouped = group ip_num_1 all;
jumper = foreach ip_num_1_grouped generate COUNT(ip_num_1);
板块访问量pv
定义:一天之内每个板块的访问量
分析:板块的链接url:/forum-110-1.html,其中110表示板块的ID,1表示这个板块的第一页,只需要计算url中存在/forum-字符串的数据
公式:对板块分组,求每个板块的访问量
hive:
create table bbs_band_ip_2015_03_30 as select b.band as band,count(ip) as ip_num from (select regexp_extract(url,'/forum-(\\d*)-(\\d*)',1) as band,ip from logs where date='2015-03-30' and instr(url,'/forum-')>0) as b group by b.band;
pig:
-- Per-board page views. A board url looks like /forum-110-1.html;
-- REGEX_EXTRACT(..., 1) yields the board id, or '' when the url is not a
-- board page.
-- FIX: the non-matching branch of the original split dropped the leading '/'
-- from the pattern; both branches now use the same pattern.
split logs into logs_contain_band if ((REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1))!=''), non if ((REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1))=='');
-- Equivalent filter form:
-- logs_contain_band = filter logs by REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1)!='';
-- FIX: the `band` definition was fused onto a //-style comment line in the
-- original (Pig only supports -- and /* */ comments), leaving the alias
-- undefined for the statements below; restored as a real statement.
band = foreach logs_contain_band generate REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1) as b;
band_grouped = group band by b;
-- FIX: COUNT(R.b) referenced undefined alias `R`; the grouped bag is `band`.
band_PV = foreach band_grouped generate group,COUNT(band.b);
板块独立IP访问量
定义:一天之内每个板块的独立IP访问量
分析:板块的链接url:/forum-110-1.html,其中110表示板块的ID,1表示这个板块的第一页,只需要计算url中存在/forum-字符串的数据,然后对IP进行去重
公式:对板块分组,IP去重,求每个板块的独立IP访问量
hive:
create table bbs_band_pv_2015_03_30 as select b.band as band,count(distinct ip) as pv_num from (select regexp_extract(url,'/forum-(\\d*)-(\\d*)',1) as band,ip from logs where date='2015-03-30' and instr(url,'/forum-')>0) as b group by b.band;
pig:
-- Per-board distinct-ip count: project (board id, ip) pairs, dedupe them so
-- each ip counts once per board, then count per board.
band_all_ip = foreach logs_contain_band generate REGEX_EXTRACT(url,'/forum-([0-9]*)-([0-9]*)',1) as b,ip;
-- FIX: `band_distinct_ip` was used below without ever being defined in this
-- fragment; the missing dedup step is restored here. The pattern also gains
-- the leading '/' for consistency with the split that built logs_contain_band.
band_distinct_ip = distinct band_all_ip;
band_distinct_ip_grouped = group band_distinct_ip by b;
band_UV = foreach band_distinct_ip_grouped generate group,COUNT(band_distinct_ip.b);
汇总表(bbs_info)
hive: 日期 log_date、浏览量 pv、注册用户数 reguser、独立ip数 ip、跳出数 jumper
create table bbs_info as select '2015_03_30',a.dist_ip,b.num,c.register,d.jumper from bbs_ip_2015_03_30 a join bbs_pv_2015_03_30 b on 1=1 join bbs_register_2015_03_30 c on 1=1 join bbs_jumper_2015_03_30 d on 1=1;
pig:
bbs_info = join PV by '1=1', UV by '1=1', jumper_num by '1=1';
版块访问表(bbs_forum)
hive: 日期 log_date、版块 forum、独立ip ip、浏览量 pv
create table bbs_forum as select a.band,ip_num,pv_num from bbs_band_ip_2015_03_30 as a join bbs_band_pv_2015_03_30 as b on a.band=b.band;
pig:
bbs_forum = join band_PV by $0, band_UV by $0;