数据采集到数据分析案例
ETC
模拟ETC流程
0.Flume采集
1.将数据通过Flume从指定位置采集到hdfs(/app-log-data/data/2019-07-*);
2.将mr程序打成jar备用,
package com.initialize;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
public class AppLogDataClean {
public static class AppLogDataCleanMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
Text k = null;
NullWritable v = null;
SimpleDateFormat sdf = null;
MultipleOutputs<Text, NullWritable> mos = null;//多路输出器
@Override
protected void setup(Context context) throws IOException, InterruptedException {
k = new Text();
v = NullWritable.get();
sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
mos = new MultipleOutputs<Text, NullWritable>(context);
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
JSONObject jsonObj = JSON.parseObject(value.toString());
JSONObject headerObj = jsonObj.getJSONObject(GlobalConstants.HEADER);
/**
* 过滤缺失必选字段的记录
*/
if(null == headerObj.getString("sdk_ver") || "".equals(headerObj.getString("sdk_ver"))){return;}
if(null == headerObj.getString("time_zone") || "".equals(headerObj.getString("time_zone"))){return;}
if(null == headerObj.getString("commit_id") || "".equals(headerObj.getString("commit_id"))){return;}
if(null == headerObj.getString("commit_time") || "".equals(headerObj.getString("commit_time"))){return;}
else{
//练习时追加的逻辑,替换掉原始数据中的时间撮
String commit_time = headerObj.getString("commit_time");
String format = sdf.format(new Date(new Date().getTime() - 24 * 60 * 60 * 1000L));
//String format = sdf.format(new Date());
headerObj.put("commit_time", format);
}
if(null == headerObj.getString("pid") || "".equals(headerObj.getString("pid"))){return;}
if(null == headerObj.getString("app_token") || "".equals(headerObj.getString("app_token"))){return;}
if(null == headerObj.getString("app_id") || "".equals(headerObj.getString("app_id"))){return;}
if(null == headerObj.getString("device_id") || "".equals(headerObj.getString("device_id"))){return;}
if(null == headerObj.getString("device_id_type") || "".equals(headerObj.getString("device_id_type"))){return;}
if(null == headerObj.getString("release_channel") || "".equals(headerObj.getString("release_channel"))){return;}
if(null == headerObj.getString("app_ver_name") || "".equals(headerObj.getString("app_ver_name"))){return;}
if(null == headerObj.getString("app_ver_code") || "".equals(headerObj.getString("app_ver_code"))){return;}
if(null == headerObj.getString("os_name") || "".equals(headerObj.getString("os_name"))){return;}
if(null == headerObj.getString("os_ver") || "".equals(headerObj.getString("os_ver"))){return;}
if(null == headerObj.getString("language") || "".equals(headerObj.getString("language"))){return;}
if(null == headerObj.getString("country") || "".equals(headerObj.getString("country"))){return;}
if(null == headerObj.getString("manufacture") || "".equals(headerObj.getString("manufacture"))){return;}
if(null == headerObj.getString("device_model") || "".equals(headerObj.getString("device_model"))){return;}
if(null == headerObj.getString("resolution") || "".equals(headerObj.getString("resolution"))){return;}
if(null == headerObj.getString("net_type") || "".equals(headerObj.getString("net_type"))){return;}
/**
* 生成user_id
*/
String user_id = "";
if("android".equals(headerObj.getString("os_name").trim())){
user_id = StringUtils.isNotBlank(headerObj.getString("android_id"))?headerObj.getString("android_id"):headerObj.getString("device_id");
}else{
user_id = headerObj.getString("device_id");
}
/**
* 输出结果
*/
headerObj.put("user_id", user_id);
k.set(JsonToStringUtil.toString(headerObj));
if("android".equals(headerObj.getString("os_name"))){
mos.write(k, v, "android/android");
}else {
mos.write(k, v, "ios/ios");
}
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
mos.close();
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(AppLogDataClean.class);
job.setMapperClass(AppLogDataCleanMapper.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
//job.setOutputFormatClass(TextOutputFormat.class); //默认
//避免生成默认的part-m-00000文件,因为,数据已经交给MultipleOutputs输出了
LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
boolean res = job.waitForCompletion(true);
System.exit(res? 0:1);
}
}
package com.initialize;
public class GlobalConstants {
public static final String HEADER = "header";
}
package com.initialize;
import com.alibaba.fastjson.JSONObject;
public class JsonToStringUtil {
public static String toString(JSONObject jsonObj) {
StringBuilder sb = new StringBuilder();
sb.append(jsonObj.getString("sdk_ver")).append("\001")
.append(jsonObj.getString("time_zone")).append("\001")
.append(jsonObj.getString("commit_id")).append("\001")
.append(jsonObj.getString("commit_time")).append("\001")
.append(jsonObj.getString("pid")).append("\001")
.append(jsonObj.getString("app_token")).append("\001")
.append(jsonObj.getString("app_id")).append("\001")
.append(jsonObj.getString("device_id")).append("\001")
.append(jsonObj.getString("device_id_type")).append("\001")
.append(jsonObj.getString("release_channel")).append("\001")
.append(jsonObj.getString("app_ver_name")).append("\001")
.append(jsonObj.getString("app_ver_code")).append("\001")
.append(jsonObj.getString("os_name")).append("\001")
.append(jsonObj.getString("os_ver")).append("\001")
.append(jsonObj.getString("language")).append("\001")
.append(jsonObj.getString("country")).append("\001")
.append(jsonObj.getString("manufacture")).append("\001")
.append(jsonObj.getString("device_model")).append("\001")
.append(jsonObj.getString("resolution")).append("\001")
.append(jsonObj.getString("net_type")).append("\001")
.append(jsonObj.getString("account")).append("\001")
.append(jsonObj.getString("app_device_id")).append("\001")
.append(jsonObj.getString("mac")).append("\001")
.append(jsonObj.getString("android_id")).append("\001")
.append(jsonObj.getString("imei")).append("\001")
.append(jsonObj.getString("cid_sn")).append("\001")
.append(jsonObj.getString("build_num")).append("\001")
.append(jsonObj.getString("mobile_data_type")).append("\001")
.append(jsonObj.getString("promotion_channel")).append("\001")
.append(jsonObj.getString("carrier")).append("\001")
.append(jsonObj.getString("city")).append("\001")
.append(jsonObj.getString("user_id"))
;
return sb.toString();
}
}
3.编写shell脚本,定时进行ETC.
#!/bin/bash
#day_str=`date +'%Y-%m-%d'`
day_str=`date +'%Y-%m-%d' -d '-1 days'`
inpath=/app-log-data/data/$day_str
outpath=/app-log-data/clean/$day_str
echo "准备清洗$day_str 的数据......"
/appdata/hadoop/bin/hadoop jar /home/lys/app-data-24-1.0-SNAPSHOT.jar com.initialize.AppLogDataClean $inpath $outpath
4.运行脚本
sh ./action_day.sh
5.运行结果
日活用户线观统计
各维度组合分析:
不区分操作系统os_name 不区分城市city 不区分渠道release_channel 不区分版本app_ver_name 活跃用户
区分操作系统os_name 不区分城市city 不区分渠道release_channel 不区分版本app_ver_name 活跃用户
不区分操作系统os_name 区分城市city 不区分渠道release_channel 不区分版本app_ver_name 活跃用户
不区分操作系统os_name 不区分城市city 区分渠道release_channel 不区分版本app_ver_name 活跃用户
不区分操作系统os_name 不区分城市city 不区分渠道release_channel 区分版本app_ver_name 活跃用户
区分操作系统os_name 区分城市city 不区分渠道release_channel 不区分版本app_ver_name 活跃用户
…
维度组合统计
0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1
0.创建ods_app_log表
CREATE TABLE ods_app_log (
sdk_ver string
,time_zone string
,commit_id string
,commit_time string
,pid string
,app_token string
,app_id string
,device_id string
,device_id_type string
,release_channel string
,app_ver_name string
,app_ver_code string
,os_name string
,os_ver string
,language string
,country string
,manufacture string
,device_model string
,resolution string
,net_type string
,account string
,app_device_id string
,mac string
,android_id string
,imei string
,cid_sn string
,build_num string
,mobile_data_type string
,promotion_channel string
,carrier string
,city string
,user_id string
) partitioned BY (day string, os string) row format delimited fields terminated BY '\001';
导入数据
alter table ods_app_log add partition(day=‘2019-07-21’,os=‘android’) location ‘/app-log-data/clean/2019-07-21/android’;
展示分区
show partitions ods_app_log;
删除分区
alter table ods_app_log drop partition (day=‘2017-09-22’,os=‘android’);
1/ 把当天的活跃用户信息抽取出来,存入一个日活用户信息表
1.1/ 建日活用户信息表
CREATE TABLE etl_user_active_day (
sdk_ver string
,time_zone string
,commit_id string
,commit_time string
,pid string
,app_token string
,app_id string
,device_id string
,device_id_type string
,release_channel string
,app_ver_name string
,app_ver_code string
,os_name string
,os_ver string
,language string
,country string
,manufacture string
,device_model string
,resolution string
,net_type string
,account string
,app_device_id string
,mac string
,android_id string
,imei string
,cid_sn string
,build_num string
,mobile_data_type string
,promotion_channel string
,carrier string
,city string
,user_id string
) partitioned BY (day string) row format delimited fields terminated BY '\001';
1.2 从ods_app_log原始数据表的当天分区中,抽取当日的日活用户信息插入日活用户信息表etl_user_active_day
注意点:每个活跃用户抽取他当天所有记录中时间最早的一条;
INSERT INTO TABLE etl_user_active_day PARTITION (day = '2019-07-22')
SELECT sdk_ver
,time_zone
,commit_id
,commit_time
,pid
,app_token
,app_id
,device_id
,device_id_type
,release_channel
,app_ver_name
,app_ver_code
,os_name
,os_ver
,LANGUAGE
,country
,manufacture
,device_model
,resolution
,net_type
,account
,app_device_id
,mac
,android_id
,imei
,cid_sn
,build_num
,mobile_data_type
,promotion_channel
,carrier
,city
,user_id
FROM (
SELECT *
,row_number() OVER (
PARTITION BY user_id ORDER BY commit_time
) AS rn
FROM ods_app_log
WHERE day = '2019-07-22'
) tmp
WHERE rn = 1;
维度统计
建维度统计结果表 dim_user_active_day
DROP TABLE dim_user_active_day;
CREATE TABLE dim_user_active_day (
os_name string
,city string
,release_channel string
,app_ver_name string
,cnts INT
) partitioned BY (
day string
,dim string
);
利用多重insert语法来统计各种维度组合的日活用户数,并插入到日活维度统计表的各分区中;
FROM etl_user_active_day
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0000'
)
SELECT 'all'
,'all'
,'all'
,'all'
,count(1)
WHERE day = '2019-07-21'
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1000'
)
SELECT os_name
,'all'
,'all'
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY (os_name)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0100'
)
SELECT 'all'
,city
,'all'
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY (city)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0010'
)
SELECT 'all'
,'all'
,release_channel
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY (release_channel)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0001'
)
SELECT 'all'
,'all'
,'all'
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY (app_ver_name)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1100'
)
SELECT os_name
,city
,'all'
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1010'
)
SELECT os_name
,'all'
,release_channel
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,release_channel
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1001'
)
SELECT os_name
,'all'
,'all'
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,app_ver_name
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0110'
)
SELECT 'all'
,city
,release_channel
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY city,release_channel
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0101'
)
SELECT 'all'
,city
,'all'
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY city,app_ver_name
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0011'
)
SELECT 'all'
,'all'
,release_channel
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY release_channel,app_ver_name
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '0111'
)
SELECT 'all'
,city
,release_channel
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY city,release_channel,app_ver_name
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1011'
)
SELECT os_name
,'all'
,release_channel
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,release_channel,app_ver_name
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1101'
)
SELECT os_name
,city
,'all'
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,app_ver_name
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1110'
)
SELECT os_name
,city
,release_channel
,'all'
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,release_channel
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2019-07-21'
,dim = '1111'
)
SELECT os_name
,city
,release_channel
,app_ver_name
,count(1)
WHERE day = '2019-07-21'
GROUP BY os_name,city,release_channel,app_ver_name
;
日新用户相关统计
日新:当日第一次出现的用户–当日的新增用户
思路: a、应该建立一个历史用户表(只存user_id)
b、将当日的活跃用户去 比对 历史用户表, 就知道哪些人是今天新出现的用户 --> 当日新增用户
c、将当日新增用户追加到历史用户表
历史用户表
create table etl_user_history(user_id string);
当日新增用户表
:存所有字段(每个人时间最早的一条),带有一个分区字段:day string;
create table etl_user_new_day like etl_user_active_day;
1 当日活跃-历史用户表 --> 新增用户表的当日分区
insert into etl_user_new_day partition(day='2019-07-21')
SELECT sdk_ver
,time_zone
,commit_id
,commit_time
,pid
,app_token
,app_id
,device_id
,device_id_type
,release_channel
,app_ver_name
,app_ver_code
,os_name
,os_ver
,LANGUAGE
,country
,manufacture
,device_model
,resolution
,net_type
,account
,app_device_id
,mac
,android_id
,imei
,cid_sn
,build_num
,mobile_data_type
,promotion_channel
,carrier
,city
,a.user_id
from etl_user_active_day a left join etl_user_history b on a.user_id = b.user_id
where a.day='2019-07-21' and b.user_id is null;
2 将当日新增用户的user_id追加到历史表
insert into table etl_user_history
select user_id from etl_user_new_day where day=‘2019-07-21’;
日新:维度统计报表
思路: a、从日新etl表中,按照维度组合,统计出各种维度组合下的新用户数量
维度:
os_name city release_channel app_ver_name
维度组合统计
0 0 0 0
0 0 0 1
0 0 1 0
0 0 1 1
0 1 0 0
0 1 0 1
0 1 1 0
0 1 1 1
1 0 0 0
1 0 0 1
1 0 1 0
1 0 1 1
1 1 0 0
1 1 0 1
1 1 1 0
1 1 1 1
1 日新维度统计报表–数据建模
create table dim_user_new_day(os_name string,city string,release_channel string,app_ver_name string,cnts int)
partitioned by (day string, dim string);
2 日新维度统计报表sql开发(利用多重插入语法)
from etl_user_new_day
insert into table dim_user_new_day partition(day='2017-09-21',dim='0000')
select 'all','all','all','all',count(1)
where day='2019-07-21'
insert into table dim_user_new_day partition(day='2017-09-21',dim='0001')
select 'all','all','all',app_ver_name,count(1)
where day='2019-07-21'
group by app_ver_name
insert into table dim_user_new_day partition(day='2017-09-21',dim='0010')
select 'all','all',release_channel,'all',count(1)
where day='2019-07-21'
group by release_channel
insert into table dim_user_new_day partition(day='2017-09-21',dim='0011')
select 'all','all',release_channel,app_ver_name,count(1)
where day='2019-07-21'
group by release_channel,app_ver_name
insert into table dim_user_new_day partition(day='2017-09-21',dim='0100')
select 'all',city,'all','all',count(1)
where day='2019-07-21'
group by city
;
次日留存用户分析
概念:昨日新增,今日还活跃
逻辑思路:昨天在新用户表中,今天在活跃用户表中 --> 今日的“次日留存用户”
昨天的新用户表中,存在于今天的活跃用户表中的人 --> 今日的“次日留存用户”
数据建模
建次日留存etl信息表
记录跟活跃用户表相同的字段
create table etl_user_keepalive_nextday like etl_user_active_day;
etl开发
insert into table etl_user_keepalive_nextday partition(day='2019-07-22')
select
actuser.sdk_ver
,actuser.time_zone
,actuser.commit_id
,actuser.commit_time
,actuser.pid
,actuser.app_token
,actuser.app_id
,actuser.device_id
,actuser.device_id_type
,actuser.release_channel
,actuser.app_ver_name
,actuser.app_ver_code
,actuser.os_name
,actuser.os_ver
,actuser.language
,actuser.country
,actuser.manufacture
,actuser.device_model
,actuser.resolution
,actuser.net_type
,actuser.account
,actuser.app_device_id
,actuser.mac
,actuser.android_id
,actuser.imei
,actuser.cid_sn
,actuser.build_num
,actuser.mobile_data_type
,actuser.promotion_channel
,actuser.carrier
,actuser.city
,actuser.user_id
from etl_user_new_day newuser join etl_user_active_day actuser
on newuser.user_id = actuser.user_id
where newuser.day='2019-07-21' and actuser.day='2019-07-22';
删除指定分区数据
ALTER TABLE etl_user_keepalive_nextday DROP IF EXISTS PARTITION(day=‘2019-07-22’);
用左半连接效率略高
insert into table etl_user_keepalive_nextday partition(day='2019-07-22')
select
sdk_ver
,time_zone
,commit_id
,commit_time
,pid
,app_token
,app_id
,device_id
,device_id_type
,release_channel
,app_ver_name
,app_ver_code
,os_name
,os_ver
,language
,country
,manufacture
,device_model
,resolution
,net_type
,account
,app_device_id
,mac
,android_id
,imei
,cid_sn
,build_num
,mobile_data_type
,promotion_channel
,carrier
,city
,user_id
from etl_user_new_day a left semi join etl_user_active_day b
on a.user_id = b.user_id and a.day='2019-07-21' and b.day='2019-07-22';
where a.day='2019-07-21' and b.day='2019-07-22'; // 注意:left semi join中,右表的引用不能出现在where条件中
维度统计
利用多重插入语法
版本轨迹分析
利用hive的窗口分析函数解决问题
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,赵老师,共享女友,安智市场,北京,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,360应用,天津,v1.2
2017-08-14,许老师,共享女友,小米应用,天津,v2.0
解决方案:
create table t_lag_test(day string,user_id string,app_token string,release_channel string,city string,app_ver_name string)
row format delimited fields terminated by ',';
load data local inpath '/home/lys/ver.test' into table t_lag_test;
select
day,user_id,app_token,release_channel,city,ver_2,app_ver_name
from
(
select
day,user_id,app_token,release_channel,city,app_ver_name,
lag(app_ver_name,1,null) over(partition by user_id order by app_ver_name) as ver_2
from t_lag_test) tmp
where ver_2 is not null and app_ver_name>ver_2
;
补充hive的窗口分析函数
测试使用数据
+----------------+---------------------------------+-----------------------+--------------+--+
| t_access.ip | t_access.url | t_access.access_time | t_access.dt |
+----------------+---------------------------------+-----------------------+--------------+--+
| 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 20170804 |
| 192.168.33.3 | http://www.edu360.cn/teach | 2017-08-04 15:35:20 | 20170804 |
| 192.168.33.4 | http://www.edu360.cn/stu | 2017-08-04 15:30:20 | 20170804 |
| 192.168.33.4 | http://www.edu360.cn/job | 2017-08-04 16:30:20 | 20170804 |
| 192.168.33.5 | http://www.edu360.cn/job | 2017-08-04 15:40:20 | 20170804 |
| 192.168.33.3 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 20170805 |
| 192.168.44.3 | http://www.edu360.cn/teach | 2017-08-05 15:35:20 | 20170805 |
| 192.168.33.44 | http://www.edu360.cn/stu | 2017-08-05 15:30:20 | 20170805 |
| 192.168.33.46 | http://www.edu360.cn/job | 2017-08-05 16:30:20 | 20170805 |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-05 15:40:20 | 20170805 |
| 192.168.133.3 | http://www.edu360.cn/register | 2017-08-06 15:30:20 | 20170806 |
| 192.168.111.3 | http://www.edu360.cn/register | 2017-08-06 15:35:20 | 20170806 |
| 192.168.34.44 | http://www.edu360.cn/pay | 2017-08-06 15:30:20 | 20170806 |
| 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 20170806 |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 20170806 |
| 192.168.33.46 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 20170806 |
| 192.168.33.25 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 20170806 |
| 192.168.33.36 | http://www.edu360.cn/excersize | 2017-08-06 16:30:20 | 20170806 |
| 192.168.33.55 | http://www.edu360.cn/job | 2017-08-06 15:40:20 | 20170806 |
+----------------+---------------------------------+-----------------------+--------------+--+
执行命令查看测试数据:
select * from t_access;
LAG函数
作用:按照指定列进行分区,将a列(不同于前一列)数据拷贝作为b列,让b列第1行数据与a列第n+1行数据向对应,以此类推。a列前n行数据与默认数据相对应。
应用场景:版本升级轨迹查询。
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lag(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
LEAD函数
与LAG作用相反。
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lead(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
FIRST_VALUE 函数
例:取每个用户访问的第一个页面
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
first_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following)as last_access_time
from t_access;
LAST_VALUE 函数
例:取每个用户访问的最后一个页面
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
last_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following)as last_access_time
from t_access;
sum() over() 函数
累计报表–分析函数实现版
select id
,month
,sum(amount) over(partition by id order by month rows between unbounded preceding and current row)
from
(select id,month,
sum(fee) as amount
from t_test
group by id,month) tmp;
序列号打印相关函数
row_number() over():比较相等,按任意顺序排列
rank() over():比较相等,序列号相同,默认占去后面的序列号。
dense_rank() over():比较相等,序列号相同,默认不占去后面的序列号。
求薪资排名中位于前1/3的人
ntile(3) over():将总数量的1/n划分为一个相同的序列号。
score rownumber rankover dense_rank ntile
89 1 1 1 1
90 2 2 2 1
90 3 2 2 1
91 4 4 3 2
92 5 5 4 2
92 6 5 4 2
93 7 7 5 3