多维度统计分析茄子快传用户信息
数据截图:
1.MR编程,将JSON数据转化,剔除脏数据,上传到Linux的
package nuc.edu.ls;
import java.io.IOException;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
public class AppLogClean {
public static class MapTask extends Mapper<LongWritable, Text, Text, NullWritable> {
StringBuilder sb = new StringBuilder();
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context)
throws IOException, InterruptedException {
// 得到每行数据
String line = value.toString();
JSONObject ob1 = JSON.parseObject(line);
if (ob1 != null) {
JSONObject ob2 = ob1.getJSONObject("header");
if (ob2 != null) {
// 关键数据是否有丢失
// release_channel,device_id,city,device_id_type,app_ver_name
// 这几个字段如果缺失,则过滤
if (StringUtils.isBlank(ob2.getString("release_channel"))
|| StringUtils.isBlank(ob2.getString("device_id"))
|| StringUtils.isBlank(ob2.getString("city"))
|| StringUtils.isBlank(ob2.getString("device_id_type"))
|| StringUtils.isBlank(ob2.getString("app_ver_name"))
|| StringUtils.isBlank(ob2.getString("os_name"))
|| StringUtils.isBlank(ob2.getString("mac"))) {
return;
}
if (ob2.getString("app_ver_name").equals("android")) {
if (StringUtils.isBlank(ob2.getString("android_id"))) {
return;
}
}
sb.append(ob2.getString("cid_sn")).append(",");
sb.append(ob2.getString("mobile_data_type")).append(",");
sb.append(ob2.getString("os_ver")).append(",");
sb.append(ob2.getString("mac")).append(",");
sb.append(ob2.getString("resolution")).append(",");
sb.append(ob2.getString("commit_time")).append(",");
sb.append(ob2.getString("sdk_ver")).append(",");
sb.append(ob2.getString("device_id_type")).append(",");
sb.append(ob2.getString("city")).append(",");
sb.append(ob2.getString("android_id")).append(",");
sb.append(ob2.getString("device_model")).append(",");
sb.append(ob2.getString("carrier")).append(",");
sb.append(ob2.getString("promotion_channel")).append(",");
sb.append(ob2.getString("app_ver_name")).append(",");
sb.append(ob2.getString("imei")).append(",");
sb.append(ob2.getString("app_ver_code")).append(",");
sb.append(ob2.getString("pid")).append(",");
sb.append(ob2.getString("net_type")).append(",");
sb.append(ob2.getString("device_id")).append(",");
sb.append(ob2.getString("app_device_id")).append(",");
sb.append(ob2.getString("release_channel")).append(",");
sb.append(ob2.getString("country")).append(",");
sb.append(ob2.getString("time_zone")).append(",");
sb.append(ob2.getString("os_name")).append(",");
sb.append(ob2.getString("manufacture")).append(",");
sb.append(ob2.getString("commit_id")).append(",");
sb.append(ob2.getString("app_token")).append(",");
sb.append(ob2.getString("account")).append(",");
sb.append(ob2.getString("app_id")).append(",");
sb.append(ob2.getString("build_num")).append(",");
sb.append(ob2.getString("language")).append(",");
String uid = ob2.getString("mac");// ???
sb.append(uid);
k.set(sb.toString());
context.write(k, NullWritable.get());
// 清除sb的数据
sb.delete(0, sb.length());
}
}
}
}
public static void main(String[] args) throws Exception {
System.setProperty("HADDOP_USER_NAME", "root");
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 设置map,设置driver,设置输出类型。。。
job.setJarByClass(AppLogClean.class);
job.setMapperClass(MapTask.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
FileInputFormat.setInputDirRecursive(job, true);
FileInputFormat.addInputPath(job, new Path("d:/AppData/appuserdata/input/"));
FileOutputFormat.setOutputPath(job, new Path("d:/AppData/appuserdata/out"));
// 不需要reduce 可以设置为0
job.setNumReduceTasks(0);
boolean ret = job.waitForCompletion(true);
System.exit(ret ? 0 : 1);
}
}
2.使用蜂房进行各种数据统计
每天的活跃用户
/** 创建表 用来加载清洗好的数据 **/
create table ods_app_log(
cid_sn string,
mobile_data_type string,
os_ver string,
mac string,
resolution string,
commit_time string,
sdk_ver string,
device_id_type string,
city string,
android_id string,
device_model string,
carrier string,
promotion_channel string,
app_ver_name string,
imei string,
app_ver_code string,
pid string,
net_type string,
device_id string,
app_device_id string,
release_channel string,
country string,
time_zone string,
os_name string,
manufacture string,
commit_id string,
app_token string,
account string,
app_id string,
build_num string,
language string,
uid string
)
partitioned by(day string)
row format delimited
fields terminated by ",";
/** 加载数据 **/
load data local inpath "/root/data/20170101" into table ods_app_log partition(day="20170101");
/** 创建活跃用户的表 **/
create table etl_user_active_day(
uid string,
commit_time string,
city string,
release_channel string,
app_ver_name string
)partitioned by (day string);
/** 计算数据到活跃用户表 **/
/** 20170101 **/
insert into table etl_user_active_day partition(day="20170101")
select uid,commit_time,city , release_channel,app_ver_name
from
(select
uid,commit_time, city , release_channel,app_ver_name,
row_number() over(partition by uid order by commit_time) as rn
from ods_app_log where day="20170101") tmp1
where tmp1.rn=1;
insert into table etl_user_active_day partition(day="20170102")
select uid,commit_time,city , release_channel,app_ver_name
from
(select
uid,commit_time, city , release_channel,app_ver_name,
row_number() over(partition by uid order by commit_time) as rn
from ods_app_log where day="20170102") tmp1
where tmp1.rn=1;
/** 分维度进行统计 **/
时间 城市 渠道 版本 活跃用户
当天 0 0 0
当天 0 0 1
当天 0 1 0
当天 0 1 1
当天 1 0 0
当天 1 0 1
当天 1 1 0
当天 1 1 1
/** 创建维度统计表 **/
create table dim_user_active(
city string,
release_channel string,
app_ver_name string,
active_user_cnt int
)partitioned by (day string,flag string);
//多重插入 读一次数据,多次计算
from etl_user_active_day
insert into table dim_user_active partition(day="20170101",flag="000")
select "all","all","all",count(1) where day="20170101"
insert into table dim_user_active partition(day="20170101",flag="001")
select "all","all",app_ver_name,count(1) where day="20170101"
group by app_ver_name
insert into table dim_user_active partition(day="20170101",flag="010")
select "all",release_channel,"all",count(1) where day="20170101"
group by release_channel
insert into table dim_user_active partition(day="20170101",flag="011")
select "all",release_channel,app_ver_name,count(1) where day="20170101"
group by release_channel,app_ver_name
insert into table dim_user_active partition(day="20170101",flag="100")
select city,"all","all",count(1) where day="20170101"
group by city
insert into table dim_user_active partition(day="20170101",flag="101")
select city,"all",app_ver_name,count(1) where day="20170101"
group by city,app_ver_name
insert into table dim_user_active partition(day="20170101",flag="110")
select city,release_channel,"all",count(1) where day="20170101"
group by city,release_channel
insert into table dim_user_active partition(day="20170101",flag="111")
select city,release_channel,app_ver_name,count(1) where day="20170101"
group by city,release_channel,app_ver_name;