旅游集市数仓建设

hive> create database ods;
OK
Time taken: 0.102 seconds
hive> create database dwd;
OK
Time taken: 0.017 seconds
hive> create database dwm;
OK
Time taken: 0.031 seconds
hive> create database dws;
OK
Time taken: 0.017 seconds
hive> create database ads;
OK
Time taken: 0.018 seconds
hive> create database dim;
OK
Time taken: 0.031 seconds
hive> show databases;
OK
ads
default
dim
dwd
dwm
dws
ods
Time taken: 0.005 seconds, Fetched: 7 row(s)
[root@master soft]# cd ctyun/
[root@master ctyun]# ls
dim_usertag_msk_m  ods_ddr  ods_oidd
dwd_merge          ods_dpi  ods_wcdr
hive> use ods;
OK
Time taken: 0.046 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS ods.ods_oidd(
    >     mdn string comment '手机号码'  
    >     ,start_time string comment '业务开始时间'  
    >     ,county_id string comment '区县编码'  
    >     ,longi string comment '经度'  
    >     ,lati string comment '纬度'  
    >     ,bsid string comment '基站标识'  
    >     ,grid_id string comment '网格号'  
    >     ,biz_type string comment '业务类型'  
    >     ,event_type string comment '事件类型'  
    >     ,data_source string comment '数据源'  
    > ) 
    > comment  'oidd位置数据表'
    > PARTITIONED BY (
    >     day_id string comment '天分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
    >     OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'  
    > location '/data/tour/ods/ods_oidd'; 
OK
Time taken: 0.13 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS ods.ods_wcdr (
    >     mdn string comment '手机号码'  
    >     ,start_time string comment '业务开始时间'  
    >     ,county_id string comment '区县编码'  
    >     ,longi string comment '经度'  
    >     ,lati string comment '纬度'  
    >     ,bsid string comment '基站标识'  
    >     ,grid_id string comment '网格号'  
    >     ,biz_type string comment '业务类型'  
    >     ,event_type string comment '事件类型'  
    >     ,data_source string comment '数据源'  
    > ) 
    > comment  'wcdr位置数据表'
    > PARTITIONED BY (
    >     day_id string comment '天分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
    >     OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'  
    > location '/data/tour/ods/ods_wcdr'; 
OK
Time taken: 0.037 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS ods.ods_ddr(
    >     mdn string comment '手机号码'  
    >     ,start_time string comment '业务开始时间'  
    >     ,county_id string comment '区县编码'  
    >     ,longi string comment '经度'  
    >     ,lati string comment '纬度'  
    >     ,bsid string comment '基站标识'  
    >     ,grid_id string comment '网格号'  
    >     ,biz_type string comment '业务类型'  
    >     ,event_type string comment '事件类型'  
    >     ,data_source string comment '数据源'  
    > ) 
    > comment  'ddr位置数据表'
    > PARTITIONED BY (
    >     day_id string comment '天分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
    >     OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'  
    > location '/data/tour/ods/ods_ddr'; 
OK
Time taken: 0.035 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS ods.ods_dpi(
    >     mdn string comment '手机号码'  
    >     ,start_time string comment '业务开始时间'  
    >     ,county_id string comment '区县编码'  
    >     ,longi string comment '经度'  
    >     ,lati string comment '纬度'  
    >     ,bsid string comment '基站标识'  
    >     ,grid_id string comment '网格号'  
    >     ,biz_type string comment '业务类型'  
    >     ,event_type string comment '事件类型'  
    >     ,data_source string comment '数据源'  
    > ) 
    > comment  'dpi位置数据表'
    > PARTITIONED BY (
    >     day_id string comment '天分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' 
    >     OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'  
    > location '/data/tour/ods/ods_dpi'; 
OK
Time taken: 0.042 seconds
hive> show tables;
OK
ods_ddr
ods_dpi
ods_oidd
ods_wcdr
Time taken: 0.012 seconds, Fetched: 4 row(s)
hive> load data local inpath '/usr/local/soft/ctyun/ods_oidd/day_id=20180503/*' into table ods_oidd partition(day_id=20180503);
Loading data to table ods.ods_oidd partition (day_id=20180503)
Partition ods.ods_oidd{day_id=20180503} stats: [numFiles=10, totalSize=789852206]
OK
Time taken: 9.502 seconds
hive> select count(*) from ods_oidd;
OK
5196414
Time taken: 28.53 seconds, Fetched: 1 row(s)
hive> load data local inpath '/usr/local/soft/ctyun/ods_wcdr/day_id=20180503/*' into table ods_wcdr partition(day_id=20180503);
Loading data to table ods.ods_wcdr partition (day_id=20180503)
Partition ods.ods_wcdr{day_id=20180503} stats: [numFiles=10, totalSize=790241502]
OK
Time taken: 7.215 seconds
hive> load data local inpath '/usr/local/soft/ctyun/ods_ddr/day_id=20180503/*' into table ods_ddr partition(day_id=20180503);
Loading data to table ods.ods_ddr partition (day_id=20180503)
Partition ods.ods_ddr{day_id=20180503} stats: [numFiles=10, totalSize=784969334]
OK
Time taken: 6.652 seconds
hive> load data local inpath '/usr/local/soft/ctyun/ods_dpi/day_id=20180503/*' into table ods_dpi partition(day_id=20180503);
Loading data to table ods.ods_dpi partition (day_id=20180503)
Partition ods.ods_dpi{day_id=20180503} stats: [numFiles=10, totalSize=784530290]
OK
Time taken: 7.367 seconds
hive> use dwd;
OK
Time taken: 0.013 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS dwd.dwd_res_regn_mergelocation_msk_d (
    >     mdn string comment '手机号码'  
    >     ,start_time string comment '业务开始时间'  
    >     ,county_id string comment '区县编码'  
    >     ,longi string comment '经度'  
    >     ,lati string comment '纬度'  
    >     ,bsid string comment '基站标识'  
    >     ,grid_id string comment '网格号'  
    >     ,biz_type string comment '业务类型'  
    >     ,event_type string comment '事件类型'  
    >     ,data_source string comment '数据源'  
    > ) 
    > comment  '位置数据融合表'
    > PARTITIONED BY (
    >     day_id string comment '天分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS ORC
    > location '/data/tour/dwd/dwd_res_regn_mergelocation_msk_d'; 
OK
Time taken: 0.062 seconds
hive> alter table dwd.dwd_res_regn_mergelocation_msk_d add partition(day_id=20180503);
OK
Time taken: 0.041 seconds
hive> insert into table dwd.dwd_res_regn_mergelocation_msk_d partition(day_id="20180503")
    > select  mdn  
    >         ,start_time  
    >         ,county_id  
    >         ,longi  
    >         ,lati  
    >         ,bsid  
    >         ,grid_id  
    >         ,biz_type  
    >         ,event_type  
    >         ,data_source 
    > from ods.ods_oidd
    > where day_id = "20180503"
    > union all
    > select  mdn  
    >         ,start_time  
    >         ,county_id  
    >         ,longi  
    >         ,lati  
    >         ,bsid  
    >         ,grid_id  
    >         ,biz_type  
    >         ,event_type  
    >         ,data_source 
    > from ods.ods_wcdr
    > where day_id = "20180503"
    > union all
    > select  mdn  
    >         ,start_time  
    >         ,county_id  
    >         ,longi  
    >         ,lati  
    >         ,bsid  
    >         ,grid_id  
    >         ,biz_type  
    >         ,event_type  
    >         ,data_source 
    > from ods.ods_dpi
    > where day_id = "20180503"
    > union all
    > select  mdn  
    >         ,start_time  
    >         ,county_id  
    >         ,longi  
    >         ,lati  
    >         ,bsid  
    >         ,grid_id  
    >         ,biz_type  
    >         ,event_type  
    >         ,data_source 
    > from ods.ods_ddr
    > where day_id = "20180503";
hive> use dwm;
OK
Time taken: 0.009 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS dwm.dwm_staypoint_msk_d (
    >     mdn string comment '用户手机号码'  
    >     ,longi string comment '网格中心点经度'  
    >     ,lati string comment '网格中心点纬度'  
    >     ,grid_id string comment '停留点所在电信内部网格号'  
    >     ,county_id string comment '停留点区县'  
    >     ,duration string comment '机主在停留点停留的时间长度(e'  ),lTime-eTim 
    >     ,grid_first_time string comment '网格第一个记录位置点时间(秒级)'  
    >     ,grid_last_time string comment '网格最后一个记录位置点时间(秒级)'  
    > ) 
    > comment  '停留点表'
    > PARTITIONED BY (
    >     day_id string comment '天分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS ORC
    > location '/data/tour/dwm/dwm_staypoint_msk_d'; 
OK
Time taken: 0.037 seconds
hive> add jar /usr/local/soft/jars/HiveUDF-1.0.jar;
Added [/usr/local/soft/jars/HiveUDF-1.0.jar] to class path
Added resources: [/usr/local/soft/jars/HiveUDF-1.0.jar]
hive> create temporary function get_points as 'ctyun.udf.getPointsUDF';
OK
Time taken: 0.009 seconds
hive> create temporary function dateBetweenUDF as 'ctyun.udf.dateBetweenUDF';
OK
Time taken: 0.004 seconds
hive> create temporary function calLength as 'ctyun.udf.calLength';
OK
Time taken: 0.005 seconds
hive> create temporary function get_city_or_prov_id as 'ctyun.udf.getCityIdOrProvID';
OK
Time taken: 0.002 seconds
hive> insert into table dwm.dwm_staypoint_msk_d partition(day_id=20180503)
    > select  t1.mdn
    >         ,get_points(grid_id)[0] as longi
    >         ,get_points(grid_id)[1] as lati
    >         ,t1.grid_id
    >         ,t1.county_id
    >         ,dateBetweenUDF(t1.grid_first_time,t1.grid_last_time) as duration
    >         ,t1.grid_first_time
    >         ,t1.grid_last_time
    > from (
    >     select  mdn
    >             ,grid_id
    >             ,county_id
    >             ,min(split(start_time,',')[0]) as grid_first_time
    >             ,max(split(start_time,',')[1]) as grid_last_time
    >     from dwd.dwd_res_regn_mergelocation_msk_d
    >     where day_id="20180503"
    >     group by mdn, grid_id, county_id
    > )t1;
hive> CREATE EXTERNAL TABLE IF NOT EXISTS dim.dim_usertag_msk_m (
    >     mdn string comment '手机号大写MD5加密'  
    >     ,name string comment '姓名'  
    >     ,gender string comment '性别,1男2女'  
    >     ,age string comment '年龄'  
    >     ,id_number string comment '证件号码'  
    >     ,number_attr string comment '号码归属地'  
    >     ,trmnl_brand string comment '终端品牌'    
    >     ,trmnl_price string comment '终端价格'
    >     ,packg string comment '套餐'  
    >     ,conpot string comment '消费潜力'  
    >     ,resi_grid_id string comment '常住地网格'  
    >     ,resi_county_id string comment '常住地区县'  
    > ) 
    > comment  '用户画像表'
    > PARTITIONED BY (
    >     month_id string comment '月分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS PARQUET
    > location '/data/tour/dim/dim_usertag_msk_m'; 
OK
Time taken: 0.041 seconds
hive> alter table dim.dim_usertag_msk_m add partition(month_id=201805);
OK
Time taken: 0.046 seconds
hive> load data local inpath '/usr/local/soft/ctyun/dim_usertag_msk_m/month_id=201805/*' into table dim.dim_usertag_msk_m partition(month_id=201805);
Loading data to table dim.dim_usertag_msk_m partition (month_id=201805)
Partition dim.dim_usertag_msk_m{month_id=201805} stats: [numFiles=10, totalSize=91725546]
OK
Time taken: 1.932 seconds
hive> CREATE EXTERNAL TABLE IF NOT EXISTS dws.dws_county_tourist_msk_d (
    >     mdn string comment '手机号大写MD5加密'  
    >     ,source_county_id string comment '游客来源区县'  
    >     ,d_county_id string comment '旅游目的地县代码'  
    >     ,d_stay_time double comment '游客在该县停留的时间长度(小时)'  
    >     ,d_max_distance double comment '游客本次出游距离'  
    > ) 
    > comment  '旅游应用专题数据县级别-天'
    > PARTITIONED BY (
    >     day_id string comment '日分区'  
    > ) 
    > ROW FORMAT DELIMITED 
    >     FIELDS TERMINATED BY '\t' 
    > STORED AS PARQUET
    > location '/data/tour/dws/dws_county_tourist_msk_d'; 
OK
Time taken: 0.041 seconds
hive> insert into table dws.dws_county_tourist_msk_d partition(day_id="20180503")
    > select  ttt1.mdn
    >         ,ttt1.source_county_id
    >         ,ttt1.d_county_id
    >         ,ttt1.d_stay_time
    >         ,ttt1.d_max_distance
    > from(
    >         select  mdn
    >                 ,resi_county_id as source_county_id
    >                 ,county_id as d_county_id
    >                 ,sum(duration) as d_stay_time
    >                 ,max(calLength(tt1.grid_id,tt1.resi_grid_id)) as d_max_distance
    >         from(
    >                 select  t1.mdn
    >                         ,t1.grid_id
    >                         ,t1.county_id
    >                         ,t1.duration
    >                         ,t2.resi_county_id
    >                         ,t2.resi_grid_id
    >                 from (
    >                         select  mdn
    >                                 ,grid_id
    >                                 ,county_id
    >                                 ,duration
    >                         from dwm.dwm_staypoint_msk_d
    >                         where day_id='20180503'
    >                 ) t1 join(
    >                         select  mdn
    >                                 ,resi_county_id
    >                                 ,resi_grid_id
    >                         from dim.dim_usertag_msk_m
    >                         where month_id='201805'
    >                 ) t2 on t1.mdn = t2.mdn
    >         ) tt1 group by tt1.mdn,tt1.county_id,tt1.resi_county_id
    > )ttt1 where d_stay_time > 180 and d_max_distance > 10000
    > ;
hive> select * from dws_county_tourist_msk_d
    > where day_id=20180503
    > limit 10;
hive> select  t1.d_county_id
    >         ,count(*) as d_county_cnt
    > from (
    >     select  d_county_id
    >     from dws.dws_county_tourist_msk_d
    >     where day_id="20180503"
    > ) t1 group by t1.d_county_id
    > order by d_county_cnt desc limit 20;
hive> select  t1.d_county_id
    >         ,t2.gender
    >         ,count(*) as d_county_gender_cnt
    > from(
    >     select  mdn
    >             ,d_county_id
    >     from dws.dws_county_tourist_msk_d
    >     where day_id="20180503"
    > ) t1 left join (
    >     select  mdn
    >             ,gender
    >     from dim.dim_usertag_msk_m
    >     where month_id=20180503
    > ) t2 on t1.mdn = t2.mdn
    > group by t1.d_county_id,t2.gender;
hive> select  t1.d_county_id
    >         ,t2.age
    >         ,count(*) as d_county_age_cnt
    > from(
    >     select  mdn
    >             ,d_county_id
    >     from dws.dws_county_tourist_msk_d
    >     where day_id="20180503"
    > ) t1 left join (
    >     select  mdn
    >             ,age
    >     from dim.dim_usertag_msk_m
    >     where month_id=20180503
    > ) t2 on t1.mdn = t2.mdn
    > group by t1.d_county_id,t2.age;
hive> select  t1.d_county_id
    >         ,t2.number_attr
    >         ,count(*) as d_county_number_attr_cnt
    > from(
    >     select  mdn
    >             ,d_county_id
    >     from dws.dws_county_tourist_msk_d
    >     where day_id="20180503"
    > ) t1 left join (
    >     select  mdn
    >             ,number_attr
    >     from dim.dim_usertag_msk_m
    >     where month_id=20180503
    > ) t2 on t1.mdn = t2.mdn
    > group by t1.d_county_id,t2.number_attr;

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值