数据源:ODS_APP_LOG_DEMO
数据文件:
链接:https://pan.baidu.com/s/1s1gytjzZXbWRkWJYUaADTA
提取码:oxmu
解压后将文件夹内的数据放到hdfs上,hadoop hdfs -put xxx xxx
hive>create external table ODS_APP_LOG_DEMO
(
guid bigint,
eventid String,
event Map<String, String>,
uid String,
imei String,
mac String,
imsi String,
osName String,
osVer String,
androidId String,
resolution String,
deviceType String,
deviceId String,
uuid String,
appid String,
appVer String,
release_ch String,
promotion_ch String,
areacode String,
longtitude Double,
latitude Double,
carrier String,
netType String,
cid_sn String,
ip String,
sessionId String,
`timestamp` bigint,
province String,
city String,
district String
)
partitioned by (dt string)
stored as parquet;
hive>load data inpath '/mycluster/user/applogs' into table ODS_APP_LOG_DEMO partition(dt='2023-03-01'); //文件在hdfs上
hive> desc formatted ODS_APP_LOG_DEMO;
hive> select count(1) from ODS_APP_LOG_DEMO;
分析代码:
hive>select
guid,
eventid,
event['pgId'] as page_id,
event['adId'] as ad_id
from
ODS_APP_LOG_DEMO where dt='2023-03-01';
广告id,曝光次数,曝光人数,曝光人数最大页,点击次数,点击人数,点击人数最大页。
hive>create table ADS_APP_ADV_OVW_DEMO(
dt string,
adid string,
show_counts int,
show_users int,
show_max_page String,
click_counts int,
click_users int,
click_max_page string
)
stored as orc;
--广告id,曝光次数,曝光人数,点击次数,点击人数
hive>select
'2023-03-01' as dt,
event['adId'] as adid,
count(if(eventid='adShowEvent',1,null)) as show_counts,
count(distinct (if(eventid='adShowEvent',guid,null))) as show_users,
count(if(eventid='adClickEvent',1,null)) as click_counts,
count(distinct (if(eventid='adClickEvent',guid,null))) as click_users
from ODS_APP_LOG_DEMO where dt='2023-03-01'
group by event['adId'];
结果:
+-------------+-------+--------------+-------------+---------------+--------------+--+
| dt | adid | show_counts | show_users | click_counts | click_users |
+-------------+-------+--------------+-------------+---------------+--------------+--+
| 2023-03-01 | NULL | 0 | 0 | 0 | 0 |
| 2023-03-01 | 0 | 2367 | 209 | 483 | 151 |
| 2023-03-01 | 1 | 2493 | 212 | 333 | 141 |
| 2023-03-01 | 2 | 2466 | 211 | 283 | 133 |
| 2023-03-01 | 3 | 2441 | 209 | 194 | 110 |
| 2023-03-01 | 4 | 2436 | 212 | 158 | 91 |
| 2023-03-01 | 5 | 2514 | 212 | 110 | 80 |
| 2023-03-01 | 6 | 2480 | 209 | 86 | 65 |
| 2023-03-01 | 7 | 2429 | 210 | 64 | 49 |
| 2023-03-01 | 8 | 2467 | 211 | 27 | 22 |
| 2023-03-01 | 9 | 2403 | 213 | 22 | 21 |
+-------------+-------+--------------+-------------+---------------+--------------+--+
广告id,曝光人数最大页,点击人数最大页。
hive>with temp as(
select
event['adId'] as adid,
event['pgId'] as pgid,
eventid,
count(distinct guid) as user_counts
from ODS_APP_LOG_DEMO where dt='2023-03-01'
group by event['adId'],event['pgId'],eventid
),
result as (
select
adid,
pgid,
eventid,
user_counts,
row_number()over(partition by adid,eventid order by user_counts desc) as rn
from temp
)
select
adid,
pgid,
eventid
from result where rn<=2;
点击最大页和曝光最大页的前两名
最终完整代码:
hive>create table ADS_APP_ADV_OVM_DEMO(
dt string,
pgid string,
adid string,
show_counts int,
click_counts int,
show_users int,
click_users int,
click_max_page string,
show_max_page string
)
stored as orc;
with one as(
select
'2023-03-01' as dt,
event['adId'] as adid,
count(if(eventid='adShowEvent',1,null)) as show_counts,
count(distinct if(eventid='adShowEvent',guid,null)) as show_users,
count(if(eventid='adClickEvent',1,null)) as click_counts,
count(distinct if(eventid='adClickEvent',guid,null)) as click_user
from ODS_APP_LOG_DEMO where dt='2023-03-01'
group by event['adId']
),
temp as
(
select
event['adId'] as adid,
event['pgId'] as pgid,
eventid,
count(distinct guid) as user_counts
from ODS_APP_LOG_DEMO where dt='2023-03-01'
group by event['adId'],event['pgId'],eventid
),
result as(
select
adid,
pgid,
eventid,
user_counts,
row_number() over(partition by adid,eventid order by user_counts desc) as rn
from temp
),
two as(
select
adid,
pgid,
eventid,
user_counts
from result where rn=1
)
select
one.adid,
one.show_counts,
one.show_users,
one.click_counts,
one.click_user,
three.user_counts,
four.user_counts
from one join (select * from two where eventid='adShowEvent') three on one.adid=three.adid
join (select * from two where eventid='adClickEvent') four on one.adid=four.adid;
报错:Error: Error while compiling statement: FAILED: SemanticException [Error 10016]: Line 15:6 Argument type mismatch 'rn': The 1st argument of EQUAL OR LESS THAN is expected to a primitive type, but list is
found (state=42000,code=10016)
原因:hive版本过低
hive>show version();
+---------------------------+--+
| _c0 |
+---------------------------+--+
| 1.1.0-cdh5.14.2 rUnknown |
+---------------------------+--+
解决办法:升级hive版本