**
HiveSQL基本查询分析练习
**
**
需求1:某次经营活动中,商家发起了“异性拼团购”活动,选出城市在北京,性别为“女”的十名用户。
SELECT user_name
FROM user_info
WHERE city='beijing' and sex='female'
limite 10;
需求2:选出在2019年6月18日,购买商品品类是food的用户名,购买数量,支付金额。
SELECT user_name,
piece,
pay_amount
FROM user_trade
WHERE dt='2019-06-18' and goods_category='food';
需求3:2019年一月到三月,每个品类有多少人购买,累计金额是多少。
SELECT goods_category,
count(distinct user_name) as user_num,
sum(pay_amount) as total_amount
FROM user_trade
WHERE dt between '2019-01-01' and '2019-03-31'
GROUP BY goods_category;
需求4-1:2019年4月,支付金额超过5万元的用户。
SELECT user_name,
sum(pay_amount) as total_amount
FROM user_trade
WHERE dt between '2019-04-01' and '2019-04-30'
GROUP BY user_name
HAVING sum(pay_amount)>50000;
需求4-2:2019年4月,支付金额top5用户。(order by执行顺序在select之后,所以需要使用重新定义的列名进行排序)
SELECT user_name,
sum(pay_amount) as total_amount
FROM user_trade
WHERE dt between '2019-04-01' and '2019-04-30'
GROUP BY user_name
ORDER BY total_amount DESC
limit 5;
需求5:用户的首次激活时间与2019年5月1日的时间间隔。
SELECT user_name,
datediff('2019-05-01',to_date(firstactivetime))
FROM user_info;
需求6:对用户的年龄段进行分析,统计20岁以下,20到30,30到40,40岁以上的用户数。
SELECT case when age<20 then '20以下'
when age>=20 and age<30 then '20-30'
when age>=30 and age<40 then '30-40'
else '40以上' end as age_type,
count(distinct user_id) as user_num
FROM user_info
GROUP BY case when age<20 then '20以下'
when age>=20 and age<30 then '20-30'
when age>=30 and age<40 then '30-40'
else '40以上' end;
需求7:统计每个性别用户等级高低的分布情况。
SELECT sex,
if(level>5,'高','低') as level_type,
count(distinct user_id) as user_num
FROM user_info
GROUP BY sex,
if(level>5,'高','低');
需求8:分析每个月的拉新情况。
SELECT substr(firstactivetime,1,7) as month,
count(distinct user_id) user_num
FROM user_info
GROUP BY substr(firstactivetime,1,7);
需求9:不同手机品牌的用户数:
SELECT substr(extra2['phonebrand'],1,6) as phone_brand,
count(distinct user_id) user_num
FROM user_info
GROUP BY substr(extra2['phonebrand'],1,6);
需求10:ELLA用户的2018年平均每次支付金额,以及最大最小支付日期的间隔:
SELECT avg(pay_amount) as avg_amount,
datediff(max(from_unixtime(pay_time,'yyyy-MM-dd')),min(from_unixtime(pay_time,'yyyy-MM-dd'))) as day
FROM user_trade
WHERE year(dt)='2018' and user_name='ELLA';
需求11:2018年购买的商品品类在两个以上的用户。
SELECT count(a.user_name)
FROM
(SELECT user_name,
count(distinct goods_category)as category_num
FROM user_trade
WHERE year(dt)='2018'
GROUP BY user_name
HAVING count(distinct goods_category)>2) a;
需求12:用户激活时间在2018年,年龄段在20-30岁和30-40岁的婚姻状况分布。
SELECT a.age_type,
if(a.marriage_status=1,'已婚','未婚'),
count(distinct a.user_id)
FROM
(SELECT case when age<20 then '20以下'
when age>=20 and age<30 then '20-30岁'
when age>=30 and age<40 then '30-40岁'
else '40以上' end as age_type,
get_json_object(extra1,'$.marriage_status') as marriage_status,
user_id
FROM user_info
WHERE to_date(firstactivetime) between '2018-01-01' and '2018-12-31') a
WHERE a.age_type in ('20-30岁','30-40岁')
GROUP BY a.age_type,
if(a.marriage_status=1,'已婚','未婚');
需求13:激活天数距今超过300天的男女分布情况。
SELECT sex,
count(distinct user_id)
FROM user_info
WHERE datediff(current_date(),to_date(firstactivetime))>300
GROUP BY sex;
需求14:不同性别,教育程度的分布情况。
SELECT sex,
extra2['education'] as education,
count(distinct user_id)
FROM user_info
GROUP BY sex,
extra2['education'];
需求15:2019年1月1日到2019年4月30日,每个时段的不同品类购买金额分布。
SELECT substr(from_unixtime(pay_time,'yyyy-MM-dd HH'),12),
goods_category,
sum(pay_amount)
FROM user_trade
WHERE dt between '2019-01-01' and '2019-04-30'
GROUP BY substr(from_unixtime(pay_time,'yyyy-MM-dd HH'),12),
goods_category;