HiveSQL基本查询分析练习

最新推荐文章于 2023-11-05 01:41:19 发布

XSTARK51416

最新推荐文章于 2023-11-05 01:41:19 发布

阅读量312

点赞数

本文链接：https://blog.csdn.net/qq_37936002/article/details/105244205

版权

HiveSQL基本查询分析练习

需求1：某次经营活动中，商家发起了“异性拼团购”活动，选出城市在北京，性别为“女”的十名用户。

SELECT user_name
FROM user_info
WHERE city='beijing' and sex='female'
limite 10;

需求2：选出在2019年6月18日，购买商品品类是food的用户名，购买数量，支付金额。

SELECT user_name,
	   piece,
	   pay_amount
FROM user_trade
WHERE dt='2019-06-18' and goods_category='food';

需求3：2019年一月到三月，每个品类有多少人购买，累计金额是多少。

SELECT goods_category,
       count(distinct user_name) as user_num,
       sum(pay_amount) as total_amount
FROM user_trade
WHERE dt between '2019-01-01' and '2019-03-31'
GROUP BY goods_category;

需求4-1：2019年4月，支付金额超过5万元的用户。

SELECT user_name,
       sum(pay_amount) as total_amount
FROM user_trade
WHERE dt between '2019-04-01' and '2019-04-30'
GROUP BY user_name
HAVING sum(pay_amount)>50000;

需求4-2：2019年4月，支付金额top5用户。（order by执行顺序在select之后，所以需要使用重新定义的列名进行排序）

SELECT user_name,
       sum(pay_amount) as total_amount
FROM user_trade
WHERE dt between '2019-04-01' and '2019-04-30'
GROUP BY user_name
ORDER BY total_amount DESC
limit 5;

需求5：用户的首次激活时间与2019年5月1日的时间间隔。

SELECT user_name,
       datediff('2019-05-01',to_date(firstactivetime))
FROM user_info;

需求6：对用户的年龄段进行分析，统计20岁以下，20到30，30到40，40岁以上的用户数。

SELECT case when age<20 then '20以下'
            when age>=20 and age<30 then '20-30'
            when age>=30 and age<40 then '30-40'
            else '40以上' end as age_type,
       count(distinct user_id) as user_num
FROM user_info
GROUP BY case when age<20 then '20以下'
            when age>=20 and age<30 then '20-30'
            when age>=30 and age<40 then '30-40'
            else '40以上' end;

需求7：统计每个性别用户等级高低的分布情况。

SELECT sex,
       if(level>5,'高','低') as level_type,
       count(distinct user_id) as user_num
FROM user_info
GROUP BY sex,
         if(level>5,'高','低');

需求8：分析每个月的拉新情况。

SELECT substr(firstactivetime,1,7) as month,
       count(distinct user_id) user_num
FROM user_info
GROUP BY substr(firstactivetime,1,7);

需求9：不同手机品牌的用户数：

SELECT substr(extra2['phonebrand'],1,6) as phone_brand,
       count(distinct user_id) user_num
FROM user_info
GROUP BY substr(extra2['phonebrand'],1,6);

需求10：ELLA用户的2018年平均每次支付金额，以及最大最小支付日期的间隔：

SELECT avg(pay_amount) as avg_amount,
       datediff(max(from_unixtime(pay_time,'yyyy-MM-dd')),min(from_unixtime(pay_time,'yyyy-MM-dd'))) as day
FROM user_trade
WHERE year(dt)='2018' and user_name='ELLA';

需求11：2018年购买的商品品类在两个以上的用户。

SELECT count(a.user_name)
FROM
    (SELECT user_name,
            count(distinct goods_category)as category_num
     FROM user_trade
     WHERE year(dt)='2018'
     GROUP BY user_name
     HAVING count(distinct goods_category)>2) a;

需求12：用户激活时间在2018年，年龄段在20-30岁和30-40岁的婚姻状况分布。

SELECT a.age_type,
       if(a.marriage_status=1,'已婚','未婚'),
       count(distinct a.user_id)
FROM
       (SELECT case when age<20 then '20以下'
                    when age>=20 and age<30 then '20-30岁'
                    when age>=30 and age<40 then '30-40岁'
                    else '40以上' end as age_type,
               get_json_object(extra1,'$.marriage_status') as marriage_status,
               user_id
        FROM user_info
        WHERE to_date(firstactivetime) between '2018-01-01' and '2018-12-31') a
WHERE a.age_type in ('20-30岁','30-40岁')
GROUP BY a.age_type,
         if(a.marriage_status=1,'已婚','未婚');

需求13：激活天数距今超过300天的男女分布情况。

SELECT sex,
       count(distinct user_id)
FROM user_info
WHERE datediff(current_date(),to_date(firstactivetime))>300
GROUP BY sex;

需求14：不同性别，教育程度的分布情况。

SELECT sex,
       extra2['education'] as education,
       count(distinct user_id)
FROM user_info
GROUP BY sex,
         extra2['education'];

需求15：2019年1月1日到2019年4月30日，每个时段的不同品类购买金额分布。

SELECT substr(from_unixtime(pay_time,'yyyy-MM-dd HH'),12),
       goods_category,
       sum(pay_amount)
FROM user_trade
WHERE dt between '2019-01-01' and '2019-04-30'
GROUP BY substr(from_unixtime(pay_time,'yyyy-MM-dd HH'),12),
               goods_category;