大数据HQL笔试题
文章目录
01) first_value
问题[字节跳动]
学生每做一道题目就会在表里记录一条,求8月22号学生答的第一题以及得分,最后一题以及得分
原数据如下
+-------------------+-----------------------+-----------------+----------------------+
| user_id | question_id | score | record_time |
+-------------------+-----------------------+-----------------+----------------------+
| 1001 | 1 | 51.01 | 2020-08-22 01:00:00 |
| 1001 | 2 | 52.01 | 2020-08-22 02:00:00 |
| 1001 | 3 | 53.01 | 2020-08-22 03:00:00 |
| 1001 | 4 | 54.01 | 2020-08-22 04:00:00 |
| 1002 | 5 | 61.01 | 2020-08-22 05:00:00 |
| 1002 | 6 | 57.01 | 2020-08-22 06:00:00 |
| 1003 | 7 | 51.01 | 2020-08-22 07:00:00 |
| 1004 | 8 | 51.01 | 2020-08-22 08:00:00 |
| 1005 | 9 | 51.01 | 2020-08-23 09:00:00 |
+-------------------+-----------------------+-----------------+----------------------+
目标输出结果如下
+------------+----------------------+-------------------------+---------------------+------------------------+
| user_id | first_question_id | first_question_score | last_question_id | last_question_scroe |
+------------+----------------------+-------------------------+---------------------+------------------------+
| 1001 | 1 | 51.01 | 4 | 54.01 |
| 1002 | 5 | 61.01 | 6 | 57.01 |
| 1003 | 7 | 51.01 | 7 | 51.01 |
| 1004 | 8 | 51.01 | 8 | 51.01 |
+------------+----------------------+-------------------------+---------------------+------------------------+
建表语句
-- DROP DATABASE funcdb;
CREATE DATABASE IF NOT EXISTS funcdb;
USE funcdb;
DROP TABLE IF EXISTS funcdb.tbl_exam;
CREATE TABLE funcdb.tbl_exam (
user_id BIGINT,
question_id BIGINT,
score DECIMAL(5, 2),
record_time TIMESTAMP
);
INSERT OVERWRITE TABLE funcdb.tbl_exam
SELECT 1001 AS user_id, 1 AS question_id, 51.01 AS score, '2020-08-22 01:00:00' AS record_time UNION ALL
SELECT 1001 AS user_id, 2 AS question_id, 52.01 AS score, '2020-08-22 02:00:00' AS record_time UNION ALL
SELECT 1001 AS user_id, 3 AS question_id, 53.01 AS score, '2020-08-22 03:00:00' AS record_time UNION ALL
SELECT 1001 AS user_id, 4 AS question_id, 54.01 AS score, '2020-08-22 04:00:00' AS record_time UNION ALL
SELECT 1002 AS user_id, 5 AS question_id, 61.01 AS score, '2020-08-22 05:00:00' AS record_time UNION ALL
SELECT 1002 AS user_id, 6 AS question_id, 57.01 AS score, '2020-08-22 06:00:00' AS record_time UNION ALL
SELECT 1003 AS user_id, 7 AS question_id, 51.01 AS score, '2020-08-22 07:00:00' AS record_time UNION ALL
SELECT 1004 AS user_id, 8 AS question_id, 51.01 AS score, '2020-08-22 08:00:00' AS record_time UNION ALL
SELECT 1005 AS user_id, 9 AS question_id, 51.01 AS score, '2020-08-23 09:00:00' AS record_time;
SELECT * FROM funcdb.tbl_exam WHERE SUBSTR(record_time, 1, 10) = '2020-08-22';
答案
SELECT user_id, first_question_id, first_question_scroe, last_question_id, last_question_score
FROM (SELECT user_id,
first_value(question_id) OVER (PARTITION BY user_id ORDER BY record_time) AS first_question_id,
first_value(score) OVER (PARTITION BY user_id ORDER BY record_time) AS first_question_scroe,
first_value(question_id) OVER (PARTITION BY user_id ORDER BY record_time DESC) AS last_question_id,
first_value(score) OVER (PARTITION BY user_id ORDER BY record_time DESC) AS last_question_score
FROM funcdb.tbl_exam
WHERE SUBSTR(record_time, 1, 10) = '2020-08-22') AS t1
GROUP BY user_id, first_question_id, first_question_scroe, last_question_id, last_question_score;
-- 或者
SELECT user_id,
first_question_id,
first_question_score,
last_question_id,
last_question_score
FROM (SELECT user_id,
first_value(question_id) OVER (PARTITION BY user_id ORDER BY record_time) AS first_question_id,
first_value(score) OVER (PARTITION BY user_id ORDER BY record_time) AS first_question_score,
last_value(question_id) OVER (PARTITION BY user_id ORDER BY record_time) AS last_question_id,
last_value(score) OVER (PARTITION BY user_id ORDER BY record_time) AS last_question_score,
row_number() OVER (PARTITION BY user_id ORDER BY record_time DESC) AS row_num
FROM funcdb.tbl_exam
WHERE SUBSTR(record_time, 1, 10) = '2020-08-22') AS t1
WHERE row_num = 1;
02) 连续3天活跃
问题[快手]
假设今天是 2020月08月24日 统计 连续3天活跃的用户
建表语
CREATE DATABASE IF NOT EXISTS funcdb;
USE funcdb;
DROP TABLE IF EXISTS funcdb.tbl_login_log;
CREATE TABLE funcdb.tbl_login_log (
user_id BIGINT,
login_time STRING
);
INSERT OVERWRITE TABLE funcdb.tbl_login_log
SELECT 1001 AS user_id, '2020-08-21 01:00:00' AS login_time UNION ALL
SELECT 1001 AS user_id, '2020-08-22 02:00:00' AS login_time UNION ALL
SELECT 1001 AS user_id, '2020-08-22 03:00:00' AS login_time UNION ALL
SELECT 1001 AS user_id, '2020-08-23 04:00:00' AS login_time UNION ALL
SELECT 1002 AS user_id, '2020-08-21 05:00:00' AS login_time UNION ALL
SELECT 1002 AS user_id, '2020-08-22 06:00:00' AS login_time UNION ALL
SELECT 1003 AS user_id, '2020-08-21 07:00:00' AS login_time UNION ALL
SELECT 1003 AS user_id, '2020-08-22 08:00:00' AS login_time UNION ALL
SELECT 1003 AS user_id, '2020-08-23 09:00:00' AS login_time;
SELECT * FROM funcdb.tbl_login_log WHERE SUBSTR(login_time, 1, 10) > '2020-08-20';
答案:
WITH `t1` AS (
-- 处理日期格式并去重
SELECT user_id, SUBSTR(login_time, 1, 10) AS login_dt
FROM funcdb.tbl_login_log
WHERE SUBSTR(login_time, 1, 10) >= date_sub('2020-08-24 00:00:00', 3)
GROUP BY user_id, SUBSTR(login_time, 1, 10)
),
`t2` AS (
-- 按用户分组,按登录日期排序
SELECT user_id,
row_number() OVER (PARTITION BY user_id ORDER BY login_dt) AS row_num
FROM t1
)
SELECT user_id FROM t2 WHERE row_num = 3;
03) 连续登陆最长天数
问题
统计各用户连续登陆的最长天数
建表语句
CREATE DATABASE IF NOT EXISTS funcdb;
USE funcdb;
DROP TABLE IF EXISTS funcdb.tbl_max_login;
CREATE TABLE funcdb.tbl_max_login (
user_id STRING,
login_dt STRING
);
INSERT OVERWRITE TABLE funcdb.tbl_max_login
SELECT '1001' AS user_id, '2019-07-21' AS login_dt UNION ALL
SELECT '1001' AS user_id, '2019-07-21' AS login_dt UNION ALL
SELECT '1001' AS user_id, '2019-08-01' AS login_dt UNION ALL
SELECT '1001' AS user_id, '2019-08-02' AS login_dt UNION ALL
SELECT '1001' AS user_id, '2019-08-03' AS login_dt UNION ALL
SELECT '1002' AS user_id, '2019-08-01' AS login_dt UNION ALL
SELECT '1002' AS user_id, '2019-08-02' AS login_dt UNION ALL
SELECT '1003' AS user_id, '2019-08-01' AS login_dt UNION ALL
SELECT '1003' AS user_id, '2019-08-03' AS login_dt UNION ALL
SELECT '1004' AS user_id, '2019-07-28' AS login_dt UNION ALL
SELECT '1004' AS user_id, '2019-07-29' AS login_dt UNION ALL
SELECT '1004' AS user_id, '2019-08-01' AS login_dt UNION ALL
SELECT '1004' AS user_id, '2019-08-02' AS login_dt UNION ALL
SELECT '1004' AS user_id, '2019-08-03' AS login_dt UNION ALL
SELECT '1004' AS user_id, '2019-08-04' AS login_dt;
SELECT * FROM funcdb.tbl_max_login;
答案
WITH `t1` AS (
-- 根据user_id分组,login_dt排序,得到row_num
SELECT user_id,
login_dt,
row_number() OVER (distribute BY user_id sort BY login_dt) row_num
FROM funcdb.tbl_max_login
GROUP BY user_id, login_dt
),
`t2` AS (
-- 使用 login_dt - row_num 得出从哪天开始登录的 sub_dt
-- 同一用户如果 sub_dt 相同,则这组数据是连续登录的数据
SELECT user_id,
login_dt,
DATE_SUB(login_dt, row_num) AS sub_dt
FROM t1
),
`t3` AS (
-- 根据 user_id, sub_dt 分组, sum(sub_dt) 是用户在一段时间内连续登录的天数
SELECT user_id, sub_dt, COUNT(sub_dt) AS combo_days
FROM t2
GROUP BY user_id, sub_dt
)
-- 按 user_id 分组, max(combo_days) 就是用户连续登录最长天数
SELECT user_id, MAX(combo_days) AS max_combo_days FROM t3 GROUP BY user_id;
拓展:统计各用户连续登陆最长天数的起始日期
WITH `t1` AS (
SELECT user_id,
login_dt,
row_number() over (distribute BY user_id sort BY login_dt) row_num
FROM funcdb.tbl_max_login
GROUP BY user_id, login_dt
),
`t2` AS (
SELECT user_id,
login_dt,
DATE_SUB(login_dt, row_num) AS sub_dt
FROM t1
),
`t3` AS (
SELECT user_id,
login_dt,
first_value(login_dt) over (PARTITION BY user_id, sub_dt ORDER BY login_dt) AS combo_start_dt,
first_value(login_dt) over (PARTITION BY user_id, sub_dt ORDER BY login_dt DESC) AS combo_end_dt,
sub_dt
FROM t2
),
`t4` AS (
SELECT user_id, combo_start_dt, combo_end_dt, COUNT(sub_dt) AS combo_days
FROM t3
GROUP BY user_id, combo_start_dt, combo_end_dt, sub_dt
),
`t5` AS (
SELECT user_id, combo_start_dt, combo_end_dt, combo_days,
MAX(combo_days) OVER (PARTITION BY user_id, combo_start_dt, combo_end_dt) AS max_combo_days,
row_number() OVER (PARTITION BY user_id ORDER BY combo_start_dt DESC) AS row_num
FROM t4
)
SELECT user_id, combo_start_dt, combo_end_dt,combo_days AS `fnal_max_combo_days`
FROM t5
WHERE combo_days = max_combo_days
AND row_num = 1;
04) avg开窗 + having min
问题
找出所有科目成绩都大于某一学科平均成绩的学生
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_score;
create table funcdb.tbl_score (
uid string,
subject_id string,
score int
);
insert into funcdb.tbl_score values ('1001', '01', 93),('1001', '02', 90),('1001', '03', 99),('1002', '01', 91),('1002', '02', 89),('1002', '03', 70),('1003', '01', 92),('1003', '02', 88),('1003', '03', 85);
答案
select t1.uid
from (select uid,
score,
subject_id,
avg(score) over (partition by subject_id) as avg_score
from tbl_score) as t1
group by t1.uid
having min(t1.score - t1.avg_score) > 0;
05) sum开窗(累计)
我们有如下的用户访问数据,要求使用 SQL 统计出每个用户的累积访问次数
数据
+----------+-------------+--------------+
| user_id | visit_date | visit_count |
+----------+-------------+--------------+
| u01 | 2017-01-21 | 5 |
| u02 | 2017-01-23 | 6 |
| u03 | 2017-01-22 | 8 |
| u04 | 2017-01-20 | 3 |
| u01 | 2017-01-23 | 6 |
| u01 | 2017-02-21 | 8 |
| u02 | 2017-01-23 | 6 |
| u02 | 2017-02-21 | 9 |
| u03 | 2017-03-22 | 8 |
| u01 | 2017-02-22 | 4 |
+----------+-------------+--------------+
结果
+---------+----------+------+-----+
| 用户id | 月份 | 小计 | 累计|
+---------+----------+------+-----+
| u01 | 2017-01 | 11 | 11 |
| u01 | 2017-02 | 12 | 23 |
| u02 | 2017-01 | 12 | 12 |
| u02 | 2017-02 | 9 | 21 |
| u03 | 2017-01 | 8 | 8 |
| u03 | 2017-03 | 8 | 16 |
| u04 | 2017-01 | 3 | 3 |
+---------+----------+------+-----+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_visit_log;
create table funcdb.tbl_visit_log (
user_id string,
visit_date string,
visit_count int
);
insert into funcdb.tbl_visit_log values ('u01', '2017-01-21', 5),('u02', '2017-01-23', 6),('u03', '2017-01-22', 8),('u04', '2017-01-20', 3),('u01', '2017-01-23', 6),('u01', '2017-02-21', 8),('u02', '2017-01-23', 6),('u02', '2017-02-21', 9),('u03', '2017-03-22', 8),('u01', '2017-02-22', 4);
答案
select t2.user_id as `用户id`,
t2.ym_date as `月份`,
t2.visit_ym_count as `小计`,
sum(t2.visit_ym_count) over (partition by t2.user_id order by t2.ym_date) as `累计`
from (select t1.user_id,
t1.ym_date,
sum(t1.visit_count) as visit_ym_count
from (select user_id,
date_format(visit_date, "yyyy-MM") as ym_date,
visit_count
from tbl_visit_log) as t1
group by t1.user_id, t1.ym_date) as t2
order by t2.user_id,t2.ym_date;
06) 分组排序求topN
问题
有 50W 个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为 tbl_jd_visit,访客的用户 id 为 user_id,被访问的店铺名称为shop,请统计:
- 每个店铺的 UV(访客数)
- 每个店铺访问次数 top3 的访客信息。输出店铺名称、访客 id、访问次数
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_jd_visit;
create table funcdb.tbl_jd_visit
(
user_id string,
shop string
);
insert into funcdb.tbl_jd_visit values ('u1','a'),('u2','b'),('u1','b'),('u1','a'),('u3','c'),('u4','b'),('u1','a'),('u2','c'),
('u5','b'),('u4','b'),('u6','c'),('u2','c'),('u1','b'),('u2','a'),('u2','a'),('u3','a'),('u5','a'),('u5','a'),('u5','a');
答案
-- 1. 每个店铺的 UV(访客数)
select shop as `店铺名称`,
count(shop) as `访客次数`
from funcdb.tbl_jd_visit
group by shop;
-- 2. 每个店铺访问次数 top3 的访客信息。输出店铺名称、访客 id、访问次数
select t2.shop as `店铺名称`,
t2.user_id as `访客id`,
t2.uv as `访客次数`,
t2.row_num as `排名`
from (select t1.shop,
t1.user_id,
t1.uv,
row_number() over (partition by t1.shop) as row_num
from (select shop,
user_id,
count(user_id) as uv
from funcdb.tbl_jd_visit
group by shop, user_id) as t1) as t2
where t2.row_num <= 3
order by t2.shop, t2.row_num;
07) 分组聚合
问题
已知一个订单表 tbl_order_tab
,有如下字段:dt(日期),order_id(订单id),user_id(用户id),amount(订单成交金额)。请给出 sql 进行统计:
- 给出 2017 年每个月的订单数、用户数、总成交金额。
- 给出 2017 年 11 月的新客数(指在 11 月才有第一笔订单)
- 求所有用户中在 2017年11月份 第一次购买商品的金额
表数据
+-------------+-----------+----------+---------+
| dt | order_id | user_id | amount |
+-------------+-----------+----------+---------+
| 2017-10-01 | 10029028 | 1001 | 33.57 |
| 2017-11-01 | 10029029 | 1002 | 33.57 |
| 2017-11-01 | 10029030 | 1001 | 17.20 |
| 2017-11-02 | 10029031 | 1001 | 66.54 |
| 2017-11-02 | 10029031 | 1002 | 86.32 |
| 2017-12-02 | 10029032 | 1002 | 57.03 |
| 2018-01-02 | 10029033 | 1003 | 32.35 |
+-------------+-----------+----------+---------+
结果一:
+----------+------+------+-------------+
| 月份 | 订单数| 用户数| 月总成交金额 |
+----------+------+------+-------------+
| 2017-11 | 4 | 2 | 203.63 |
| 2017-10 | 1 | 1 | 33.57 |
| 2017-12 | 1 | 1 | 57.03 |
+----------+------+------+-------------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_order_tab;
create table funcdb.tbl_order_tab (
dt string,
order_id string,
user_id string,
amount decimal(10, 2)
);
insert into funcdb.tbl_order_tab values ('2017-10-01', '10029028', '1001', 33.57),('2017-11-01', '10029029', '1002', 33.57),('2017-11-01', '10029030', '1001', 17.20),('2017-11-02', '10029031', '1001', 66.54),('2017-11-02', '10029031', '1002', 86.32),('2017-12-02', '10029032', '1002', 57.03),('2018-01-02', '10029033', '1003', 32.35);
答案
-- 1. 给出 2017 年每个月的订单数、用户数、总成交金额。
select date_format(dt, 'yyyy-MM') as `ym_date`
, count(order_id) as `订单数`
, count(distinct user_id) as `用户数`
, sum(amount) as `月总成交金额`
from funcdb.tbl_order_tab
where date_format(dt, 'yyyy') = '2017'
group by date_format(dt, 'yyyy-MM');
-- 2. 给出 2017 年 11 月的新客数(指在 11 月才有第一笔订单)
select count(distinct user_id) as `11月新客数`
from funcdb.tbl_order_tab
where date_format(dt, 'yyyy-MM') <= '2017-11'
group by user_id
having min(dt) >= '2017-11-01';
-- 11月新客第一笔订单明细
select t1.dt
, t1.order_id
, t1.user_id
, t1.amount
from (select dt
, order_id
, user_id
, amount
, row_number() over (partition by user_id order by dt) as row_num
from funcdb.tbl_order_tab
where date_format(dt, 'yyyy-MM') <= '2017-11') as t1
where date_format(t1.dt, 'yyyy-MM') = '2017-11'
and t1.row_num = 1;
-- 求所有用户中在 2017年11月份 第一次购买商品的金额
-- (30.801 seconds)
select user_id,
amount
from (select user_id,
amount,
row_number() over (distribute by user_id sort by dt) as row_num
from funcdb.tbl_order_tab
where date_format(dt, 'yyyy-MM') = '2017-11') as t1
where row_num = 1;
-- (27.229 seconds)
select t1.user_id,
t1.first_amount
from (select user_id,
first_value(amount) over (distribute by user_id sort by dt) as first_amount
from funcdb.tbl_order_tab
where date_format(dt, 'yyyy-MM') = '2017-11'
) as t1
group by t1.user_id, t1.first_amount;
08) 排序开窗
问题
有日志如下,请写出代码求得所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)
+-------------+----------+------+
| dt | user_id | age |
+-------------+----------+------+
| 2019-02-11 | test_1 | 23 |
| 2019-02-11 | test_2 | 19 |
| 2019-02-11 | test_3 | 39 |
| 2019-02-11 | test_1 | 23 |
| 2019-02-11 | test_3 | 39 |
| 2019-02-11 | test_1 | 23 |
| 2019-02-12 | test_2 | 19 |
| 2019-02-13 | test_1 | 23 |
| 2019-02-15 | test_2 | 19 |
| 2019-02-16 | test_2 | 19 |
+-------------+----------+------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_user_age;
create table funcdb.tbl_user_age (
dt string,
user_id string,
age int
);
insert into funcdb.tbl_user_age values ('2019-02-11', 'test_1', 23),('2019-02-11', 'test_2', 19),('2019-02-11', 'test_3', 39),('2019-02-11', 'test_1', 23),('2019-02-11', 'test_3', 39),('2019-02-11', 'test_1', 23),('2019-02-12', 'test_2', 19),('2019-02-13', 'test_1', 23),('2019-02-15', 'test_2', 19),('2019-02-16', 'test_2', 19);
答案
-- 所有用户和活跃用户的总数及平均年龄。(活跃用户指连续两天都有访问记录的用户)
select sum(mt1.all_user_count) as rs_all_user_count,
sum(mt1.all_age_avg) as rs_all_age_avg ,
sum(mt1.active_user_count) as rs_active_user_count,
sum(mt1.active_age_avg) as rs_active_age_avg
from ( -- 求活跃用户的总数及平均年龄
select 0 as `all_user_count`,
0 as `all_age_avg`,
count(t2.user_id) as `active_user_count`,
cast(avg(t2.age) as decimal(10, 1)) as `active_age_avg`
from (select t1.user_id,
t1.age
from (
select user_id,
age,
dense_rank() over (partition by user_id order by dt) as dense_num
from funcdb.tbl_user_age
) as t1
where t1.dense_num = 2
group by t1.user_id, t1.age) as t2
union all -- 求总用户的总数及平均年龄
select count(ht1.user_id) as `all_user_count`,
cast(avg(ht1.age) as decimal(10, 1)) as `all_age_avg`,
0 as `active_user_count`,
0 as `active_user_age`
from (select user_id, age
from funcdb.tbl_user_age
group by user_id, age) as ht1
) mt1;
09) 分组聚合排序求topN
问题
有一个线上服务器访问日志表tbl_access_log格式如下,求 2016年11月9日下午14-15 点
,访问 /api/user/login
接口的top10的ip_addr
+----------------------+-------------------+--------------+
| visit_time | in_path | ip_addr |
+----------------------+-------------------+--------------+
| 2016-11-09 14:22:05 | /api/user/login | 110.23.5.33 |
| 2016-11-09 11:23:10 | /api/user/detail | 57.3.2.16 |
| 2016-11-09 14:59:40 | /api/user/login | 200.6.5.166 |
| 2016-11-09 14:22:05 | /api/user/login | 110.23.5.34 |
| 2016-11-09 14:22:05 | /api/user/login | 110.23.5.34 |
| 2016-11-09 14:22:05 | /api/user/login | 110.23.5.34 |
| 2016-11-09 11:23:10 | /api/user/detail | 57.3.2.16 |
| 2016-11-09 23:59:40 | /api/user/login | 200.6.5.166 |
| 2016-11-09 14:22:05 | /api/user/login | 110.23.5.34 |
| 2016-11-09 11:23:10 | /api/user/detail | 57.3.2.16 |
| 2016-11-09 23:59:40 | /api/user/login | 200.6.5.166 |
| 2016-11-09 14:22:05 | /api/user/login | 110.23.5.35 |
| 2016-11-09 14:23:10 | /api/user/detail | 57.3.2.16 |
| 2016-11-09 23:59:40 | /api/user/login | 200.6.5.166 |
| 2016-11-09 14:59:40 | /api/user/login | 200.6.5.166 |
| 2016-11-09 14:59:40 | /api/user/login | 200.6.5.166 |
+----------------------+-------------------+--------------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_access_log;
create table funcdb.tbl_access_log
(
visit_time string,
in_path string,
ip_addr string
);
insert into funcdb.tbl_access_log values
('2016-11-09 14:22:05', '/api/user/login', '110.23.5.33'),('2016-11-09 11:23:10', '/api/user/detail', '57.3.2.16'),
('2016-11-09 14:59:40', '/api/user/login', '200.6.5.166'),('2016-11-09 14:22:05', '/api/user/login', '110.23.5.34'),
('2016-11-09 14:22:05', '/api/user/login', '110.23.5.34'),('2016-11-09 14:22:05', '/api/user/login', '110.23.5.34'),
('2016-11-09 11:23:10', '/api/user/detail', '57.3.2.16'),('2016-11-09 23:59:40', '/api/user/login', '200.6.5.166'),
('2016-11-09 14:22:05', '/api/user/login', '110.23.5.34'),('2016-11-09 11:23:10', '/api/user/detail', '57.3.2.16'),
('2016-11-09 23:59:40', '/api/user/login', '200.6.5.166'),('2016-11-09 14:22:05', '/api/user/login', '110.23.5.35'),
('2016-11-09 14:23:10', '/api/user/detail', '57.3.2.16'),('2016-11-09 23:59:40', '/api/user/login', '200.6.5.166'),
('2016-11-09 14:59:40', '/api/user/login', '200.6.5.166'),('2016-11-09 14:59:40', '/api/user/login', '200.6.5.166');
答案
-- 求 2016年 11月9号下午14点(14-15 点),访问 /api/user/login 接口的 top10 的 ip 地址
select ip_addr,
count(ip_addr) as ip_count
from funcdb.tbl_access_log
where visit_time >= '2016-11-09 14:00:00'
and visit_time < '2016-11-09 15:00:00'
and in_path = '/api/user/login'
group by ip_addr
order by ip_count desc
limit 10;
10) 相互关注用户
问题[美团]
有如下 用户关注行为表,status=1
表示 u1 用户关注了 u2 用户, status=0
表示 u1 用户取消关注了 u2 用户,求相互关注的用户有哪些?
注意:如果结果表中存在了
u1=1001
,u2=1002
,那么就不能存在u1=1002
,u2=1001
+-------+-------+----------------------+---------+
| u1 | u2 | c_time | status |
+-------+-------+----------------------+---------+
| 1001 | 1002 | 2020-01-22 10:01:00 | 1 | -- 1001用户关注了 1002 用户
| 1001 | 1002 | 2020-01-22 10:02:00 | 0 | -- 1001用户取消关注了 1002 用户
| 1001 | 1002 | 2020-01-22 10:03:00 | 1 | -- 1001用户再次关注了 1002 用户
| 1002 | 1001 | 2020-01-22 10:04:00 | 1 | -- 1002用户关注了 1001 用户
| 1001 | 1003 | 2020-01-22 10:05:00 | 1 | -- 根据时间取 u1 → u2 最新的关注状态
| 1001 | 1003 | 2020-01-22 10:06:00 | 0 | -- 再判断 两个用户间是否是互相关注的
| 1003 | 1001 | 2020-01-22 10:07:00 | 1 | -- 如果 1001 和 1002 是相互关注的,
| 1002 | 1003 | 2020-01-22 10:08:00 | 1 | -- 将这两个用户输出,但只保留一组 u1 u2
| 1003 | 1002 | 2020-01-22 10:09:00 | 1 | -- 不能再保留最新的u2 u1了
| 1004 | 1003 | 2020-01-22 10:10:00 | 1 |
| 1004 | 1001 | 2020-01-22 10:11:00 | 1 |
+-------+-------+----------------------+---------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_fans_act;
create table funcdb.tbl_fans_act (
u1 int,
u2 int,
c_time string,
status int
);
insert into funcdb.tbl_fans_act values (1001,1002, '2020-01-22 10:01:00', 1), (1001,1002, '2020-01-22 10:02:00', 0), (1001,1002, '2020-01-22 10:03:00', 1), (1002,1001, '2020-01-22 10:04:00', 1), (1001,1003, '2020-01-22 10:05:00', 1), (1001,1003, '2020-01-22 10:06:00', 0), (1003,1001, '2020-01-22 10:07:00', 1), (1002,1003, '2020-01-22 10:08:00', 1), (1003,1002, '2020-01-22 10:09:00', 1),(1004,1003, '2020-01-22 10:10:00', 1),(1004,1001, '2020-01-22 10:11:00', 1);
答案
select cast(split(t1.new_col, ',')[0] as bigint) as u1,
cast(split(t1.new_col, ',')[1] as bigint) as u2
from (select if(u1 < u2, concat(u1, ',', u2), concat(u2, ',', u1)) as new_col
, status
-- 根据 u1 u2 进行分组,按照日期排序,选择最新的关注状态
, row_number() over (partition by u1,u2 order by c_time desc) as row_num
from funcdb.tbl_fans_act) as t1
where t1.row_num = 1
group by t1.new_col
having sum(status) = 2;
11) sum(if)
问题
按日期统计每天胜负的场次
-- 表数据
+-------------+-------+
| g_date | g_rs |
+-------------+-------+
| 2005-05-09 | win |
| 2005-05-09 | lose |
| 2005-05-09 | lose |
| 2005-05-09 | lose |
| 2005-05-10 | win |
| 2005-05-10 | lose |
| 2005-05-10 | lose |
+-------------+-------+
-- 输出结果
+-------------+------+-------+
| g_date | win | lose |
+-------------+------+-------+
| 2005-05-09 | 1 | 3 |
| 2005-05-10 | 1 | 2 |
+-------------+------+-------+
建表语句
-- 建表
create database if not exists funcdb;
drop table if exists funcdb.tbl_game_rs;
create table funcdb.tbl_game_rs
(
g_date string,
g_rs string
);
insert into funcdb.tbl_game_rs values ('2005-05-09','win'),('2005-05-09','lose'),('2005-05-09','lose'),('2005-05-09','lose'),('2005-05-10','win'),('2005-05-10','lose'),('2005-05-10','lose');
答案
-- 答案
select g_date,
sum(case when g_rs = 'win' then 1 else 0 end ) as `win`,
sum(if(g_rs = 'lose',1,0)) as `lose`
from funcdb.tbl_game_rs
group by g_date;
12) join > union all
有三张表分别为会员表(t_member)销售表(t_sale)退货表(t_regoods),如下
查询需求:分组查出销售表中所有会员购买金额,同时分组查出退货表中所有会员的退货金额,把会员 id 相同的购买金额-退款金额得到的结果更新到表会员表中对应会员的积分字段(credits)
问题
-- 销售表
+-------+------------+
| m_id | buy_money |
+-------+------------+
| 1001 | 50.12 |
| 1002 | 56.75 |
| 1003 | 35.86 |
| 1001 | 23.16 |
| 1005 | 56.22 |
| NULL | 25.60 |
| NULL | 33.50 |
+-------+------------+
-- 退货表
+-------+---------------+
| m_id | return_money |
+-------+---------------+
| 1001 | 20.16 |
| 1002 | 23.63 |
| 1001 | 10.11 |
| NULL | 23.53 |
| NULL | 10.27 |
| 1005 | 0.81 |
+-------+---------------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_member;
drop table if exists funcdb.tbl_sale;
drop table if exists funcdb.tbl_regoods;
-- 会员表有字段 m_id(会员 id,主键) 积分 credits
create table funcdb.tbl_member ( m_id string, credits decimal(10, 2) );
-- 销售表有字段 m_id(会员 id,外键)购买金额 buy_money
create table funcdb.tbl_sale (m_id string,buy_money decimal(10, 2));
-- 退货表中有字段 m_id(会员 id,外键)退货金额 return_money
create table funcdb.tbl_regoods ( m_id string, return_money decimal(10, 2) );
insert into funcdb.tbl_sale values ('1001', 50.12),('1002', 56.75),('1003', 35.86),('1001', 23.16),('1005', 56.22),(null,25.6),(null,33.5);
insert into funcdb.tbl_regoods values ('1001', 20.16),('1002', 23.63),('1001', 10.11),(null,23.53),(null,10.27),('1005', 0.81);
答案
insert overwrite table funcdb.tbl_member
select combo.m_id,
sum(combo.money) as credits
from (select m_id,
buy_money as money
from funcdb.tbl_sale
where m_id is not null
union all
select m_id,
(0 - return_money) as money
from funcdb.tbl_regoods
where m_id is not null
union all
select m_id,
credits
from funcdb.tbl_member
) as combo
group by combo.m_id;
13) lead
问题
用户访问日志数据如下表,需求:
-
统计 user_id 在 每个网页 停留的起始时间
-
统计某个网页用户停留的总时间
+----------+----------------------+--------+
| user_id | in_time | v_url |
+----------+----------------------+--------+
| Peter | 2015-10-12 01:10:00 | url1 |
| Peter | 2015-10-12 01:15:10 | url2 |
| Peter | 2015-10-12 01:16:40 | url3 |
| Peter | 2015-10-12 02:13:00 | url4 |
| Peter | 2015-10-12 03:14:30 | url5 |
| Marry | 2015-11-12 01:10:00 | url1 |
| Marry | 2015-11-12 01:15:10 | url2 |
| Marry | 2015-11-12 01:16:40 | url3 |
| Marry | 2015-11-12 02:13:00 | url4 |
| Marry | 2015-11-12 03:14:30 | url5 |
+----------+----------------------+--------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_user_log;
create table funcdb.tbl_user_log (
user_id string,
in_time string,
v_url string
);
insert into funcdb.tbl_user_log values
('Peter','2015-10-12 01:10:00','url1'),
('Peter','2015-10-12 01:15:10','url2'),
('Peter','2015-10-12 01:16:40','url3'),
('Peter','2015-10-12 02:13:00','url4'),
('Peter','2015-10-12 03:14:30','url5'),
('Marry','2015-11-12 01:10:00','url1'),
('Marry','2015-11-12 01:15:10','url2'),
('Marry','2015-11-12 01:16:40','url3'),
('Marry','2015-11-12 02:13:00','url4'),
('Marry','2015-11-12 03:14:30','url5');
答案
-- 1. 统计 user_id 在 每个网页 停留的起始时间
select user_id
, in_time as s_time
, lead(in_time) over (partition by user_id order by in_time) as e_time
, v_url
from funcdb.tbl_user_log;
+----------+----------------------+----------------------+--------+
| user_id | s_time | e_time | v_url |
+----------+----------------------+----------------------+--------+
| Marry | 2015-11-12 01:10:00 | 2015-11-12 01:15:10 | url1 |
| Marry | 2015-11-12 01:15:10 | 2015-11-12 01:16:40 | url2 |
| Marry | 2015-11-12 01:16:40 | 2015-11-12 02:13:00 | url3 |
| Marry | 2015-11-12 02:13:00 | 2015-11-12 03:14:30 | url4 |
| Marry | 2015-11-12 03:14:30 | NULL | url5 |
| Peter | 2015-10-12 01:10:00 | 2015-10-12 01:15:10 | url1 |
| Peter | 2015-10-12 01:15:10 | 2015-10-12 01:16:40 | url2 |
| Peter | 2015-10-12 01:16:40 | 2015-10-12 02:13:00 | url3 |
| Peter | 2015-10-12 02:13:00 | 2015-10-12 03:14:30 | url4 |
| Peter | 2015-10-12 03:14:30 | NULL | url5 |
+----------+----------------------+----------------------+--------+
-- 2. 统计某个网页用户停留的总时间
select user_id,
in_time as s_time,
lead(in_time) over (partition by user_id order by in_time) e_time,
UNIX_TIMESTAMP(lead(in_time) over (partition by user_id order by in_time), 'yyyy-MM-dd HH:mm:ss') -
UNIX_TIMESTAMP(in_time, 'yyyy-MM-dd HH:mm:ss') period,
v_url
from funcdb.tbl_user_log;
14) grouping sets
问题[[美团:介绍一下 grouping sets 的用法]
查出每个地区,每个学校,每个班级各自的总人数
+------------+--------------+----------+---------------+-----------+
| region_id | region_name | school | class | stu_name |
+------------+--------------+----------+---------------+-----------+
| 1 | 宝安 | 宝安中学 | 王者峡谷1班 | 刘备 |
| 1 | 宝安 | 宝安中学 | 王者峡谷1班 | 诸葛亮 |
| 1 | 宝安 | 宝安中学 | 王者峡谷1班 | 甄姬 |
| 1 | 宝安 | 宝安小学 | 大数据1班 | 蔡文姬 |
| 1 | 宝安 | 宝安小学 | 大数据1班 | 李四 |
| 1 | 宝安 | 宝安小学 | 大数据1班 | 王五 |
| 1 | 宝安 | 宝安小学 | 大数据1班 | 张三 |
| 3 | 罗湖 | 罗湖大学 | 召唤者峡谷1班 | 陈冠希 |
| 3 | 罗湖 | 罗湖大学 | 召唤者峡谷1班 | 渣渣辉 |
| 3 | 罗湖 | 罗湖大学 | 召唤者峡谷1班 | 古天乐 |
| 3 | 罗湖 | 罗湖大学 | 召唤者峡谷1班 | 刘德华 |
| 2 | 龙华 | 龙华小学 | 王者峡谷2班 | 关羽 |
| 2 | 龙华 | 龙华小学 | 王者峡谷2班 | 盾山 |
| 2 | 龙华 | 龙华小学 | 王者峡谷2班 | 猴子 |
| 2 | 龙华 | 龙华小学 | 王者峡谷2班 | 亚瑟 |
| 2 | 龙华 | 龙华小学 | 王者峡谷2班 | 二师兄 |
+------------+--------------+----------+---------------+-----------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_student;
create table funcdb.tbl_student (
region_id int,
region_name string,
school string,
class string,
stu_name string
);
insert into funcdb.tbl_student values (1, '宝安', '宝安中学', '王者峡谷1班', '刘备'),(1, '宝安', '宝安中学', '王者峡谷1班', '诸葛亮'),(1, '宝安', '宝安中学', '王者峡谷1班', '甄姬'),(1, '宝安', '宝安小学', '大数据1班', '蔡文姬'),(1, '宝安', '宝安小学', '大数据1班', '李四'),(1, '宝安', '宝安小学', '大数据1班', '王五'),(1, '宝安', '宝安小学', '大数据1班', '张三'),(3, '罗湖', '罗湖大学', '召唤者峡谷1班', '陈冠希'),(3, '罗湖', '罗湖大学', '召唤者峡谷1班', '渣渣辉'),(3, '罗湖', '罗湖大学', '召唤者峡谷1班', '古天乐'),(3, '罗湖', '罗湖大学', '召唤者峡谷1班', '刘德华'),(2, '龙华', '龙华小学', '王者峡谷2班', '关羽'),(2, '龙华', '龙华小学', '王者峡谷2班', '盾山'),(2, '龙华', '龙华小学', '王者峡谷2班', '猴子'),(2, '龙华', '龙华小学', '王者峡谷2班', '亚瑟'),(2, '龙华', '龙华小学', '王者峡谷2班', '二师兄');
答案
-- 查出每个地区,每个学校,每个班级各自的总人数
-- 如果使用 group by
select region_id, null school, null class, count(*) total from funcdb.tbl_student group by region_id
union
select null region_id, school, null class, count(*) total from funcdb.tbl_student group by school
union
select null region, null school, class, count(*) total from funcdb.tbl_student group by class;
-- 使用 grouping sets 简化
select region_id,
school,
class,
count(*) total
from tbl_student
group by region_id, school, class
grouping sets (( region_id), ( school), ( class));
group sets相当于多个group by 和union的结合使用
15) collect_set > left semi join
问题[美团:1.用最高效的SQL写出即访问过a又访问过b的用户 2.介绍一下 left semi join 的用法]
即访问过a又访问过b的用户
+------+------+
| uid | url |
+------+------+
| 1 | a |
| 2 | b |
| 1 | b |
| 3 | c |
| 3 | a |
| 1 | a |
| 1 | b |
+------+------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_user_visit;
create table funcdb.tbl_user_visit (
uid int,
url string
);
insert into funcdb.tbl_user_visit values (1,'a'),(2,'b'),(1,'b'),(3,'c'),(3,'a'),(1,'a'),(1,'b');
select uid, url from funcdb.tbl_user_visit;
答案
-- 8秒
select
uid
from funcdb.tbl_user_visit
group by uid
having array_contains(collect_set(url),'a') and array_contains(collect_set(url),'b');
-- 31秒
select t1.uid
from (select distinct uid,url from funcdb.tbl_user_visit where url = 'a') t1
left semi join
funcdb.tbl_user_visit t2
on t1.uid = t2.uid and t2.url = 'b';
-- 31秒
select distinct t1.uid
from funcdb.tbl_user_visit t1
left semi join
funcdb.tbl_user_visit t2
on t1.uid = t2.uid and t1.url = 'a' and t2.url = 'b';
left semi join 特点:
- 右表只能在 ON 子句中设置过滤条件,在 WHERE 子句、SELECT 子句或其他地方过滤都不行;
- left semi join 是只传递表的 join key 给 map 阶段,因此,最后 select 的结果只许出现左表。
- 遇到右表重复记录,左表会跳过,而 join 则会一直遍历。在右表有重复值得情况下 left semi join 只产生一条,join 会产生多条,所以 left semi join 的性能更高。
参阅:https://blog.csdn.net/happyrocking/article/details/79885071
16) rows between
问题[金融风控实战:连续 5 笔交易是 10 的整数倍的用户有参与赌博的风险]
统计连续 5 次交易额都是 10 的整数倍的用户
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_bill;
create table funcdb.tbl_bill (
u_name string,
price decimal(5, 2),
c_time timestamp
);
insert into funcdb.tbl_bill values ('jack', 20.13, '2021-01-25 11:00:00'),('jack', 30.42, '2021-01-25 11:01:00'),('jack', 15.35, '2021-01-25 11:02:00'),('jack', 23.63, '2021-01-25 11:03:00'),('jack', 40.00, '2021-01-25 11:04:00'),('jack', 30.00, '2021-01-25 11:05:00'),('jack', 20.00, '2021-01-25 11:06:00'),('jack', 50.00, '2021-01-25 11:07:00'),('jack', 70.00, '2021-01-25 11:08:00'),('jack', 90.00, '2021-01-25 11:09:00'),('rose', 10.00, '2021-01-25 11:10:00'),('rose', 20.00, '2021-01-25 11:11:00'),('rose', 30.00, '2021-01-25 11:12:00'),('rose', 40.00, '2021-01-25 11:13:00'),('rose', 77.32, '2021-01-25 11:14:00'),('rose', 60.00, '2021-01-25 11:15:00'),('niki', 33.54, '2021-01-25 11:16:00'),('niki', 50.00, '2021-01-25 11:17:00'),('niki', 60.00, '2021-01-25 11:18:00'),('niki', 40.00, '2021-01-25 11:19:00'),('niki', 40.00, '2021-01-25 11:20:00'),('niki', 40.00, '2021-01-25 11:21:00'),('niki', 25.21, '2021-01-25 11:22:00'),('niki', 35.43, '2021-01-25 11:23:00');
答案
select t2.u_name
from (
select t1.u_name,
t1.flag,
sum(t1.flag)
over ( distribute by t1.u_name sort by t1.c_time rows between 4 preceding and current row ) as rs_col
from (select u_name,
if(price % 10 == 0, 1, 0) as flag,
price,
c_time
from funcdb.tbl_bill) as t1) as t2
where t2.rs_col = 5
group by t2.u_name;
17) 行转列
问题:把星座和血型一样的人归类到一起输出如下结果
+---------+-------+--------+
| p_name | xz | b_type |
+---------+-------+--------+
| 孙悟空 | 白羊座 | A |
| 老王 | 射手座 | A |
| 宋宋 | 白羊座 | B |
| 猪八戒 | 白羊座 | A |
| 凤姐 | 射手座 | A |
+---------+-------+--------+
-- 输出结果
+------------+-------------+
| 星座和血型 | 姓名 |
+------------+-------------+
| 白羊座,A | 孙悟空|猪八戒 |
| 白羊座,B | 宋宋 |
| 射手座,A | 老王|凤姐 |
+------------+-------------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_person;
create table funcdb.tbl_person (
p_name string,
xz string,
b_type string
);
insert into funcdb.tbl_person values ('孙悟空', '白羊座', 'A'),('老王', '射手座', 'A'),('宋宋', '白羊座', 'B'),('猪八戒', '白羊座', 'A'),('凤姐', '射手座', 'A');
答案
select t1.xz_bt as `星座和血型`,
concat_ws('|', collect_set(t1.p_name)) as `姓名`
from (select p_name,
concat(xz, ',', b_type) as xz_bt
from funcdb.tbl_person) as t1
group by t1.xz_bt;
18) 列转行lateral view explode
问题:将电影分类中的数组数据展开
+--------------+------------------------+
| movie | category |
+--------------+------------------------+
| 《疑犯追踪》 | 悬疑,动作,科幻,剧情 |
| 《Lie to me》| 悬疑,警匪,动作,心理,剧情 |
| 《战狼2》 | 战争,动作,灾难 |
+--------------+------------------------+
-- 输出结果
+-------------------+----------+
| 电影名称 | 电影类型 |
+-------------------+----------+
| 《疑犯追踪》 | 悬疑 |
| 《疑犯追踪》 | 动作 |
| 《疑犯追踪》 | 科幻 |
| 《疑犯追踪》 | 剧情 |
| 《Lie to me》 | 悬疑 |
| 《Lie to me》 | 警匪 |
| 《Lie to me》 | 动作 |
| 《Lie to me》 | 心理 |
| 《Lie to me》 | 剧情 |
| 《战狼2》 | 战争 |
| 《战狼2》 | 动作 |
| 《战狼2》 | 灾难 |
+-------------------+----------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_movie;
create table funcdb.tbl_movie (
movie string,
category string
);
insert into funcdb.tbl_movie values ('《疑犯追踪》', '悬疑,动作,科幻,剧情'),('《Lie to me》', '悬疑,警匪,动作,心理,剧情'),('《战狼2》', '战争,动作,灾难');
答案
select tbl_movie.movie as `电影名称`,
v_tbl.new_type as `电影类型`
from funcdb.tbl_movie
lateral view explode(split(category, ',')) v_tbl as new_type;
-- 统计各电影类型的电影分别有多少部
select t1.new_type as `电影类型`,
count(t1.movie) as `数量`
from (select tbl_movie.movie,
v_tbl.new_type
from funcdb.tbl_movie
lateral view explode(split(category, ',')) v_tbl as new_type) as t1
group by t1.new_type;
19) cume_dist、percent_rank
了解:
cume_dist
:小于等于当前值的行数/分组内总行数percent_rank
:分组内当前行的排序值-1 / 分组内总行数-1
问题:统计小于等于 当前行薪水值 的人数,所占总人数的比例,占当前部门总人数的比例
+----------+----------+---------+
| dept_id | user_id | salary |
+----------+----------+---------+
| d1 | user1 | 1000 |
| d1 | user2 | 2000 |
| d1 | user3 | 3000 |
| d2 | user4 | 4000 |
| d2 | user5 | 5000 |
+----------+----------+---------+
建表语句
create database if not exists funcdb;
drop table if exists funcdb.tbl_salary;
create table funcdb.tbl_salary (
dept_id string,
user_id string,
salary int
);
insert into funcdb.tbl_salary values ('d1','user1',1000),('d1','user2',2000),('d1','user3',3000),('d2','user4',4000),('d2','user5',5000);
答案
-- 统计小于等于 当前行薪水值 的人数,所占总人数的比例,占当前部门总人数的比例
select
dept_id,
user_id,
salary,
cume_dist() over (order by salary) as ratio_all,
cume_dist() over (partition by dept_id order by salary) as ratio_group
from funcdb.tbl_salary;
select
dept_id,
user_id,
salary,
percent_rank() over (order by salary) as rn1, --分组内
rank() over (order by salary) as rn11, --分组内的rank值
sum(1) over (partition by null) as rn12, --分组内总行数
percent_rank() over (partition by dept_id order by salary) as rn2,
rank() over (partition by dept_id order by salary) as rn21,
sum(1) over (partition by dept_id) as rn22
from funcdb.tbl_salary;
扩展:MySQL练习题
01) having min
查询出每门课都大于 80 分的学生姓名
-- 表数据
+-----------+---------------+--------+
| s_name | subject_name | score |
+-----------+---------------+--------+
| 张三 | 语文 | 81 |
| 张三 | 数学 | 75 |
| 李四 | 语文 | 76 |
| 李四 | 数学 | 90 |
| 王五 | 语文 | 81 |
| 王五 | 数学 | 100 |
| 王五 | 英语 | 90 |
+-----------+---------------+--------+
-- 建表
CREATE DATABASE IF NOT EXISTS funcdb;
DROP TABLE IF EXISTS funcdb.t_score;
CREATE TABLE funcdb.t_score (s_name VARCHAR(50),subject_name VARCHAR(50),score INT);
INSERT INTO funcdb.t_score VALUES ('张三', '语文', 81 ),('张三', '数学', 75 ),('李四', '语文', 76 ),('李四', '数学', 90 ),('王五', '语文', 81 ),('王五', '数学', 100),('王五', '英语', 90 );
-- 答案
SELECT s_name FROM funcdb.t_score GROUP BY s_name HAVING MIN(score) > 80;
02) 删除表中重复记录
删除如下 学生表 中除了 id 不同, 其他都相同的学生冗余信息
+-----+----------+---------+-------+---------+--------+
| id | s_id | s_name | c_id | c_name | score |
+-----+----------+---------+-------+---------+--------+
| 1 | 2005001 | 张三 | 0001 | 数学 | 69 |
| 2 | 2005002 | 李四 | 0001 | 数学 | 89 |
| 3 | 2005001 | 张三 | 0001 | 数学 | 69 |
+-----+----------+---------+-------+---------+--------+
-- 建表
CREATE DATABASE IF NOT EXISTS funcdb;
DROP TABLE IF EXISTS funcdb.t_student;
CREATE TABLE funcdb.t_student (id INT,s_id VARCHAR(50),s_name VARCHAR(50),c_id VARCHAR(50),c_name VARCHAR(50),score INT);
-- CREATE TABLE funcdb.t_student (id INT,s_id STRING,s_name STRING,c_id STRING,c_name STRING,score INT); -- hive建表
INSERT INTO funcdb.t_student VALUES (1,'2005001', '张三','0001','数学',69 ),(2,'2005002', '李四','0001','数学',89 ),(3,'2005001', '张三','0001','数学',69 );
-- mysql高版本支持
DELETE FROM funcdb.t_student WHERE id NOT IN (SELECT MIN(id) FROM funcdb.t_student GROUP BY s_id,s_name,c_id,c_name,score);
03) 球队比赛组合
一个叫 t_team 的表,里面只有一个字段 t_id,一共有 4 条纪录,分别是 a,b,c,d,对应四个球队,现在四个球队进行比赛,用一条 sql 语句显示所有可能的比赛组合
CREATE DATABASE IF NOT EXISTS funcdb;
DROP TABLE IF EXISTS funcdb.t_team;
CREATE TABLE funcdb.t_team (t_id VARCHAR(50));
INSERT INTO funcdb.t_team VALUES ('a'),('b'),('c'),('d');
SELECT t1.t_id, t2.t_id
FROM funcdb.t_team AS t1
JOIN
funcdb.t_team AS t2
ON t1.t_id < t2.t_id;
04) 当前行作为子查询条件
怎么把如下 月销售额表 查成下面的结果表
-- 原表
+-----------+------------+---------+
| year_num | month_num | amount |
+-----------+------------+---------+
| 1991 | 1 | 1.10 |
| 1991 | 2 | 1.20 |
| 1991 | 3 | 1.30 |
| 1991 | 4 | 1.40 |
| 1992 | 1 | 2.10 |
| 1992 | 2 | 2.20 |
| 1992 | 3 | 2.30 |
| 1992 | 4 | 2.40 |
+-----------+------------+---------+
-- 结果表
+----------+------+------+------+------+
| year_num | m1 | m2 | m3 | m4 |
+----------+------+------+------+------+
| 1991 | 1.10 | 1.20 | 1.30 | 1.40 |
| 1992 | 2.10 | 2.20 | 2.30 | 2.40 |
+----------+------+------+------+------+
-- mysql 建表
CREATE DATABASE IF NOT EXISTS funcdb;
DROP TABLE IF EXISTS funcdb.t_month_sale;
CREATE TABLE funcdb.t_month_sale
(
year_num VARCHAR(20),
month_num VARCHAR(20),
amount DECIMAL(10, 2)
);
INSERT INTO funcdb.t_month_sale VALUES ('1991','1',1.1),('1991','2',1.2),('1991','3',1.3),('1991','4',1.4),('1992','1',2.1),('1992','2',2.2),('1992','3',2.3),('1992','4',2.4);
-- 答案
SELECT out_tbl.year_num,
(SELECT amount FROM funcdb.t_month_sale AS in_tbl WHERE in_tbl.month_num = '1' AND in_tbl.year_num = out_tbl.year_num) AS m1,
(SELECT amount FROM funcdb.t_month_sale AS in_tbl WHERE in_tbl.month_num = '2' AND in_tbl.year_num = out_tbl.year_num) AS m2,
(SELECT amount FROM funcdb.t_month_sale AS in_tbl WHERE in_tbl.month_num = '3' AND in_tbl.year_num = out_tbl.year_num) AS m3,
(SELECT amount FROM funcdb.t_month_sale AS in_tbl WHERE in_tbl.month_num = '4' AND in_tbl.year_num = out_tbl.year_num) AS m4
FROM funcdb.t_month_sale AS out_tbl
GROUP BY year_num ;
3.4.1 如何分析用户活跃?
在启动日志中统计不同设备 id 出现次数。
3.4.2 如何分析用户新增?vivo
用活跃用户表 left join 用户新增表,用户新增表中 mid 为空的即为用户新增。