文章目录
项目地址
- 项目作者:Ankit Bansal
- 项目一地址:
- 项目二地址:
https://www.youtube.com/watch?v=iDBznYIMuw4&list=LL&index=1
一、SQL Complex
1. 计算朋友的得分大于100的人(Complex SQL 6)
- 建表语句
drop table friend
Create table friend (pid int, fid int)
insert into friend (pid , fid ) values ('1','2');
insert into friend (pid , fid ) values ('1','3');
insert into friend (pid , fid ) values ('2','1');
insert into friend (pid , fid ) values ('2','3');
insert into friend (pid , fid ) values ('3','5');
insert into friend (pid , fid ) values ('4','2');
insert into friend (pid , fid ) values ('4','3');
insert into friend (pid , fid ) values ('4','5');
drop table person
create table person (PersonID int, Name varchar(50), Score int)
insert into person(PersonID,Name ,Score) values('1','Alice','88')
insert into person(PersonID,Name ,Score) values('2','Bob','11')
insert into person(PersonID,Name ,Score) values('3','Devis','27')
insert into person(PersonID,Name ,Score) values('4','Tara','45')
insert into person(PersonID,Name ,Score) values('5','John','63')
select * from person
select * from friend
- 我的sql
select f.pid,sum(p.Score) total_score from person p
inner join friend f on p.PersonID =f.fid
group by f.pid having sum(p.Score) >100
- 结果
2. 计算没有被ban的司机或者用户的行程的取消率(Complex SQL 7)
- 建表
Create table Trips (id int, client_id int, driver_id int, city_id int, status varchar(50), request_at varchar(50));
Create table Users (users_id int, banned varchar(50), role varchar(50));
Truncate table Trips;
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('1', '1', '10', '1', 'completed', '2013-10-01');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('2', '2', '11', '1', 'cancelled_by_driver', '2013-10-01');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('3', '3', '12', '6', 'completed', '2013-10-01');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('4', '4', '13', '6', 'cancelled_by_client', '2013-10-01');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('5', '1', '10', '1', 'completed', '2013-10-02');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('6', '2', '11', '6', 'completed', '2013-10-02');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('7', '3', '12', '6', 'completed', '2013-10-02');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('8', '2', '12', '12', 'completed', '2013-10-03');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('9', '3', '10', '12', 'completed', '2013-10-03');
insert into Trips (id, client_id, driver_id, city_id, status, request_at) values ('10', '4', '13', '12', 'cancelled_by_driver', '2013-10-03');
Truncate table Users;
insert into Users (users_id, banned, role) values ('1', 'No', 'client');
insert into Users (users_id, banned, role) values ('2', 'Yes', 'client');
insert into Users (users_id, banned, role) values ('3', 'No', 'client');
insert into Users (users_id, banned, role) values ('4', 'No', 'client');
insert into Users (users_id, banned, role) values ('10', 'No', 'driver');
insert into Users (users_id, banned, role) values ('11', 'No', 'driver');
insert into Users (users_id, banned, role) values ('12', 'No', 'driver');
insert into Users (users_id, banned, role) values ('13', 'No', 'driver');
- 解答
with unbaned_table as (
select t.* from Trips t
inner join Users u on t.client_id = u.users_id
inner join Users u2 on t.driver_id = u2.users_id
where u.banned = 'No' and u2.banned = 'No'
),
status_talbe as(
select status,COUNT(status) status_num from unbaned_table group by status)
select
CAST(sum(case when status != 'completed' then status_num end) AS float)/ CAST(sum(status_num) AS float)
from status_talbe
3. 找出每组的最高得分和该player_id(Complex SQL 8)
- 建表
create table players
(player_id int,
group_id int)
insert into players values (15,1);
insert into players values (25,1);
insert into players values (30,1);
insert into players values (45,1);
insert into players values (10,2);
insert into players values (35,2);
insert into players values (50,2);
insert into players values (20,3);
insert into players values (40,3);
create table matches
(
match_id int,
first_player int,
second_player int,
first_score int,
second_score int)
insert into matches values (1,15,45,3,0);
insert into matches values (2,30,25,1,2);
insert into matches values (3,30,15,2,0);
insert into matches values (4,40,20,5,2);
insert into matches values (5,35,50,1,1);
精选播放列表
- 我写的sql,只能显示出最大分数,但是无法显示出最大分数的的player_id
with all_score as(
select first_player as player, first_score score from matches
union all
select second_player as player, second_score score from matches
),
final as(
select player as player_id,SUM(score) score from all_score group by player)
,
final2 as (
select p.player_id as pid,p.group_id as gid ,f.score as f_score from players p
inner join final f on p.player_id = f.player_id
)
select gid,max(f_score) as max_score, min(f_score) as min_score from final2
group by gid
- 修改,在max的分组的问题中,找单独的最高的一条数据的id,可以使用rank()来处理
with all_score as(
select first_player as player, first_score score from matches
union all
select second_player as player, second_score score from matches
),
final as(
select player as player_id,SUM(score) score from all_score group by player)
,
final2 as (
select p.player_id as pid,p.group_id as gid ,f.score as f_score from players p
inner join final f on p.player_id = f.player_id
),
final_rank as(
select *,rank() over(partition by gid order by f_score desc,pid asc) as rn from final2)
select * from final_rank where rn =1
4.计算连续成功天数(10)
- 找到连续成功或者失败的天数,并且只显示连续成功的起始日期和最后的日期。
- 难点:判断日期是否连续
- 建表
create table tasks (
date_value date,
state varchar(10)
);
insert into tasks values ('2019-01-01','success'),('2019-01-02','success'),('2019-01-03','success'),('2019-01-04','fail')
,('2019-01-05','fail'),('2019-01-06','success')
- 答案
with final as(
select *,
row_number() over(partition by state order by date_value) as rn,
dateadd(day,-1* row_number() over(partition by state order by date_value),date_value) as group_date
from tasks
order by date_value)
select group_date,state,min(date_value) as begin_date,max(date_value) as end_date from final group by group_date,state
order by group_date
5. 平台购买记录(11)
- 找到每天3个平台的消费人数
- 难点:both平台是没有的,需要自己构建
- 建表
create table spending
(
user_id int,
spend_date date,
platform varchar(10),
amount int
);
insert into spending values(1,'2019-07-01','mobile',100),(1,'2019-07-01','desktop',100),(2,'2019-07-01','mobile',100)
,(2,'2019-07-02','mobile',100),(3,'2019-07-01','desktop',100),(3,'2019-07-02','desktop',100);
二、SQL for Data Analysis in 2 hours
- 建表语句
https://github.com/CXTV/ankit_sql/blob/main/sql_for_data_analysis_in_2_hours.sql
2.1 排序
- 先根据
order_data
进行正序,如果order_data
里有相同的日期,在根据sales
逆序
select * from orders_data
order by order_date,sales desc
- 可以看出来表中
order_data
是从小到大的,当遇到相同的日期,例如2019-06-09
则根据sales
字段,从大到小,再次进行了排序
执行优先级:from > select > order by > top
2.2 创建一个新的列
select
*,
profit/sales as ratio
from orders_data
2.3 聚合
- 如果我们需要对原表的数据一行一行的过滤则使用
where
- 如果对aggregate的数据进行过滤,由于不是一行一行的数据,所以只能使用
having
- 计算出所有城市的销量和
select city ,sum(sales) as total_sales
from orders_data
group by city
2. 如果我们想计算出sum(sales)大于500的城市有哪些,我们就不能使用where,因为这是我们aggregate之后的新的查询,所以只能使用having
select city ,sum(sales) as total_sales
from orders_data
group by city
having sum(sales) >500
- 如果我们想筛选出city=west的城市的sum(sales),需要在原表里,先一行一行的将城市在west的数据筛选出来,在进行下一步筛选,
select city ,sum(sales) as total_sales
from orders_data
where region = 'West'
group by city
- having只能筛选aggregate之后,新表里存在的数据,例如,上面,新表只有两行数据,city和sum(sales)
select city ,sum(sales) as total_sales
from orders_data
group by city
having city = 'Chicago'
- order by
select city ,sum(sales) as total_sales
from orders_data
group by city
order by total_sales
执行顺序:from > where > group by > having > select > order by > top
- 这也是为什么 ② 中,
having sum(sales) >500
不能使用上面select 的sum(sales) as total_sales
的原因,而⑤中可以直接使用order by total_sales