【课程简介】
SQL不仅在工作中使用频繁,在IT面试中SQL题出现频率也很高,而且题目占比越来越大,经常有同学面试挂在SQL题不会做痛失offer。本次课程将从技术使用跟面试真题两方面展开讲解,分享高效的SQL代码写法,告别拐弯抹角长篇大论轻松满足工作需求;并且将40多家大厂面试真题整理出SQL题集,把常见专题总结出简易解法,套上公式就会做。助力大家轻松掌握SQL!
【教程推荐】IT面试真题详解SQL(一)
【主讲内容】
1.怎样套上公式就会做
2.40+真实大厂原题解法
3.快如闪电高效SQL写法
【主讲教师】
金牌讲师:陈老师
前中国平安高级技术专家,有9年大数据研发经验,精通SQL授课通俗易懂。
1. SQL执行顺序
上面的SQL语句的逻辑执行顺序是: from (去加载table1 和 table2这2个表 ) -> on -> join -> where -> group by -> 聚合函数count,sum -> having -> select 后面的普通字段和聚合结果 -> distinct -> order by -> limit。(注:标准SQL中having先于select执行;Hive/MySQL扩展允许having中引用select里的别名,如下例中的cnt。)
-- Example: an aggregate query that exercises every clause in the
-- execution order described above.
select
    a.sex,
    b.city,
    count(1)    as cnt,
    sum(salary) as sum1
from table1 as a
inner join table2 as b
    on a.id = b.id
where a.name = b.name
group by
    a.sex,
    b.city
having cnt >= 2
order by
    a.sex,
    b.city
limit 10
-- Alternatively: a DISTINCT query, which follows the same order with
-- the deduplication step applied after SELECT.
select distinct
    a.sex,
    b.city,
    a.age
from table1 as a
inner join table2 as b
    on a.id = b.id
where a.name = b.name
order by
    a.sex,
    b.city
limit 10
--Discussion: does ON or WHERE run first?
--With LEFT JOIN the two placements below give DIFFERENT results,
--which shows that ON is applied first, then WHERE.
select *
from table1 a left join table2 b
on a.id=b.id
where a.name=b.name;
--The next query can return MORE rows than the one above: a predicate in
--ON keeps unmatched left-side rows (NULL-padded), while the same
--predicate in WHERE filters them out after the join.
--NOTE(review): this second statement is interrupted by two lines of
--course notes; its ON/AND clauses continue right after them.
select *
from table1 a left join table2 b
2. Hive 10题(环境准备)。(注:本节文字插在了上方 left join 示例的中间,紧随其后的 on/and 两行仍属于上一条 SQL 语句。)
先配置环境:在 PyCharm、DataGrip 或 IDEA 中配置 Hive 数据源;也可以配置一个 SparkSQL 数据源来加快执行速度。如果配置 Hive 数据源,需要提前启动 HDFS 和 YARN、Hive 的 metastore 以及 hiveserver2。
on a.id=b.id
and a.name=b.name;
--With INNER JOIN the two placements give the SAME result: for inner
--joins, a join condition in ON and a filter in WHERE are equivalent
--(only matched rows survive either way).
select *
from table1 a
join table2 b
on a.id=b.id
where a.name=b.name;
select *
from table1 a
join table2 b
on a.id=b.id
and a.name=b.name;
# Start HDFS and YARN.
start-all.sh
# Hive metastore service.
# BUG FIX: the original used "2>&1 > file", which dups stderr to the
# terminal BEFORE stdout is redirected, so errors never reach the log.
# "> file 2>&1" captures both streams in the log file.
nohup /export/server/hive/bin/hive --service metastore > /tmp/hivemetastore.log 2>&1 &
# Hive hiveserver2 service.
# NOTE: hiveserver2 takes about 2 minutes after startup before it
# accepts connections.
nohup /export/server/hive/bin/hive --service hiveserver2 > /tmp/hivehiveserver2.log 2>&1 &
如果启动 hiveserver2 时遇到内存不足等问题,可按下面的办法解决。如果配置 SparkSQL 数据源,则需要提前启动 HDFS、Hive 的 metastore 和 Spark 的 Thriftserver 服务。注意:下面是 Spark3 集成 Hive3 需要的 jar 包;如果是 Spark2 集成 Hive2,则所需 jar 包不一样。
hive/conf/hive-env.sh中加入
# BUG FIX: the two exports were fused on one line with no separator,
# which is not a valid shell statement. One assignment per line.
export HADOOP_CLIENT_OPTS=" -Xmx512m"
export HADOOP_HEAPSIZE=1024
改完重启hiveserver2
# Start HDFS and YARN.
start-all.sh
# Hive metastore service.
# BUG FIX: redirection order — "2>&1" must come AFTER "> file",
# otherwise stderr still goes to the terminal instead of the log.
nohup /export/server/hive/bin/hive --service metastore > /tmp/hivemetastore.log 2>&1 &
# Spark Thriftserver (SparkSQL endpoint).
# BUG FIX: the continuation backslashes had been collapsed onto a single
# line ("... \ --hiveconf ..."), turning each "\ " into a bogus
# escaped-space argument. Restored the proper multi-line form.
/export/server/spark/sbin/start-thriftserver.sh \
  --hiveconf hive.server2.thrift.port=10001 \
  --hiveconf hive.server2.thrift.bind.host=node1 \
  --master local[*]
-- Verify the connection works by listing databases.
show databases ;
-- Create the working database for these exercises and switch to it.
create database if not exists test_sql; use test_sql;
-- Some statements trigger MapReduce jobs and are slow on small data;
-- enable Hive's local-mode execution optimization.
set hive.exec.mode.local.auto=true;-- (default is false) -- Question 1: visit-count statistics
-- Question 1: per-user monthly visit totals plus a running cumulative sum.
CREATE TABLE test_sql.test1 ( userId string, visitDate string, visitCount INT ) ROW format delimited FIELDS TERMINATED BY "\t";
INSERT overwrite TABLE test_sql.test1
VALUES
( 'u01', '2017/1/21', 5 ), ( 'u02', '2017/1/23', 6 ), ( 'u03', '2017/1/22', 8 ), ( 'u04', '2017/1/20', 3 ), ( 'u01', '2017/1/23', 6 ), ( 'u01', '2017/2/21', 8 ), ( 'u02', '2017/1/23', 6 ), ( 'u01', '2017/2/22', 4 );
-- Outer query: running total of sum1 over months per user. With ORDER BY
-- and no explicit frame the default frame ends at the current row, which
-- here equals the ROWS frame shown in the comment because each month
-- appears once per user after the inner GROUP BY.
select *, sum(sum1) over(partition by userid order by month1 /*rows between unbounded preceding and current row*/ ) as `累积` from
-- Inner query: normalize '2017/1/21' to month key '2017-01' and total
-- the visits per user per month.
(select userid, date_format(replace(visitdate,'/','-'),'yyyy-MM') as month1, sum(visitcount) sum1
from test_sql.test1
group by userid, date_format(replace(visitdate,'/','-'),'yyyy-MM')) as t;
-- Question 2: e-commerce Top-K statistics (shop visits).
CREATE TABLE test_sql.test2 ( user_id string, shop string ) ROW format delimited FIELDS TERMINATED BY '\t';
INSERT INTO TABLE test_sql.test2 VALUES
( 'u1', 'a' ), ( 'u2', 'b' ), ( 'u1', 'b' ), ( 'u1', 'a' ), ( 'u3', 'c' ), ( 'u4', 'b' ), ( 'u1', 'a' ), ( 'u2', 'c' ), ( 'u5', 'b' ), ( 'u4', 'b' ), ( 'u6', 'c' ), ( 'u2', 'c' ), ( 'u1', 'b' ), ( 'u2', 'a' ), ( 'u2', 'a' ), ( 'u3', 'a' ), ( 'u5', 'a' ), ( 'u5', 'a' ), ( 'u5', 'a' );
--(1) UV (unique visitors) per shop.
-- PV = page views: every visit to the site counts.
-- UV = unique visitors: distinct users, deduplicated.
-- (Part (2), top-3 visitors per shop with shop / user_id / visit count,
-- is answered by the row_number() query further below.)
select shop, count(distinct user_id) as uv
from test_sql.test2 group by shop ;
--Equivalent decomposition of the query above.
--DISTINCT over several columns deduplicates on the combination.
select shop, count(user_id) as uv
from
(select distinct shop, user_id
from test_sql.test2 ) as t
group by shop ;
--Also equivalent, using GROUP BY for the deduplication step:
select shop, count(user_id) as uv
from
(select shop, user_id
from test_sql.test2 group by shop, user_id) as t
group by shop ;
-- (2) Top-3 visitors per shop: shop, user_id, visit count.
-- BUG FIX: the filter read "where t2.rn3", which is not a valid
-- predicate (the comparison operator was lost); top-3 needs rn <= 3.
select * from
(select *, row_number() over (partition by shop order by cnt desc) as rn
from
(select shop,user_id,count(1) as cnt from test_sql.test2 group by shop,user_id ) as t) t2
where t2.rn<=3;
-- Question 3: order statistics.
CREATE TABLE test_sql.test3 ( dt string, order_id string, user_id string, amount DECIMAL ( 10, 2 ) ) ROW format delimited FIELDS TERMINATED BY '\t';
INSERT overwrite TABLE test_sql.test3 VALUES
('2017-01-01','10029028','1000003251',33.57), ('2017-01-01','10029029','1000003251',33.57), ('2017-01-01','100290288','1000003252',33.57), ('2017-02-02','10029088','1000003251',33.57), ('2017-02-02','100290281','1000003251',33.57), ('2017-02-02','100290282','1000003253',33.57), ('2017-11-02','10290282','100003253',234), ('2018-11-02','10290284','100003243',234);
-- (1) For 2017: per-month order count, distinct user count, total amount.
select date_format(dt,'yyyy-MM') as month1, count(distinct order_id) as cnt1, count(distinct user_id) as cnt2, sum(amount) as amt from test_sql.test3
where year(dt)=2017
group by date_format(dt,'yyyy-MM');
-- (2) New customers of 2017-11: users whose FIRST-ever order month
-- is 2017-11.
-- Consistency fix: "test3" is now qualified as test_sql.test3 like every
-- other statement in this file, so the queries work regardless of the
-- session's current database.
select count(user_id) cnt from
(select user_id, min(date_format(dt,'yyyy-MM')) min_month
from test_sql.test3 group by user_id) as t where min_month='2017-11';
-- Generalization shown in the course: new-customer count for EVERY month.
select min_month, count(user_id) cnt
from (select user_id, min(date_format(dt, 'yyyy-MM')) min_month from test_sql.test3 group by user_id) as t
group by min_month;
-- Question 4: big-data sort statistics.
CREATE TABLE test_sql.test4user (user_id string,name string,age int);
CREATE TABLE test_sql.test4log (user_id string,url string);
INSERT INTO TABLE test_sql.test4user VALUES('001','u1',10), ('002','u2',15), ('003','u3',15), ('004','u4',20), ('005','u5',25), ('006','u6',35), ('007','u7',40), ('008','u8',45), ('009','u9',50), ('0010','u10',65);
INSERT INTO TABLE test_sql.test4log VALUES('001','url1'), ('002','url1'), ('003','url2'), ('004','url3'), ('005','url3'), ('006','url1'), ('007','url5'), ('008','url7'), ('009','url5'), ('0010','url1');
select * from test_sql.test4user ;
select * from test_sql.test4log ;
--Scenario: a 50-million-row user file (user_id,name,age) and a
-- 200-million-row movie-view log (user_id,url); rank view counts by
-- age bracket.
--Rounding helpers available: round, floor, ceil.
select *, round(x,0) as r,--round to nearest integer
floor(x) as f,--round down
ceil(x) as c--round up
from
(select 15/10 as x union all
select 18/10 as x union all
select 24/10 as x union all
select 27/10 as x ) as t;
-- Question 4 answer: total movie views per 10-year age bracket,
-- sorted by the total in descending order.
with log_cnt as (
    -- Pre-aggregate the big log table before the join to shrink it.
    select user_id, count(url) as cnt
    from test_sql.test4log
    group by user_id
),
bracketed as (
    -- Label each user with a bracket such as '10-20', '20-30', ...
    select
        u.*,
        l.cnt,
        concat(floor(age/10)*10,'-',floor(age/10)*10+10) as type
    from test_sql.test4user as u
    join log_cnt as l
        on u.user_id = l.user_id
)
select type, sum(cnt) as sum1
from bracketed
group by type
order by sum1 desc;
-- 第5题:活跃用户统计
CREATE TABLE test5( dt string, user_id string, age int) ROW format delimited fields terminated BY ',';
INSERT overwrite TABLE test_sql.test5 VALUES ('2019-02-11','test_1',23), ('2019-02-11','test_2',19), ('2019-02-11','test_3',39), ('2019-02-11','test_1',23), ('2019-02-11','test_3',39), ('2019-02-11','test_1',23), ('2019-02-12','test_2',19), ('2019-02-13','test_1',23), ('2019-02-15','test_2',19), ('2019-02-16','test_2',19);
select * from test_sql.test5 order by dt,user_id;
--Question 5: from the log, compute the count and average age of ALL
--users and of ACTIVE users (a user with visit records on two
--consecutive days).
-- Expected output (type / count / avg age):
-- '所有用户' 3 27 ; '活跃用户' 1 19
-- Approach 1 (t1..t6): date_sub(dt, rn) maps every run of consecutive
-- days to one anchor date, so grouping by (anchor, user) and keeping
-- cnt>=2 finds users with a streak of at least two days.
with t1 as (select distinct dt, user_id,age from test_sql.test5), t2 as (select *,row_number() over (partition by user_id order by dt) as rn from t1 ), t3 as (select *,date_sub(dt,rn) as dt2 from t2), t4 as (select dt2,user_id,age,count(1) cnt from t3 group by dt2,user_id,age), t5 as (select * from t4 where cnt>=2), t6 as (select distinct user_id,age from t5)
select '所有用户' as type, count(user_id) cnt,avg(age) as avg_age
from (select distinct user_id,age from test_sql.test5) t union all
select '活跃用户' as type, count(user_id) cnt,avg(age) as avg_age from t6;
-- Approach 2 for the two-consecutive-day check: a user is active when
-- dt+1 equals the next visit date from lead().
with t1 as (select distinct dt, user_id from test_sql.test5), t2 as (select *, date_add(dt,1) as dt2, lead(dt,1)over(partition by user_id order by dt) as dt3 from t1)
select count(distinct user_id) from t2 where dt2=dt3;
-- Question 6: e-commerce purchase-amount statistics.
CREATE TABLE test_sql.test6 ( userid string, money decimal(10,2), paymenttime string, orderid string);
INSERT INTO TABLE test_sql.test6 VALUES('001',100,'2017-10-01','123'), ('001',200,'2017-10-02','124'), ('002',500,'2017-10-01','125'), ('001',100,'2017-11-01','126');
select * from test_sql.test6 order by userid,paymenttime;
--For every user, the amount of their FIRST purchase in October of
--this year (2017-10). row_number() ordered by paymenttime picks the
--earliest qualifying order per user.
select userid,paymenttime,money
from
(select *, row_number() over (partition by userid order by paymenttime) as rn from test_sql.test6 where date_format(paymenttime,'yyyy-MM')='2017-10' ) as t
where t.rn=1
;
-- Question 7: education-domain SQL practice (library schema).
CREATE TABLE test_sql.book(book_id string, `SORT` string, book_name string, writer string, OUTPUT string, price decimal(10,2));
INSERT INTO TABLE test_sql.book VALUES
('001','TP391','信息处理','author1','机械工业出版社','20'), ('002','TP392','数据库','author12','科学出版社','15'), ('003','TP393','计算机网络','author3','机械工业出版社','29'), ('004','TP399','微机原理','author4','科学出版社','39'), ('005','C931','管理信息系统','author5','机械工业出版社','40'), ('006','C932','运筹学','author6','科学出版社','55');
CREATE TABLE test_sql.reader (reader_id string, company string, name string, sex string, grade string, addr string);
INSERT INTO TABLE test_sql.reader VALUES
('0001','阿里巴巴','jack','男','vp','addr1'), ('0002','百度','robin','男','vp','addr2'), ('0003','腾讯','tony','男','vp','addr3'), ('0004','京东','jasper','男','cfo','addr4'), ('0005','网易','zhangsan','女','ceo','addr5'), ('0006','搜狐','lisi','女','ceo','addr6');
CREATE TABLE test_sql.borrow_log(reader_id string, book_id string, borrow_date string);
-- NOTE(review): the last row's borrow_date '2019-17-13' has month 17 —
-- an invalid calendar date, probably a typo for '2019-07-13'; confirm
-- against the original exercise before relying on date functions here.
INSERT INTO TABLE test_sql.borrow_log VALUES ('0001','002','2019-10-14'), ('0002','001','2019-10-13'), ('0003','005','2019-09-14'), ('0004','006','2019-08-15'), ('0005','003','2019-10-10'), ('0006','004','2019-17-13');
select * from test_sql.book;
select * from test_sql.reader;
select * from test_sql.borrow_log;
--(8) For data safety the borrow log must be backed up periodically.
-- With ONE SQL statement, create table BORROW_LOG_BAK with exactly the
-- same structure as borrow_log AND copy all existing rows into it —
-- CREATE TABLE ... AS SELECT (CTAS) does both at once.
create table test_sql.BORROW_LOG_BAK as select * from test_sql.borrow_log;
select * from test_sql.BORROW_LOG_BAK;
--(9) Migrate data from Oracle into Hive: write the Hive DDL for the
-- "book" table. Requirements: column delimiter '|'; data loaded
-- externally; partitioned by month_part and day_part.
CREATE TABLE test_sql.book2
( book_id string, `SORT` string, book_name string, writer string, OUTPUT string, price decimal(10, 2) )partitioned by (month_part string,day_part string ) row format delimited fields terminated by '|';
--(10) Table A has a month partition 201505; set user_dinner to
-- 'bonc8920' for user_id 20000 and leave all other rows unchanged.
-- Hive has no UPDATE on plain tables, so rewrite the partition instead.
--A -- user_id user_dinner part -- 20000 aaaaa 201505 -- 30000 bbbbb 201505
create table A (user_id int,user_dinner string) partitioned by (part string);
insert overwrite table A partition (part = '201505')
values (20000, 'aaaaa'), (30000, 'bbbbb'), (40000, 'ccccc');
select * from A;
-- The statement below is the RDBMS-style UPDATE that Hive cannot run;
-- kept for reference only:
--update A set user_dinner='bonc8920' where user_id=20000;
-- Workaround: overwrite the partition with "changed rows UNION ALL
-- unchanged rows".
-- NOTE(review): the user_id!=20000 branch silently drops rows whose
-- user_id is NULL (NULL comparisons are UNKNOWN); fine for this data
-- set, but verify before reusing on nullable keys.
insert overwrite table A partition (part = '201505')
select user_id, 'bonc8920' as user_dinner from A where user_id=20000 and part = '201505'
union all
select user_id, user_dinner from A where user_id!=20000 and part = '201505' ;
-- Question 8: service-log SQL statistics.
CREATE TABLE test_sql.test8(`date` string, interface string, ip string);
INSERT INTO TABLE test_sql.test8 VALUES
('2016-11-09 11:22:05','/api/user/login','110.23.5.23'), ('2016-11-09 11:23:10','/api/user/detail','57.3.2.16'), ('2016-11-09 23:59:40','/api/user/login','200.6.5.166'), ('2016-11-09 11:14:23','/api/user/login','136.79.47.70'), ('2016-11-09 11:15:23','/api/user/detail','94.144.143.141'), ('2016-11-09 11:16:23','/api/user/login','197.161.8.206'), ('2016-11-09 12:14:23','/api/user/detail','240.227.107.145'), ('2016-11-09 13:14:23','/api/user/login','79.130.122.205'), ('2016-11-09 14:14:23','/api/user/detail','65.228.251.189'), ('2016-11-09 14:15:23','/api/user/detail','245.23.122.44'), ('2016-11-09 14:17:23','/api/user/detail','22.74.142.137'), ('2016-11-09 14:19:23','/api/user/detail','54.93.212.87'), ('2016-11-09 14:20:23','/api/user/detail','218.15.167.248'), ('2016-11-09 14:24:23','/api/user/detail','20.117.19.75'), ('2016-11-09 15:14:23','/api/user/login','183.162.66.97'), ('2016-11-09 16:14:23','/api/user/login','108.181.245.147'), ('2016-11-09 14:17:23','/api/user/login','22.74.142.137'), ('2016-11-09 14:19:23','/api/user/login','22.74.142.137')<br/>
-- Question 9: recharge-log SQL practice.
CREATE TABLE test_sql.test9( dist_id string COMMENT '区组id', account string COMMENT '账号', `money` decimal(10,2) COMMENT '充值金额', create_time string COMMENT '订单时间');
INSERT INTO TABLE test_sql.test9 VALUES ('1','11',100006,'2019-01-02 13:00:01'), ('1','22',110000,'2019-01-02 13:00:02'), ('1','33',102000,'2019-01-02 13:00:03'), ('1','44',100300,'2019-01-02 13:00:04'), ('1','55',100040,'2019-01-02 13:00:05'), ('1','66',100005,'2019-01-02 13:00:06'), ('1','77',180000,'2019-01-03 13:00:07'), ('1','88',106000,'2019-01-02 13:00:08'), ('1','99',100400,'2019-01-02 13:00:09'), ('1','12',100030,'2019-01-02 13:00:10'), ('1','13',100003,'2019-01-02 13:00:20'), ('1','14',100020,'2019-01-02 13:00:30'), ('1','15',100500,'2019-01-02 13:00:40'), ('1','16',106000,'2019-01-02 13:00:50'), ('1','17',100800,'2019-01-02 13:00:59'), ('2','18',100800,'2019-01-02 13:00:11'), ('2','19',100030,'2019-01-02 13:00:12'), ('2','10',100000,'2019-01-02 13:00:13'), ('2','45',100010,'2019-01-02 13:00:14'), ('2','78',100070,'2019-01-02 13:00:15');
select * from test_sql.test9 order by dist_id , money desc;
--For 2019-01-02, find the account with the LARGEST recharge amount in
--each dist_id group; output dist_id, account, amount, recharge time.
-- row_number() returns exactly one row per group; if ties on the max
-- amount should all be returned, use rank() instead.
select * from
(select *, row_number() over (partition by dist_id order by money desc) rn
from test_sql.test9 where to_date(create_time)='2019-01-02') t
where t.rn=1;
-- Question 10: e-commerce grouped Top-K practice.
CREATE TABLE test_sql.test10( `dist_id` string COMMENT '区组id', `account` string COMMENT '账号', `gold` int COMMENT '金币');
-- NOTE(review): the answer query for this question is missing from this
-- chunk — presumably it mirrors question 2's row_number() Top-K pattern;
-- confirm against the full course material.
INSERT INTO TABLE test_sql.test10 VALUES ('1','77',18), ('1','88',106), ('1','99',10), ('1','12',13), ('1','13',14), ('1','14',25), ('1','15',36), ('1','16',12), ('1','17',158), ('2','18',12), ('2','19',44), ('2','10',66), ('2','45',80), ('2','78',98);
附:行转列(pivot)示例数据。原表:
year month amount
1991 1 1.1
1991 2 1.2
1991 3 1.3
1991 4 1.4
1992 1 2.1
1992 2 2.2
1992 3 2.3
1992 4 2.4
目标结果(每年一行,m1~m4为各月金额):
year m1 m2 m3 m4
1991 1.1 1.2 1.3 1.4
1992 2.1 2.2 2.3 2.4
SQL强化面试题(pdf版)