第6章子查询、表表达式和排名函数_子查询实现排名查询-CSDN博客

本文链接：https://blog.csdn.net/qq_20227495/article/details/85087473

6.1 子查询
6.1.1 独立子查询
关系除法，返回每个美国雇员至少为其处理过一个订单的所有客户，解法1：
select custid
from Sales.Orders
where empid in (select empid from HR.Employees where country =‘USA’)
group by custid
having count(distinct empid)=(select count(empid) from HR.Employees where country =‘USA’)

返回每个月最后实际订单日期发生的订单
select orderid,custid,empid,orderdate,year(orderdate) as orderyear,month(orderdate) as ordermonth
from sales.Orders
where orderdate in
(select max(orderdate) from Sales.orders group by year(orderdate),month(orderdate))
order by orderdate

6.1.2 相关子查询
从每一组分离一行和决胜属性
select orderid,custid,empid,orderdate,requireddate
from Sales.Orders as a
where a.orderdate=(select max(orderdate) from Sales.Orders b where a.empid=b.empid)
and a.orderid=(select max(orderid) from Sales.Orders c where c.empid=a.empid and c.orderdate=a.orderdate)

EXISTS
–返回来自spain，但是做过订单的客户
select custid,companyname from
Sales.Customers a
where country=‘Spain’
and exists(select * from Sales.Orders b where a.custid=b.custid)

val not in list, 返回结果是not true 和not unknown

最小缺失值(Miss Value)
select case when exists (select * from dbo.T1 where keycol=1) then 1
else (select min(keycol)-1 from dbo.T1 a1 where not exists (select * from dbo.T1 a2 where a1.keycol=a2.keycol+1)) end

逆向逻辑在关系除法中的应用
正向: 返回由所有USA雇员为其处理过订单的客户
逆向: 返回没有USA雇员不为其处理过订单的客户

select Custid
from Sales.Customers c
where not exists(select * from HR.Employees e where e.country=‘USA’ and
not exists(select * from Sales.Orders o where o.empid=e.empid and o.custid=c.custid))

返回客户
对于该客户不能找到
任何雇员
对于这个雇员不能找到
任何订单
由这个客户下的
并且由这个雇员处理

6.2 表表达式
6.2.1 派生表

6.2.2 公用表达式(CTE)
CTE可以多引用，以下为例，可以比较每一年和上一年的数量差异
with YearlyCount
as
(
select year(orderdate) as orderyear,
count(distinct custid) as numcounts
from Sales.Orders
group by year(orderdate)
)
select cur.orderyear,cur.numcounts as curnumcounts,
pre.numcounts as prenumcounts,
cur.numcounts-pre.numcounts as growth
from YearlyCount cur
left join YearlyCount pre on cur.orderyear=pre.orderyear+1

使用CTE修改数据
– Modifying data through CTEs
WITH CustsDupsRN AS
(
SELECT *,
ROW_NUMBER() OVER(PARTITION BY custid ORDER BY (SELECT 0)) AS rn
FROM Sales.CustomersDups
)
DELETE FROM CustsDupsRN
WHERE rn > 1;
GO

递归的CTE
–查找所有下属
with emps as
(
select empid,mgrid,firstname,lastname
from hr.Employees
where empid=5
union all
select b.empid,b.mgrid,b.firstname,b.lastname
from emps a
inner join HR.Employees b on b.mgrid=a.empid
)
select * from emps;

–查找上级汇报关系
with mgrs as
(
select empid,mgrid,firstname,lastname
from HR.Employees
where empid=9
union all
select b.empid,b.mgrid,b.firstname,b.lastname
from mgrs a
inner join HR.Employees b on a.mgrid=b.empid
)
select * from mgrs;

6.3 分析排名函数
6.3.1 row_number 方法计算行号
–order by必须要有，可以order by (select 0)
select empid,qty,
ROW_NUMBER() over(order by qty) as rownum,
ROW_NUMBER() over(order by qty,empid) as d_rownum
from dbo.Sales
–可以使用Partition by 子句进行分区，在每个分区分别计算行号
–创建最优化索引的原则是在分区列，排序列，包含列(覆盖列)
select mgrid,empid,qty,
ROW_NUMBER() over(partition by mgrid order by qty,empid) as rownum
from dbo.Sales
order by mgrid,rownum

传统方法计算行号，适用于行数较小的情况
–排序列唯一时
select a.empid,a.qty,
(select count(*) from dbo.Sales b
where a.empid>=b.empid )
from dbo.Sales a
order by a.empid;

–排序列不唯一时，带有决胜属性
select a.empid,a.qty,
(select count(*)+1 from dbo.Sales b
where a.qty>b.qty or (a.qty=b.qty and a.empid>=b.empid)) as rownum
from dbo.Sales a
order by rownum;

–分区状态下的实现
select a.mgrid,a.empid,qty,
(select count(*) from dbo.Sales b
where b.mgrid=a.mgrid and (b.qty<a.qty or (b.qty=a.qty and b.empid<=a.empid))) as rowNum
from dbo.Sales a

6.3.2 排名与密集排名
Rank和Dense_Rank
select empid,qty, RANK() over(order by qty) as rnk,
DENSE_RANK() over(order by qty) as drnk
from dbo.Sales

–子查询实现方式
select empid,qty,
(select count(*) from dbo.Sales s2 where s2.qty<s1.qty)+1 as rnk,
(select count(distinct qty) from dbo.Sales s2 where s2.qty<s1.qty)+1 as drnk
from dbo.Sales s1
order by qty;

6.3.2 组号
内建NTILE函数，该函数接受组数(要分几组)作为参数，按排序列表中指定的排序条件，将数据分组。
用行数除以组数，整除得到的值为待分的组数，前余数组没组多一个值。
with TLE as
(select empid,qty,
NTILE(3) over(order by qty,empid) as tile
from dbo.Sales s1)
select tile,min(qty) as low,max(qty) as hgh
from TLE
group by tile;

6.4 数字辅助表
–使用倍增的Insert循环创建Nums表
SET NOCOUNT ON;
USE InsideTSQL2008;
IF OBJECT_ID(‘dbo.Nums’) IS NOT NULL DROP TABLE dbo.Nums;

CREATE TABLE dbo.Nums(n INT NOT NULL PRIMARY KEY);
DECLARE @max AS INT, @rc AS INT;
SET @max = 1000000;
SET @rc = 1;

INSERT INTO Nums VALUES(1);
WHILE @rc * 2 <= @max
BEGIN
INSERT INTO dbo.Nums SELECT n + @rc FROM dbo.Nums;
SET @rc = @rc * 2;
END

INSERT INTO dbo.Nums
SELECT n + @rc FROM dbo.Nums WHERE n + @rc <= @max;
GO

–通过自定义函数生成数字辅助表
IF OBJECT_ID(‘dbo.GetNums’) is not null
DROP FUNCTION dbo.GetNums;
GO

–create function 前必须是一个批处理结束，有Go
–返回表值函数语法如下，
CREATE FUNCTION dbo.GetNums(@n as bigint) RETURNS TABLE
as
return
with
L0 as (select 1 as n union all select 1),
L1 as (select a.n from L0 as a cross join L0 as b),
L2 as (select a.n from L1 as a cross join L1 as b),
L3 as (select a.n from L2 as a cross join L2 as b),
L4 as (select a.n from L3 as a cross join L3 as b),
L5 as (select a.n from L4 as a cross join L4 as b),
Num as (select ROW_NUMBER() over(order by (select 0)) as n from L5)
select n from Num where n<=@n
go

select * from dbo.GetNums(100)

6.5 缺失范围和孤岛

6.5.1 缺失范围(间断)
缺失解决方法1: 子查询
select a.seqval+1 as startPoints,
(select min(seqval)-1 from dbo.NumSeq b
where b.seqval>a.seqval) as endPonits
from dbo.NumSeq a
where not exists(select * from dbo.NumSeq b where b.seqval=a.seqval+1)
and a.seqval<>(select max(seqval) from dbo.NumSeq)

–子查询解决取值不唯一序列的缺失问题
select a.seqval+1 as startPoints,
(select min(seqval)-1 from dbo.NumSeq b
where b.seqval>a.seqval) as endPonits
from (select distinct seqval from dbo.NumSeqDups) a
where not exists(select * from dbo.NumSeq b where b.seqval=a.seqval+1)
and a.seqval<>(select max(seqval) from dbo.NumSeq)

–子查询解决时间序列缺失问题
select DATEADD(HOUR,4,a.seqval) as startPoints,
(select dateadd(hour,-4,min(seqval)) from dbo.TempSeq b
where b.seqval>a.seqval)
from dbo.TempSeq a
where not exists(select * from dbo.TempSeq b where b.seqval=dateadd(hour,4,a.seqval))
and a.seqval<>(select max(seqval) from dbo.TempSeq )

子查询解决缺失值范围的另一个思路：
1，对于每一个当前值，匹配其下一个值，生成当前值和下一个之的值对
2，对两个值对进行减法运算，保留相差大于1的值对
3，保留下的值对，开始数加1，结束数减1

利用数字辅助表，返回缺失值列表
select n from dbo.Nums
where n between (select min(seqval) from dbo.NumSeq)
and (select max(seqval) from dbo.NumSeq)
and n not in(select seqval from dbo.NumSeq)

6.5.2 现有范围(孤岛)
解决方案1，使用子查询和排名计算
with startP as (
select seqval,ROW_NUMBER() over(order by (select 0)) as rnt
from dbo.NumSeq a
where not exists(select * from dbo.NumSeq b where b.seqval=a.seqval-1)
),
endP as (
select seqval,ROW_NUMBER() over(order by (select 0)) as rnt
from dbo.NumSeq a
where not exists(select * from dbo.NumSeq b where b.seqval=a.seqval+1)
)
select a.seqval as startPoint,b.seqval as endPoint
from startP a
inner join endP b on a.rnt=b.rnt

解决方案2 基于子查询的组标识符(分组因子)
该解决方案适用于数据较少的情况
计算分组因子可以转化成计算一组连续值的最小值或者最大值，例如{11,12,13},设法为每个数计算出这组数的最大值是13，即可作为分组因子。
计算一组连续数中最大值的原理是，返回大于等于当前值，且后面是一个间断的最小值
select a.seqval,
(select min(seqval) from dbo.NumSeq b
where b.seqval>=a.seqval and not exists(select * from dbo.NumSeq c where c.seqval=b.seqval+1))
from dbo.NumSeq a

完整解决方案如下:
with C as (select a.seqval,
(select min(seqval) from dbo.NumSeq b
where b.seqval>=a.seqval and not exists(select * from dbo.NumSeq c where c.seqval=b.seqval+1)) factor
from dbo.NumSeq a
)
select min(seqval) as startPoint, max(seqval) as endPoint
from C
group by factor

解决方案3 基于排名计算的组标识符(分组因子)
常规计算行号，用数据值减去行号，即可得到分组因子
With C as (
select seqval,seqval- ROW_NUMBER() over(order by seqval) as GroupFactor
from dbo.NumSeq )
select min(seqval) as startPoint, max(seqval) as endPoint
from C
group by GroupFactor

对于时间序列的解决办法，不应该计算两个序列之间的差值，应该从每个时间序列值中减去和行号一样多个时间单位的时间间隔。
with C as (
select seqval,dateadd(hour,-4*ROW_NUMBER() over(order by seqval) ,seqval) as GroupFactor
from dbo.TempSeq)
select min(seqval) as startPoint ,max(seqval) as endPoint
from C
group by GroupFactor