【SQL】MaxComputer常用SQL与注意小结

MaxComputer常用SQL

1.建表

CREATE TABLE dwd_tfc_ctl_signal_phasedir (
    cust_inter_id STRING COMMENT '客户路口ID',
    inter_id STRING COMMENT '高德路口Id',
    inter_name STRING COMMENT '高德路口名称',
    phase_plan_no STRING COMMENT '方案号',
    phase_name STRING COMMENT '相位名称,相位阶段',
    cust_dir_name STRING COMMENT '通行方向,如东-直行',
    f_link_id STRING COMMENT '进口道id',
    f_dir_4_no STRING COMMENT '方向的4方位编码,详见编码表',
    f_dir_8_no STRING COMMENT '方向的8方位编码,详见编码表',
    turn_dir_no STRING COMMENT '转向,详见编码表',
    modified_date STRING COMMENT '修改时间',
    source STRING COMMENT '数据来源,SCATS或者人工录入等'
)partitioned by (data_version STRING COMMENT 'data_version',
    adcode STRING COMMENT '城市编码')
LIFECYCLE 100000;

 插入数据(注意分区表与非分区表的写法):

INSERT OVERWRITE|INTO TABLE tablename [PARTITION (partcol1=val1, partcol2=val2 ...)] [(col1,col2 ...)]
select_statement
FROM from_statement;

   【其他建表方式】:

  通过create...table...as创建:--AS翻译过来是依照

create table sale_detail_ctas1 as
select * from sale_detail;
 此处sale_detail是一张分区表,而通过 create tableas select…语句创建的表不会复制分区属性,
只会把源表的分区列作为目标表的一般列处理,

即sale_detail_ctas1是一个含有5列的非分区表。

  希望复制表结构:LIKE:翻译过来是像,就是真的像了,结构一模一样!

create table sale_detail_like like sale_detail;
此时,sale_detail_like的表结构与sale_detail完全相同。除生命周期属性外,列名、列注释以及表注释等均相同。
但sale_detail中的数据不会被复制到sale_detail_like表中。

2.CASE..WHEN

case
when (_condition1) then result1
when (_condition2) then result2
when (_condition3) then result3
...
else resultn
end

  示例:(对应编程语句中的if/else),支持两种格式的CASE...WHEN,详见:这里

select
case
when shop_name is null then 'default_region'
when shop_name like 'hang%' then 'zj_region'
end as region
from sale_detail;

  要在SQL语句中做简单字典转换时,也是通过子查询利用CASE WHEN先做一层转换的,例如性别为1 2与性别为男 女的比较:

SELECT COUNT(*)
FROM (SELECT t.identity_card
    , CASE t.gender
        
        WHEN '1' THEN ''
        ELSE ''
    END AS gender
    FROM t) a 
LEFT OUTER JOIN t b 
ON a.idcard = b.sfhm
WHERE a.gender<> b.xb;

 

3.改变时间格式——TO_CHAR

  语法:有些情况可以再给参数date再包一层to_date进行转换(常用):

to_char(TO_DATE('20190402', 'yyyymmdd'), 'yyyy-mm-dd')

 

 

 

string to_char(datetime date, string format)

  若输入为 string 类型会隐式
  转换为 datetime 类型后参与运算

  示例:

to_char('2010-12-03 00:00:00', ' 阿里金融 yyyy-mm*dd') = ' 阿里金融 2010-12*03' 
to_char('2008-07-18 00:00:00', 'yyyymmdd') = '20080718'
--获取指定格式的当前时间:
  to_char(getdate(),'yyyymmddhhmiss')

 4.查看分区

ls partitions tableName
list partitions tableName

 5.抽取T-1数据

  --抽取ODS T-1的增量数据
insert overwrite table dwd_wp_jj_road_place_delta partition (fq_day='${bizday}') 
select /*+mapjoin(t1,t2,t3,t4,t5,t6,t7,t8)*/
      t.jkdbh as jkdbh,--监控点编号(按ga408.3)
      t.mc as mc,--监控点名称
      t.sm as sm,--俗称
      t.dwdm as dwdm,--单位代码(前6位为行政区划)
      t.gxsj as gxsj,--更新时间(精确到天)
      t.js as js,--介绍
      t.bz as bz,--备注
      t.jllx as jllxdm,--记录类型(0正常,1禁用,2删除)代码
      null as jllx ,--记录类型(0正常,1禁用,2删除)
      t.jfbh as jfbh,--机房编号
      t.jkdlx as jkdlxdm,--监控点类型 0-省际 1-市际 2-其它代码
      null  as jkdlx ,--监控点类型 0-省际 1-市际 2-其它
      t.jkdjb as jkdjbdm,--监控点级别 0-一级 1-二级 2-三级代码
      null as jkdjb ,--监控点级别 0-一级 1-二级 2-三级
      t.flag as flagdm,--监控点图标状态 0-图标已存在 1-图标不存在代码
      null as flag ,--监控点图标状态 0-图标已存在 1-图标不存在
      t.jkdz as jkdztdm,--监控点故障状态 0-正常 1- 故障 2-未知代码
      null as jkdzt ,--监控点故障状态 0-正常 1- 故障 2-未知
      t.pyjx as pyjx,--拼音简写
      t.dym1 as dym1,--对应码1
      t.dym2 as dym2,--
      t.dym3 as dym3,--
      t.dym4 as dym4,--
      t.dym5 as dym5,--
      t.ddlx as ddlxdm,--01:单条道路点位 02:多条道路点位代码
      null as ddlx ,--01:单条道路点位 02:多条道路点位
      t.xzqh as xzqh,--行政区划
      t.jstz as jstz,--建设投资(1-自建 2-租用)
      t.dwlx as dwlx,--点位类别
      t.jrqgjcbk as jrqgjcbkdm,--接入全国缉查布控(0-需接入 1-不接入 2-已接入代码
      null as jrqgjcbk ,--接入全国缉查布控(0-需接入 1-不接入 2-已接入
      t.fxctq as fxctqdm,--非现场监控点停启专用 0启用 1停用代码
      null as fxctq ,--非现场监控点停启专用 0启用 1停用
      t.ssxm as ssxm,--监控点所属项目
      t.kms as kms,--千米数(监控点编号第6到9位)
      t.zms as zms,--米数(监控点编号第10到12位)
      t.pgis_jd as  pgis_jd, --pgis经度
      t.pgis_wd as  pgis_wd, --pgis纬度
      t.pgis_geohash as  pgis_geohash, --pgisgeohash
      substr(t.pgis_geohash, 1,4) as pgis_geohash4, --pgis geohash4位
      substr(t.pgis_geohash, 1,5) as pgis_geohash5, --pgis geohash5位
      substr(t.pgis_geohash, 1,6) as pgis_geohash6, --pgis geohash6位
      t.gd_jd as  gd_jd, --gd经度
      t.gd_wd as  gd_wd, --gd纬度
      dwd_sp_dev:geohash_encode(t.gd_jd, t.gd_wd) as gd_geohash,--gd geohash
      substr(dwd_sp_dev:geohash_encode(t.gd_jd, t.gd_wd),1,4) as gd_geohash4, --gd geohash4位
      substr(dwd_sp_dev:geohash_encode(t.gd_jd, t.gd_wd),1,5) as gd_geohash5, --gd geohash5位
      substr(dwd_sp_dev:geohash_encode(t.gd_jd, t.gd_wd),1,6) as gd_geohash6, --gd geohash6位
      to_char(getdate(),'yyyymmddhhmiss') as dwd_loadtime, --DWD新增时间
      to_char(getdate(),'yyyymmddhhmiss') as dwd_updatetime, --DWD更新时间
      '1' as dwd_yxbz --DWD有效标志
from(
      select 
            dwd_sp_dev:bcconvert(t0.jkdbh) as jkdbh,--监控点编号(按ga408.3)
            dwd_sp_dev:bcconvert(t0.mc) as mc,--监控点名称
            dwd_sp_dev:bcconvert(t0.sm) as sm,--俗称
            dwd_sp_dev:bcconvert(t0.dwdm) as dwdm,--单位代码(前6位为行政区划)
            dwd_sp_dev:bcconvert(t0.gxsj) as gxsj,--更新时间(精确到天)
            dwd_sp_dev:bcconvert(t0.js) as js,--介绍
            dwd_sp_dev:bcconvert(t0.bz) as bz,--备注
            dwd_sp_dev:bcconvert(t0.jllx) as jllx,--记录类型(0正常,1禁用,2删除)
            dwd_sp_dev:bcconvert(t0.jfbh) as jfbh,--机房编号
            dwd_sp_dev:bcconvert(t0.jkdlx) as jkdlx,--监控点类型 0-省际 1-市际 2-其它
            dwd_sp_dev:bcconvert(t0.jkdjb) as jkdjb,--监控点级别 0-一级 1-二级 2-三级
            dwd_sp_dev:bcconvert(t0.flag) as flag,--监控点图标状态 0-图标已存在 1-图标不存在
            dwd_sp_dev:bcconvert(t0.jkdzt) as jkdzt,--监控点故障状态 0-正常 1- 故障 2-未知
            dwd_sp_dev:bcconvert(t0.pyjx) as pyjx,--拼音简写
            dwd_sp_dev:bcconvert(t0.dym1) as dym1,--对应码1
            dwd_sp_dev:bcconvert(t0.dym2) as dym2,--
            dwd_sp_dev:bcconvert(t0.dym3) as dym3,--
            dwd_sp_dev:bcconvert(t0.dym4) as dym4,--
            dwd_sp_dev:bcconvert(t0.dym5) as dym5,--
            dwd_sp_dev:bcconvert(t0.ddlx) as ddlx,--01:单条道路点位 02:多条道路点位
            dwd_sp_dev:bcconvert(t0.xzqh) as xzqh,--行政区划
            dwd_sp_dev:bcconvert(t0.jstz) as jstz,--建设投资(1-自建 2-租用)
            dwd_sp_dev:bcconvert(t0.dwlx) as dwlx,--点位类别
            dwd_sp_dev:bcconvert(t0.jrqgjcbk) as jrqgjcbk,--接入全国缉查布控(0-需接入 1-不接入 2-已接入
            dwd_sp_dev:bcconvert(t0.fxctq) as fxctq,--非现场监控点停启专用 0启用 1停用
            dwd_sp_dev:bcconvert(t0.ssxm) as ssxm,--监控点所属项目
            dwd_sp_dev:bcconvert(t0.kms) as kms,--千米数(监控点编号第6到9位)
            dwd_sp_dev:bcconvert(t0.zms) as zms,--米数(监控点编号第10到12位)
            dwd_sp_dev:bcconvert(t0.jd) as pgis_jd, --pgis经度
            dwd_sp_dev:bcconvert(t0.wd) as pgis_wd, --pgis纬度
            dwd_sp_dev:geohash_encode(cast(dwd_sp_dev:bcconvert(t0.jd) as double),cast(dwd_sp_dev:bcconvert(t0.wd) as double)) as pgis_geohash, --pgis geohash
            get_gd_jd(cast(dwd_sp_dev:bcconvert(t0.wd) as double),cast(dwd_sp_dev:bcconvert(t0.jd) as double)) as gd_jd, -- 高德经度
            get_gd_wd(cast(dwd_sp_dev:bcconvert(t0.wd) as double),cast(dwd_sp_dev:bcconvert(t0.jd) as double)) as gd_wd, -- 高德纬度
            row_number() over (partition by t0.jkdbh order by  desc) as rn 
      from ods_wp_jj_road_place_dd t0 
      where fq_day='${bizday}'
      ) t
where t.rn = 1
SQL

 6.更新删除数据 

a)如果需要更新(UPDATE)数据:

只能把源分区/表数据导入到新分区/表,在导入过程中执行相应的更新逻辑。新分区/表可以与源相同,即就地更新;

b) 如果需要删除(DELETE)的数据:

可以通过删除“DROPTABLE table_name;”表达到数据删除目的;
非分区表可以通过 “TRUNCATE TABLE table_name;”语句清空表数据;
分区表可以通过“ALTER TABLE table_name DROP IF EXISTS PARTITION(分区名=‘具体分区值’)”删除分区达到删除整个分区数据的目的;
也可以通过INSERT+WHERE条件把需要的数据导入到另一张新分区/表中或就地更新,INSERT语句支持源和目的表是同一张表:
例如insert overwrite table sale_detail select * from sale_detail where name=”mengyonghui”。

7.多表JOIN (只支持等值连接)

  可以使用括号来区分JOIN

SELECT tmp.name, t3.id
FROM (
    SELECT t1.*, t2.*
    FROM t1
        JOIN t2 ON t1.tid = t2.qid
) tmp
    JOIN t3 ON tmp.tid = t3.id
----
SELECT *
FROM t1
    JOIN t2 ON t1.tid = t2.qid
    JOIN t3 ON t1.tid = t3.id

  当然我认为是和连续JOIN没差的,详情测试实验参考:https://blog.csdn.net/mccand1234/article/details/51734713

  【NOT IN的改造】

--正确写法                      
SELECT  *
FROM    dbo.Table_A AS a
WHERE   a.ID NOT IN ( SELECT    b.ID
                      FROM      dbo.Table_B AS b
                      WHERE     b.ID IS NOT NULL ) --排除NULL值参与运算符比较
                      
--建议修改为关联查询方法                                            
--正确写法1             
SELECT  *
FROM    dbo.Table_A AS a
WHERE   NOT EXISTS ( SELECT *
                     FROM   dbo.Table_B AS b
                     WHERE  a.ID = b.ID )
--正确写法2                     
SELECT  *
FROM    dbo.Table_A AS a
        LEFT OUTER JOIN dbo.Table_B AS b ON a.ID = b.ID
WHERE   b.ID IS NULL

 官网推荐写法:

select * from a where aid not in (select id from b)
改成如下语句:
select a.* from a left outer join (select distinct id from b) bb on a.id=bb.id where bb.id is null;

 

8.MAPJOIN左连接左表不能是小表 (基于map-reduce模型原理,使用 /* +MAPJOIN(u) */,且只有MAPJOIN支持笛卡尔积,其他不支持)

 

elect /*+MAPJOIN(smallTableTwo)*/ idOne, idTwo, value FROM
  ( select /*+MAPJOIN(smallTableOne)*/ idOne, idTwo, value FROM
    bigTable JOIN smallTableOne on (bigTable.idOne = smallTableOne.idOne)                                                  
  ) firstjoin                                                            
  JOIN                                                                 
  smallTableTwo ON (firstjoin.idTwo = smallTableTwo.idTwo)  

 

 

 

9.聚合函数和窗口函数不同点是聚合函数同一分组只产生一条记录,而窗口函数同一分组中每一行记录都产出一条记录

  要使用窗口函数的where条件,需要在外面嵌套一层子查询(where条件先于select查询)

  窗口函数简单应用(计数,并随即取一条数据):

 

SELECT t2.czrkxm
    , t2.czrkgmsfhm
    , t2.czrkjggj
    , t2.cnt
FROM (
    SELECT t1.czrkxm
        , t1.czrkgmsfhm
        , t1.czrkjggj
        , t1.cnt
        , ROW_NUMBER() OVER (PARTITION BY t1.czrkjggj ORDER BY cnt DESC) AS rn
    FROM (
        SELECT czrkxm
            , czrkgmsfhm
            , czrkjggj
            , COUNT(czrkjggj) OVER (PARTITION BY czrkjggj ORDER BY czrkxm) AS cnt
        FROM ods_gat_xx_czrk_qm
        WHERE trim(czrkjggj) <> '中国'
    ) t1
) t2
WHERE t2.rn = 1

 

10.ORDER BY必须和LIMIT一起使用,必须!(并且ORDER BY后面的字段需要和别名保持一致,原理参考SQL执行顺序先SELECT后ORDER BY)

11.UNION ALL不支持直接将两个顶级的子查询连接作为结果 (如果要使用,请在外面再嵌套一层子查询进行查询)

12.当CASE...WHEN作为参数时,后面END不能接AS别名,直接跟括号即可

13.topK思路 (查询频率最高的等)

  对每个IP访问最多的3个URL,步骤与思路如下:

    1.GROUP BY对每个IP分组,统计URL访问次数

    2.使用ROW_NUMBER()开窗降序计算行号(顺序不重复计算,RANK()为跳跃排序)

    3.查询出前3名URL

SELECT ip, count_url
FROM (
    SELECT ip, row_number() OVER (PARTITION BY ip ORDER BY count_url DESC) AS rn
    FROM (
        SELECT t1.ip, COUNT(t1.url) AS count_url
        FROM t1
        GROUP BY t1.ip
    ) t1
) t2
WHERE rn <= 3
View Code

  // 请务必注意子查询必须有别名!

14.UDF类型 (入参出参必须是对象类型,例如Long,而不是long)

15.杂项注意事项

   (1)数据表分区管理,防止数据的相互污染

  (2)SELECT需要的字段而不是SELECT *

  (3)谨慎使用ALTER关键字,尽量“谋定而后动”

  (4)注意数据产出时间,注意依赖关系

16.MaxComputer中的连接

  1.普通JOIN

    也就是我们常说的内连接,数学上的“交集”

SELECT a.name,b.age
FROM a
JOIN b
ON a.id = b.id

  图示:

    

  2.左连接

    包含左表的全集

SELECT a.name,b.age
FROM a
LEFT JOIN b
ON a.id = b.id

  图示:

    

   3.差集

    差集其实就是左集-交集,也就是左集连接中右表没有连接上的

SELECT a.name,b.age
FROM a
LEFT JOIN b
ON a.id = b.id
WHERE b.id IS NULL

  图示:

    

 17.采用row_number()进行开窗处理取最新一条记录/进行去重等

  如果使用SELECT *可能报错,根据错误信息筛选字段内容即可!

SELECT p.izid
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY t.id ORDER BY t.time_stamp DESC) AS rn
    FROM (
        SELECT *
        FROM t2
        UNION ALL
        SELECT *
        FROM t3
    ) t
) p
WHERE p.rn = 1

   还有例如按照长度分组,取一条示例数据(按其他分组同理)

 

SELECT *
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY t.len_sr ORDER BY t.sr DESC) AS rn
    FROM (
        SELECT sr
            , LENGTH(sr) AS len_sr
        FROM  t
    ) t
) p
WHERE p.rn = 1

 

 

 

18.实现类似MySQL的group_concat功能

SELECT b.jt_id
    , concat_ws(',', collect_set(a.zjhm)) AS jtcyxxlb --collect-set或者collect-list根据实际情况选择使用
FROM dws_family_relation b
GROUP BY b.jt_id

 19.实现两表联合,有冲突时取其中一表数据

WITH S1 AS(
    SELECT *, 'A' AS Flag  FROM Supply
), S2 AS(
    SELECT *, 'B' AS Flag FROM supplyCopy
), S3 AS(
    SELECT * FROM S1 UNION ALL
    SELECT * FROM S2
)

SELECT p.*
FROM (
    SELECT *
        , ROW_NUMBER() OVER (PARTITION BY S3.id ORDER BY S3.Flag ASC) AS rn
    FROM S3
) p
WHERE p.rn = 1
-------------------------
WITH S1 AS(
    SELECT *, 'A' AS Flag  FROM Supply
), S2 AS(
    SELECT *, 'B' AS Flag FROM supplyCopy
), S3 AS(
    SELECT * FROM S1 UNION ALL
    SELECT * FROM S2
), S4 AS(
    SELECT SUPPLYID, MIN(FLAG) AS minFlag FROM S3 GROUP BY supplyId
)

SELECT 
    S3.* FROM S3 JOIN S4 
ON 
    S3.supplyId = S4.supplyId AND S3.Flag = S4.minFlag

 20.JOIN OR条件的语法改写

select call_bill.bill_id,call_bill.caller,phonearea.province,phonearea.city ,phonearea.isp 
from call_bill left outer join phonearea
on (substr(call_bill .caller,1,7) = phonearea_test.prefix
or substr(call_bill .caller,1,3)=phonearea.code);
or substr(call_bill .caller,1,4)=phonearea.code; 

--------------------- 

select id,caller,province,city,isp 
from ( select bi.bill_id id,bi.caller caller,ph.province province,ph.city city,ph.isp isp 
            from  call_bill bi left outer join phonearea ph on substr(bi.caller,1,7) = ph.prefix 
        UNION ALL select bi.bill_id id,bi.caller caller,ph.province province,ph.city city,ph.isp isp 
                from  call_bill bi left outer join phonearea ph on substr(bi.caller,1,3)=ph.code 
        UNION ALL select bi.bill_id id,bi.caller caller,ph.province province,ph.city city,ph.isp isp 
                from  call_bill bi left outer join phonearea ph on substr(bi.caller,1,4)=ph.code
) tmp_tmp GROUP BY id,caller,called,province,city,isp;

   【子查询写法】

解决类似NOT IN带逻辑与的:

WHERE a.mfsfzh NOT IN (SELECT b.sfzhm
        FROM  b) 
    AND a.fsfzh NOT IN (SELECT c.sfzhm
        FROM  c);

 

SELECT *
FROM (
    SELECT *
    FROM a
    LEFT OUTER JOIN  b
    ON a.msfzh = b.sfzhm
    WHERE b.sfzhm IS NULL
) c
LEFT OUTER JOIN  d
ON c.fsfzh = d.sfzhm
WHERE d.sfzhm IS NULL

 21.UDTF的定义与使用

  注意lateral view使用场景(场景)

  参考:http://www.cnblogs.com/ggjucheng/archive/2013/02/01/2888819.html

22.插入少量测试数据

  仅支持INSERT INTO

drop table if exists srcp;
create table if not exists srcp (key string ,value bigint) partitioned by (p string);
insert into table srcp partition (p='abc') values ('a',1),('b',2),('c',3);

  //与MySQL相似的写法:VALUES

23.常用限制

  https://help.aliyun.com/document_detail/51823.html?spm=5176.11065259.1996646101.searchclickresult.3f024071jdaXT7

24.distinct

  对于多个字段与单个字段的distinct原理:

  DISTINCT 表示对后面的所有参数的拼接取 不重复的记录,相当于 把 SELECT 表达式的项 拼接起来选唯一值。

  参考原理:https://blog.csdn.net/u010003835/article/details/79154457

25.字段空值统计

  

SELECT SUM(CASE 
        WHEN XM IS NOT NULL THEN 0
        ELSE  1
    END) AS 姓名
    , SUM(CASE 
        WHEN SFZH IS NOT NULL THEN 0
        ELSE  1
    END) AS 身份证
    , SUM(CASE 
        WHEN ZZZH IS NOT NULL THEN 0
        ELSE  1
    END) AS 居住证号
FROM t

 

转载于:https://www.cnblogs.com/jiangbei/p/9332516.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值