Hive生成超级日期维度表

创建日期维表

--日粒度维表
drop table dim.dim_date;
CREATE TABLE IF NOT EXISTS dim.dim_date(
     `datetimestr`    STRING COMMENT '当前时间(yyyy-MM-dd HH:mm:ss)'
    ,`yearstr`        STRING COMMENT '当前年份'
    ,`quarterid`      BIGINT COMMENT '当前季度'
    ,`quarter_cn`     STRING COMMENT '当前季度中文'
    ,`monthstr`       STRING COMMENT '当前月份'
    ,`weekofyearid`   BIGINT COMMENT '当年第几周'
    ,`dayofyearid`    BIGINT COMMENT '当年第几天'
    ,`dayofmonthstr`  STRING COMMENT '当月日期(dd)'
    ,`dayofweekid`    BIGINT COMMENT '当前周几(0-6)'
    ,`dayofweek_cn`   STRING COMMENT '当前周中文'
    --,`hourstr`        STRING COMMENT '当前小时(HH)'
    --,`minutestr`      STRING COMMENT '当前分钟(mm)'
    ,`workday_flag`   BIGINT COMMENT '是否为工作日(1,是;0,不是)'
    ,`create_time`    STRING COMMENT '数据创建日期(yyyy-MM-dd HH:mm:ss.SSS)'
    )
COMMENT '日期粒度度维表'
--PARTITIONED BY(yearstr STRING COMMENT '当前年分区(yyyy)',monthstr STRING COMMENT '当前月分区(MM)')
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    NULL DEFINED AS ''
    STORED AS ORC
    TBLPROPERTIES('orc.compress' = 'SNAPPY','transactional'='false');
    
--日分钟粒度维表
drop table dim.dim_datetime;
CREATE TABLE IF NOT EXISTS dim.dim_datetime(
     `timestr`        STRING COMMENT '当前时间(HH:mm:ss)'
    ,`hourstr`        STRING COMMENT '当前小时(HH)'
    ,`minutestr`      STRING COMMENT '当前分钟(mm)'
    )
COMMENT '日分钟粒度维表'
ROW FORMAT DELIMITED
    FIELDS TERMINATED BY '\t'
    NULL DEFINED AS ''
    STORED AS ORC
    TBLPROPERTIES('orc.compress' = 'SNAPPY','transactional'='false');

生成数据

--生成日粒度维表数据
WITH datetime_t AS(
SELECT FROM_UNIXTIME(UNIX_TIMESTAMP('2000-01-01 00:00:00') + idx_t.pos*60) as datetimestr
FROM
       (
           SELECT  
               POSEXPLODE(
                   SPLIT(
                       REPEAT('o',CAST((UNIX_TIMESTAMP('2000-01-01 23:59:00') - UNIX_TIMESTAMP('2000-01-01 00:00:00'))/60 AS INT))
                       ,'o'
                   )
               )
       ) idx_t
),
date_t AS (
select date_add('2000-01-01',idx_t.pos) as datetimestr
from
       (
           select  
               posexplode(
                   split(
                       repeat('o',datediff(from_unixtime(unix_timestamp('2040-12-31 00:00:00'),'yyyy-MM-dd'), '2000-01-01'))
                       ,'o'
                   )
               )
       ) idx_t
),
tmp AS(
select
       datetimestr                                                    AS datetimestr
      ,substr(datetimestr,1,4)                                        AS yearstr
      ,CASE  WHEN MONTH(datetimestr) <= 3  then 1
             WHEN MONTH(datetimestr) <= 6  then 2
             WHEN MONTH(datetimestr) <= 9  then 3
             ELSE 4
       END                                                            AS quarterid
      ,substr(datetimestr,6,2)                                        AS monthstr
      ,WEEKOFYEAR(datetimestr)                                        AS weekofyearid
      ,DATEDIFF(datetimestr,CONCAT(YEAR(datetimestr),'-01-01')) + 1   AS dayofyearid
      ,substr(datetimestr,9,2)                                        AS dayofmonthstr
      ,PMOD(DATEDIFF(datetimestr, '2000-01-01')-1, 7)                 AS dayofweekid
      --,substr(datetimestr,12,2)                                       AS hourstr
      --,substr(datetimestr,15,2)                                       AS minutestr
FROM  date_t
)
INSERT OVERWRITE TABLE dim.dim_date --PARTITION(yearstr,monthstr)
SELECT
       datetimestr
      ,yearstr
      ,quarterid
      ,CASE  WHEN quarterid = 1   then '第一季度'
             WHEN quarterid = 2   then '第二季度'
             WHEN quarterid = 3   then '第三季度'
             WHEN quarterid = 4   then '第四季度'
       END                                                            AS quarter_cn
      ,monthstr
      ,weekofyearid
      ,dayofyearid
      ,dayofmonthstr
      ,dayofweekid
      ,CASE  WHEN dayofweekid = 0  then '星期日'
             WHEN dayofweekid = 1  then '星期一'
             WHEN dayofweekid = 2  then '星期二'
             WHEN dayofweekid = 3  then '星期三'
             WHEN dayofweekid = 4  then '星期四'
             WHEN dayofweekid = 5  then '星期五'
             WHEN dayofweekid = 6  then '星期六'
       END                                                            AS dayofweek_cn
      --,hourstr
      --,minutestr
      ,CASE  WHEN dayofweekid = 0 or dayofweekid = 6 then 0
             ELSE 1
       END                                                            AS workday_flag
      ,FROM_UTC_TIMESTAMP(CURRENT_TIMESTAMP,'GMT+8')                  AS create_time
      --,yearstr
      --,monthstr
FROM  tmp;

--生成分钟粒度维表数据
WITH datetime_t AS(
SELECT FROM_UNIXTIME(UNIX_TIMESTAMP('2000-01-01 00:00:00') + idx_t.pos*60) as datetimestr
FROM
       (
           SELECT  
               POSEXPLODE(
                   SPLIT(
                       REPEAT('o',CAST((UNIX_TIMESTAMP('2000-01-01 23:59:00') - UNIX_TIMESTAMP('2000-01-01 00:00:00'))/60 AS INT))
                       ,'o'
                   )
               )
       ) idx_t
),
tmp AS(
select
       substr(datetimestr,12)                                         AS timestr
      ,substr(datetimestr,12,2)                                       AS hourstr
      ,substr(datetimestr,15,2)                                       AS minutestr
FROM  datetime_t
)
INSERT OVERWRITE TABLE dim.dim_datetime
select 
       timestr
      ,hourstr
      ,minutestr
from  tmp 

变更节假日

变更节假日sql

INSERT INTO TABLE dim.dim_date 
SELECT 
        datetimestr  
       ,yearstr
       ,quarterid           
       ,quarter_cn   
       ,monthstr       
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,0 AS workday_flag --变更为非工作日
       --,1 AS workday_flag --变更为工作日
       ,FROM_UTC_TIMESTAMP(CURRENT_TIMESTAMP,'GMT+8')                 AS create_time
FROM   dim.dim_date 
where  yearstr='2000' and monthstr='01' and dayofmonthstr='31';

变更节假日脚本

变更节假日脚本(单个)

#/bin/bash env
#update_workday_flag.sh

USAGE="Usage: $0 [yearstr] [monthstr] [dayofmonthstr] [workday_flag]
       e.g:   $0 2000 01 01 0"

function main(){
    if [[ $# -lt 4 ]] ; then
        echo "${USAGE}";
        exit 1;
    fi
    
    SQL_UPDATE="INSERT INTO TABLE cdm.dim_date 
         SELECT 
             datetimestr  
            ,yearstr
            ,quarterid           
            ,quarter_cn   
            ,monthstr       
            ,weekofyearid
            ,dayofyearid 
            ,dayofmonthstr
            ,dayofweekid 
            ,dayofweek_cn
            ,${4} AS workday_flag
            ,FROM_UTC_TIMESTAMP(CURRENT_TIMESTAMP,'GMT+8')                 AS create_time
        FROM   cdm.dim_date 
        where  yearstr='${1}' and monthstr='${2}' and dayofmonthstr='${3}'"
    
    SQL_QUERY="SELECT 
         datetimestr  
        ,yearstr
        ,quarterid           
        ,quarter_cn   
        ,monthstr       
        ,weekofyearid
        ,dayofyearid 
        ,dayofmonthstr
        ,dayofweekid 
        ,dayofweek_cn
        ,workday_flag
        ,create_time
    FROM   cdm.dim_date 
    where  yearstr='${1}' and monthstr='${2}' and dayofmonthstr='${3}'"
    
    beeline --silent=true -n hive -p 123456 -u jdbc:hive2://hadoop01:10000/cdm  -e "${SQL_UPDATE};${SQL_QUERY}" 

}

main "$@"

变更节假日脚本(批量)

#/bin/bash env
#update_workday_flag_batch.sh

USAGE="agrs:  [yearstr] [monthstr] [dayofmonthstr] [workday_flag]
       e.g:   2000 01 01 0"

function generate_sql(){

    if [[ $# -lt 4 ]] ; then
        echo "Invalid arguments !!!"
        echo "${USAGE}";
        exit 1;
    fi
    
    SQL_UPDATE="INSERT INTO TABLE cdm.dim_date 
         SELECT 
             datetimestr  
            ,yearstr
            ,quarterid           
            ,quarter_cn   
            ,monthstr       
            ,weekofyearid
            ,dayofyearid 
            ,dayofmonthstr
            ,dayofweekid 
            ,dayofweek_cn
            ,${4} AS workday_flag
            ,FROM_UTC_TIMESTAMP(CURRENT_TIMESTAMP,'GMT+8')                 AS create_time
        FROM   cdm.dim_date 
        where  yearstr='${1}' and monthstr='${2}' and dayofmonthstr='${3}'"
    
    SQL_QUERY="SELECT 
         datetimestr  
        ,yearstr
        ,quarterid           
        ,quarter_cn   
        ,monthstr       
        ,weekofyearid
        ,dayofyearid 
        ,dayofmonthstr
        ,dayofweekid 
        ,dayofweek_cn
        ,workday_flag
        ,create_time
    FROM   cdm.dim_date 
    where  yearstr='${1}' and monthstr='${2}' and dayofmonthstr='${3}'"
}

for line in `cat days_*.csv`
do  
    OLD_IFS="${IFS}"
    IFS=","
    ARR=(${line})
    IFS="{OLD_IFS}"
    generate_sql ${ARR[@]}
    SQL="${SQL};${SQL_UPDATE};${SQL_QUERY}" 
done

beeline --silent=true -n hive -p 123456 -u jdbc:hive2://hadoop01:10000/cdm  -e "${SQL}" 

变更日期数据

tee days_2022.csv <<EOF
2022,01,03,0
2022,01,29,1
2022,01,30,1
2022,01,31,0
2022,02,01,0
2022,02,02,0
2022,02,03,0
2022,02,04,0
2022,04,02,1
2022,04,04,0
2022,04,05,0
2022,04,24,1
2022,05,02,0
2022,05,03,0
2022,05,04,0
2022,05,07,1
2022,06,03,0
2022,09,12,0
2022,10,03,0
2022,10,04,0
2022,10,05,0
2022,10,06,0
2022,10,07,0
2022,10,08,1
2022,10,09,1
EOF

使用日期维表

--使用维表(日期粒度)
WITH tmp AS(
SELECT 
        datetimestr
       ,yearstr
       ,quarterid   
       ,quarter_cn  
       ,monthstr
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,workday_flag
       ,DENSE_RANK() OVER(PARTITION BY yearstr,monthstr,dayofmonthstr ORDER BY create_time DESC) AS num
FROM   dim.dim_date
--WHERE yearstr='yyyy' and monthstr='MM'
WHERE yearstr='2000' and monthstr='01'
)
SELECT 
        datetimestr
       ,yearstr      
       ,quarterid   
       ,quarter_cn  
       ,monthstr     
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,workday_flag
FROM   tmp 
WHERE  num=1

--使用维表(小时粒度)
WITH tmp AS(
SELECT 
        datetimestr
       ,yearstr
       ,quarterid   
       ,quarter_cn  
       ,monthstr
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,workday_flag
       ,DENSE_RANK() OVER(PARTITION BY yearstr,monthstr,dayofmonthstr ORDER BY create_time DESC) AS num
FROM   dim.dim_date
--WHERE yearstr='yyyy' and monthstr='MM'
WHERE yearstr='2000' and monthstr='01'
),
date_t AS(
SELECT 
        datetimestr
       ,yearstr      
       ,quarterid   
       ,quarter_cn  
       ,monthstr     
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,workday_flag
FROM   tmp 
WHERE  num=1
),
datetime_t AS(
SELECT 
        hourstr
FROM   dim.dim_datetime 
group by hourstr
)
select 
        datetimestr
       ,yearstr      
       ,quarterid   
       ,quarter_cn  
       ,monthstr     
       ,b.weekofyearid
       ,b.dayofyearid
       ,b.dayofmonthstr
       ,b.dayofweekid
       ,b.dayofweek_cn
       ,a.hourstr
       ,b.workday_flag
from   datetime_t a,date_t b

--使用维表(分钟粒度)
WITH tmp AS(
SELECT 
        datetimestr
       ,yearstr
       ,quarterid   
       ,quarter_cn  
       ,monthstr
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,workday_flag
       ,DENSE_RANK() OVER(PARTITION BY yearstr,monthstr,dayofmonthstr ORDER BY create_time DESC) AS num
FROM   dim.dim_date
--WHERE yearstr='yyyy' and monthstr='MM'
WHERE yearstr='2000' and monthstr='01'
),
date_t AS(
SELECT 
        datetimestr
       ,yearstr      
       ,quarterid   
       ,quarter_cn  
       ,monthstr     
       ,weekofyearid
       ,dayofyearid 
       ,dayofmonthstr
       ,dayofweekid 
       ,dayofweek_cn
       ,workday_flag
FROM   tmp 
WHERE  num=1
)
select 
        concat(b.datetimestr," ",a.timestr)  AS datetimestr
       ,b.yearstr      
       ,b.quarterid   
       ,b.quarter_cn  
       ,b.monthstr     
       ,b.weekofyearid
       ,b.dayofyearid
       ,b.dayofmonthstr
       ,b.dayofweekid
       ,b.dayofweek_cn
       ,a.hourstr
       ,a.minutestr
       ,b.workday_flag
from   dim.dim_datetime a,date_t b
  • 6
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
Hive中,你可以使用内置的日期函数和一些SQL技巧来生成连续的日期。以下是一些方法: 1. 使用序列生成器和日期函数 首先,你可以使用Hive中的序列生成器来生成一系列数字,然后使用日期函数将它们转换为日期。例如,要生成从2021年1月1日到2021年1月31日的日期,可以使用以下查询: ``` SELECT date_add('2021-01-01', seq) as date FROM (SELECT posexplode(split(space(31), ' '))) as s(seq, x) ``` 这将生成一个包含所有日期。 2. 使用LATERAL VIEW和日期函数 另一种方法是使用LATERAL VIEW和日期函数来生成连续的日期。例如,要生成从2021年1月1日到2021年1月31日的日期,可以使用以下查询: ``` SELECT date_add('2021-01-01', d) as date FROM (SELECT posexplode(split(space(31), ' '))) as s(seq, x) LATERAL VIEW explode(array(seq)) e as d ``` 这将生成一个与上面相同的包含所有日期。 3. 使用CTE和日期函数 最后,你还可以使用公共达式 (CTE) 和日期函数来生成连续的日期。例如,要生成从2021年1月1日到2021年1月31日的日期,可以使用以下查询: ``` WITH dates AS ( SELECT date_add('2021-01-01', n) AS date FROM (SELECT posexplode(split(space(31), ' '))) AS s(seq, x) LATERAL VIEW explode(array(seq)) e AS n ) SELECT * FROM dates ``` 这将生成一个与前两个查询相同的包含所有日期。 总之,以上是三种在Hive生成连续日期的方法。你可以根据需要选择其中一种。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值