日期维度表
在mysql中建表并更新节假日信息,然后再导到hive上
建表
create table dim_date1(
datekey varchar(10),
dateymd varchar(10),
datech varchar(20),
dimyear varchar(10),
quarterid int COMMENT '季度数字',
quartercH varchar(10),
yearmonth varchar(10),
dimmonth varchar(10),
dimday varchar(10) COMMENT '一年中的第几天',
dimmonthday varchar(10) comment '一个月中的第几天',
dimweek varchar(10) COMMENT '一年中的第几周 星期一是星期的第一天',
yearweek varchar(10) COMMENT '年周',
dimweek1 varchar(10) COMMENT '一年中的第几周 星期天是星期的第一天',
dimyear1 varchar(10) COMMENT '对应dimweek1的年份',
dimweekday varchar(10) comment '一周的中的第几天',
weekdaych varchar(10) COMMENT '星期几',
holidayname varchar(10) COMMENT '节假日名称',
isdayoff int COMMENT '是否放假 1表示放假,0表示不放假 2表示调班'
#mysql5版本的数据库默认charset是latin1 不改的话存储过程插入中文字段会报错
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
存储过程初始化日期维度
drop procedure if exists f_dim_day;
create procedure f_dim_day(in start_date VARCHAR(20),in end_date VARCHAR(20))
begin
declare i int;
set i=0;
DELETE from dim_date1;
while DATE_FORMAT(start_date,'%Y-%m-%d %H:%i:%s') < DATE_FORMAT(end_date,'%Y-%m-%d %H:%i:%s') do
INSERT into dim_date1
(datekey,dateymd ,datech,dimyear,quarterid,quarterch ,yearmonth,dimmonth,dimday,dimmonthday,dimweek,yearweek,dimyear1,dimweek1,dimweekday,weekdaych,holidayname,isdayoff)
SELECT
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y%m%d') datekey,
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y-%m-%d') dateymd,
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y年%m月%d日') datech,
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y') dimyear,
quarter(STR_TO_DATE( start_date,'%Y-%m-%d %H:%i:%s')) quarterid,
CONCAT(quarter(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s')),'季度') quarterch,
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y%m') yearmonth,
CONVERT(DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%m'),
SIGNED) dimmonth,
DAYOFYEAR(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s')) dimday,
CONVERT( DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%d'
),SIGNED) dimmonthday,
CONVERT(DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%u'),
SIGNED) dimweek,
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y%u') yearweek,
case CONVERT(DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%U'),SIGNED)+1
when 53 then
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y')+1
else
DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%Y')
end
dimyear1,
case CONVERT(DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%U'),SIGNED)+1
when 53 then
1
else
CONVERT(DATE_FORMAT(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),'%U'),SIGNED)+1
end
dimweek1,
DAYOFWEEK(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s')) dimweekday,
case DAYOFWEEK(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s')) when 1 then '星期日' when 2 then '星期一' when 3 then '星期二' when 4 then '星期三' when 5 then '星期四' when 6 then '星期五' when 7 then '星期六' end weekdaych,
'' holidayname,
0 isdayoff
from dual;
set i=i+1;
set start_date = DATE_FORMAT(date_add(STR_TO_DATE(start_date,'%Y-%m-%d %H:%i:%s'),interval 1 day),'%Y-%m-%d');
end while;
end;
#调用存储过程
call f_dim_day('2018-01-01 00:00:00','2024-01-02 00:00:00');
节假日放假信息更新
研究百度日历的请求,得到日历url返回的json,(以前没有暴露这个url还得通过selenium去模拟请求, 网上有的日历接口也不那么友好),对我们有用的是以下字段
python爬取
# -*- coding: utf-8 -*-
# @Time : 2022/5/6 19:50
# @Author : bailin
# @Description: 获取百度上的日历信息更新日期维度表
import json
import requests
import pymysql
# 插入数据库
CONN = pymysql.connect("localhost", "root", "gogobi", "bidm", charset='utf8')
def insertDB(sql):
cursor = CONN.cursor()
try:
cursor.execute(sql)
CONN.commit()
print(sql)
except Exception as e:
print(sql)
print(e)
#获取日历上每天的节假日信息
def getMonthDayInfo(url):
html = requests.get(url).text
#print(html)
# 转化成字典
str1 = html.replace('/**/op_aladdin_callback1651906546123(', '').replace(');', '')
#字符串转字典
dict_test = json.loads(str1)
for dayinfs in dict_test['data']:
for dayinf in dayinfs["almanac"]:
#拼接update的sql
update_sql ="update dim_date1 set holidayname ='"+try_except(dayinf,"term")+"', isdayoff='"+ try_except(dayinf, "status") \
+"' where dimyear="+try_except(dayinf,"year")+" and dimmonth="+try_except(dayinf,"month")+" and dimmonthday="+try_except(dayinf,"day");
insertDB(update_sql)
#处理字典中没有索引的值 防止程序报错
def try_except(dict,index):
try:
return str(dict[index])
except Exception as e:
if index=="status":
return '0'
else:
return ''
if __name__ == '__main__':
#月份 年份在url里手动改
counter = 1
while counter <= 12:
url = "https://sp1.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?tn=wisetpl&format=json&resource_id=39043&query=2022年"+str(counter)+"月&t=1651906546123&cb=op_aladdin_callback1651906546123"
getMonthDayInfo(url)
counter += 1
#将不调班的周末置位放假
update dim_date1 set isdayoff = 1 where weekdaych in ('星期六','星期天' ) and isdayoff <>2;
验证
转存到数仓
create external table if not exists dim_date(
datekey string,
dateymd string,
datech string,
dimyear string,
quarterid int COMMENT '季度数字',
quartercH string,
yearmonth string,
dimmonth string,
dimday string COMMENT '一年中的第几天',
dimmonthday string comment '一个月中的第几天',
dimweek string COMMENT '一年中的第几周 星期一是星期的第一天',
yearweek string COMMENT '年周',
dimweek1 string COMMENT '一年中的第几周 星期天是星期的第一天',
dimyear1 string COMMENT '对应dimweek1的年份',
dimweekday string comment '一周的中的第几天',
weekdaych string COMMENT '星期几',
holidayname string COMMENT '节假日名称',
isdayoff int COMMENT '是否放假 1表示放假,0表示不放假 2表示调班'
)row format delimited fields terminated by '\t'
stored as textfile;
sqoop导入
sqoop import \
--connect jdbc:mysql://hadoop102:3306/bidm \
--username root \
--password gogobi \
--table dim_date1 \
--num-mappers 1 \
--hive-import \
--fields-terminated-by "\t" \
--hive-overwrite \
--hive-table myhive.dim_date