公司最近要开始数仓建设,那数仓的一些基础维度表要先生成,这里介绍最重要的维度表之一日期维度表的生成。整个过程分三步走:
- 使用java8的日期类生成日期维度表所需要的各种字段;
- 将生成的记录使用springboot+mybatis批量插入到mysql中;
- 使用sqoop将mysql表中的数据导入到hive中;
一、mysql中设计日期维度表
CREATE TABLE `dim_date` (
`date_id` int(8) DEFAULT NULL COMMENT '日期数字格式,如:20200101',
`date` varchar(20) CHARACTER SET utf8 DEFAULT NULL COMMENT '日期字符串格式,如:2020-01-01',
`week` int(4) DEFAULT NULL COMMENT '周几的数字,如:1,2,3,4,5,6,7',
`week_cn` varchar(20) DEFAULT NULL COMMENT '周几的中文表示,如:星期一',
`year_week` varchar(20) DEFAULT NULL COMMENT '年周组合,如:2020-34',
`month` int(4) DEFAULT NULL COMMENT '月份数字,如:1,2...12',
`year_month` varchar(20) DEFAULT NULL COMMENT '年月组合,如:2020-01',
`quarter` int(4) DEFAULT NULL COMMENT '季度数字,如:1,2,3,4',
`year_quarter` varchar(20) DEFAULT NULL COMMENT '年季度组合,如:2020-Q1',
`year` int(4) DEFAULT NULL COMMENT '年数字,如:2020',
`day_in_year` int(4) DEFAULT NULL COMMENT '一年中的第几天,如:365',
`day_in_month` int(4) DEFAULT NULL COMMENT '一月中的第几天,如:28',
`is_first_day_in_month` varchar(20) DEFAULT NULL COMMENT '是否是月的第一天(Y:是;N:否)',
`is_last_day_in_month` varchar(20) DEFAULT NULL COMMENT '是否是月的最后一天(Y:是;N:否)',
`week_in_month` int(4) DEFAULT NULL COMMENT '月的第几周,如:1',
`week_in_year` int(4) DEFAULT NULL COMMENT '年的第几周,如:30',
`date_type` varchar(20) DEFAULT NULL COMMENT '日期类型:工作日:WORKDAY;国家法定节假日:HOLIDAY;周末休息日:WEEKEND;节假日调休:EXCHANGING_HOLIDAY'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='日期维度表';
二、编写Java代码
1、编写日期类实体
public class DimDate {
private Integer dateId;
private String date;
private Integer week;
private String weekCn;
private String yearWeek;
private Integer month;
private String yearMonth;
private Integer quarter;
private String yearQuarter;
private Integer year;
private Integer dayInYear;
private Integer dayInMonth;
private String isFirstDayInMonth;
private String isLastDayInMonth;
private Integer weekInMonth;
private Integer weekInYear;
private String dateType;
}
2、生成日期类核心代码
代码中调用了第三方的节假日API接口,以获取节假日信息。
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.zstax.domain.DimDate;
import com.zstax.enums.DateTypeEnum;
import org.apache.commons.lang3.StringUtils;
import java.time.DayOfWeek;
import java.time.LocalDate;
import java.time.Period;
import java.time.format.DateTimeFormatter;
import java.time.temporal.TemporalAdjusters;
import java.time.temporal.WeekFields;
import java.util.*;
/**
* @author: ndf
* @date: 2021/5/20 11:36
* @description:
*/
public class GenerateDimDate {
private static Map<Integer, String> weekCNMap = new HashMap<>();
private static String API_URL = "http://api.tianapi.com/txapi/jiejiari/index";
static {
weekCNMap.put(1, "星期一");
weekCNMap.put(2, "星期二");
weekCNMap.put(3, "星期三");
weekCNMap.put(4, "星期四");
weekCNMap.put(5, "星期五");
weekCNMap.put(6, "星期六");
weekCNMap.put(7, "星期日");
}
/**
* 根据开始日期和结束日期返回dimDate实体列表
*
* @param startDate 开始日期格式:yyyy-MM-dd
* @param endDate 结束日期格式:yyyy-MM-dd
* @return
* @throws Exception
*/
public static List<DimDate> getDimDateListByRange(String startDate, String endDate) throws Exception {
if (StringUtils.isBlank(startDate) || StringUtils.isBlank(endDate)) {
throw new Exception("开始日期和结束日期不能为空!");
}
String[] startDateArr = startDate.split("-");
String[] endDateArr = endDate.split("-");
if (startDate.length() != 3 && endDate.length() != 3) {
throw new Exception("开始日期或结束日期格式不正确!");
}
if (DateUtils.getDistanceBetweenDates(startDate, endDate) < 0) {
throw new Exception("开始日期不能大于结束日期!");
}
int startYear = Integer.parseInt(startDateArr[0]);
int endYear = Integer.parseInt(endDateArr[0]);
if (startYear != endYear) {
throw new Exception("开始日期和结束日期不在同一个年份!");
}
Map<String, List<String>> holidayDatesMap = getHolidayDates(startYear);
return getDimDateListByRange(startDate, endDate, holidayDatesMap);
}
/**
* 通过年份获取一整年的数据
*
* @param year
* @return
*/
public static List<DimDate> getDimDateListByYear(int year) throws Exception {
int nowYear = LocalDate.now().getYear();
if (year > nowYear) {
throw new Exception("无法获取当前年份之后的节假日信息,故不能传入大于当前年的年份!");
}
Map<String, List<String>> holidayDatesMap = getHolidayDates(year);
String startDate = DateUtils.getFirstDayOfYear(year);
String endDate = DateUtils.getLastDayOfYear(year);
return getDimDateListByRange(startDate, endDate, holidayDatesMap);
}
/**
* 处理区间日期逻辑
*
* @param startDate
* @param endDate
* @param holidaysMap
* @return
*/
private static List<DimDate> getDimDateListByRange(String startDate, String endDate, Map<String, List<String>> holidaysMap) {
String currentDate = startDate;
List<DimDate> dimDates = new ArrayList<>();
while (DateUtils.getDistanceBetweenDates(currentDate, endDate) >= 0) {
DimDate dimDate = getDimDate(currentDate, holidaysMap);
dimDates.add(dimDate);
currentDate = DateUtils.getAfterDays(currentDate, 1);
}
dimDates.sort(Comparator.comparing(DimDate::getDateId));
return dimDates;
}
/**
* 获取dimDate实体
*
* @param date 将要处理的日期
* @param holidaysMap 节假日信息Map
* @return
*/
public static DimDate getDimDate(String date, Map<String, List<String>> holidaysMap) {
Integer dateId = Integer.parseInt(date.replace("-", ""));
DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd");
LocalDate ld = LocalDate.parse(date, dtf);
int week = ld.getDayOfWeek().getValue();
int dayInMonth = ld.getDayOfMonth();
int dayInYear = ld.getDayOfYear();
int month = ld.getMonthValue();
int year = ld.getYear();
LocalDate lastDayOfMonth = ld.with(TemporalAdjusters.lastDayOfMonth());
String isLastDayInMonth = Period.between(ld, lastDayOfMonth).getDays() == 0 ? "Y" : "N";
LocalDate firstDayOfMonth = ld.with(TemporalAdjusters.firstDayOfMonth());
String isFirstDayInMonth = Period.between(ld, firstDayOfMonth).getDays() == 0 ? "Y" : "N";
WeekFields weekFields = WeekFields.of(DayOfWeek.MONDAY, 1);
int weekInMonth = ld.get(weekFields.weekOfMonth());
int weekInYear = ld.get(weekFields.weekOfYear());
DimDate dimDate = new DimDate();
dimDate.setDateId(dateId);
dimDate.setDate(date);
dimDate.setWeek(week);
dimDate.setWeekCn(weekCNMap.get(week));
// 这里用常规周,未使用 周 iOS标准
// 关于两者区别可查看:https://blog.csdn.net/Liunene/article/details/83024212
String weekInYearStr = weekInYear < 10 ? "0" + weekInYear : weekInYear + "";
dimDate.setYearWeek(year + "-" + weekInYearStr);
dimDate.setMonth(month);
dimDate.setYearMonth(DateUtils.getMonthDate(date));
dimDate.setQuarter(getQuarterByMonth(month));
dimDate.setYearQuarter(year + "-Q" + dimDate.getQuarter());
dimDate.setYear(year);
dimDate.setDayInYear(dayInYear);
dimDate.setDayInMonth(dayInMonth);
dimDate.setIsFirstDayInMonth(isFirstDayInMonth);
dimDate.setIsLastDayInMonth(isLastDayInMonth);
dimDate.setWeekInMonth(weekInMonth);
dimDate.setWeekInYear(weekInYear);
/**********************判断日期类型*******************************/
List<String> holidayList = holidaysMap.get("HOLIDAY");
List<String> exchangingHolidayList = holidaysMap.get("EXCHANGING_HOLIDAY");
String dateType = "";
if (holidayList.contains(date)) {
dateType = DateTypeEnum.holiday.getCode();
} else if (exchangingHolidayList.contains(date)) {
dateType = DateTypeEnum.exchanging_holiday.getCode();
} else if (week == 6 || week == 7) {
dateType = DateTypeEnum.weekend.getCode();
} else {
dateType = DateTypeEnum.workday.getCode();
}
dimDate.setDateType(dateType);
return dimDate;
}
public static int getQuarterByMonth(int month) {
int quarter = -1;
if (month == 1 || month == 2 || month == 3) {
quarter = 1;
} else if (month == 4 || month == 5 || month == 6) {
quarter = 2;
} else if (month == 7 || month == 8 || month == 9) {
quarter = 3;
} else if (month == 10 || month == 11 || month == 12) {
quarter = 4;
}
return quarter;
}
public static void main(String[] args) throws Exception {
// System.out.println(getAfterDays("2020-01-01", 35));
getHolidayDates(2020);
// System.out.println(DateUtils.getDistanceBetweenDates("2020-03-03", "2020-02-08"));
List<DimDate> list = getDimDateListByYear(2021);
System.out.println(JSONObject.toJSONString(list));
}
public static Map<String, List<String>> getHolidayDates(int year) {
HashMap<String, List<String>> holidayDates = new HashMap<>();
String url = API_URL + "?key=你自己的key&date=" + year + "&type=1";
String resStr = HttpUtil.get(url);
JSONObject scrResponse = JSON.parseObject(resStr);
// 节假日
ArrayList<String> holidays = new ArrayList<>();
// 节假日调休
ArrayList<String> exchangingHolidays = new ArrayList<>();
if (200 == scrResponse.getInteger("code")) {
JSONArray list = scrResponse.getJSONArray("newslist");
for (Object o : list) {
String vacation = ((JSONObject) o).getString("vacation");
String remark = ((JSONObject) o).getString("remark");
if (StringUtils.isNotBlank(vacation)) {
String[] split = vacation.split("\\|");
holidays.addAll(Arrays.asList(split));
}
if (StringUtils.isNotBlank(remark)) {
String[] split = remark.split("\\|");
exchangingHolidays.addAll(Arrays.asList(split));
}
}
}
holidayDates.put("HOLIDAY", holidays);
holidayDates.put("EXCHANGING_HOLIDAY", exchangingHolidays);
return holidayDates;
}
}
3、使用springboot+mybatis插入到数据库中
这块儿的东西比较多,但是技术含量不高,所以不再展示;有需要的同学可以下载项目到本地,直接运行即可!
Java项目地址:https://download.csdn.net/download/NDF923/19060062
最终插入数据库中的数据如下:
三、导入到hive中
1、hive表创建sql
create table dim_date(
date_id String COMMENT '日期数字格式,如:20200101',
date_value String COMMENT '日期字符串格式,如:2020-01-01',
week_num String COMMENT '周几的数字,如:1,2,3,4,5,6,7',
week_cn String COMMENT '周几的中文表示,如:星期一',
year_week String COMMENT '年周组合,如:2020-34',
month_num String COMMENT '月份数字,如:1,2...12',
year_month_value String COMMENT '年月组合,如:2020-01',
quarter_num String COMMENT '季度数字,如:1,2,3,4',
year_quarter String COMMENT '年季度组合,如:2020-Q1',
year_num String COMMENT '年数字,如:2020',
day_in_year String COMMENT '一年中的第几天,如:365',
day_in_month String COMMENT '一月中的第几天,如:28',
is_first_day_in_month String COMMENT '是否是月的第一天(Y:是;N:否)',
is_last_day_in_month String COMMENT '是否是月的最后一天(Y:是;N:否)',
week_in_month String COMMENT '月的第几周,如:1',
week_in_year String COMMENT '年的第几周,如:30',
date_type String COMMENT '日期类型:工作日:WORKDAY;国家法定节假日:HOLIDAY;周末休息日:WEEKEND;节假日调休:EXCHANGING_HOLIDAY'
)
row format delimited
fields terminated by '\001'
lines terminated by '\n'
stored as textfile;
注:字段分隔一定要用 "\001",这样可以和mysql数据库的分隔符对应上
2、sqoop导入
sqoop import \
--connect jdbc:mysql://192.168.100.154:3306/test \
--username root \
--password a123456 \
--table dim_date \
--fields-terminated-by "\001" \
--hive-import \
--create-hive-table \
--hive-database dim \
--hive-table dim_date \
--delete-target-dir \
--hive-drop-import-delims \
-m 1
到此,任务完成!!!
-------------------------------------------------------------------------------------------华丽的分割线---------------------------------------------------------------------------------------------------------------------------------------
最近根据实际需求,比如每天更新统计数据时,要更新当周、当月、当季度的数据,因此新增了如下字段:
具体可下载项目,解压后导入idea即可,项目有也包含mysql和hive的所有SQL脚本;Java代码项目地址:https://download.csdn.net/download/NDF923/19060062