1. Customer requirement
Calculate the monthly workload from the working days of each month, taking national statutory holidays into account (it must be implemented, no matter what method we use).
2. Current situation
Hive's built-in functions do not support this requirement, so we have to write our own. Even the impossible has to be made possible; the customer is god, after all.
3. Approach
Hive supports user-defined functions (UDFs), so we implement one following its conventions.
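For reference, the contract is small: extend org.apache.hadoop.hive.ql.exec.UDF and expose one or more public evaluate methods, which Hive resolves by reflection at call time. A minimal sketch (MyUpper is an illustrative name, not part of this project):
package org.example;
import org.apache.hadoop.hive.ql.exec.UDF;
// Minimal sketch of the legacy UDF contract: Hive locates evaluate()
// by reflection and maps the SQL arguments onto its parameters.
public final class MyUpper extends UDF {
    public String evaluate(String s) {
        return s == null ? null : s.toUpperCase();
    }
}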
3.1. Create a Maven project
The pom.xml is as follows:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>hive-udf</artifactId>
    <version>1.0-SNAPSHOT</version>

    <name>hive-udf</name>
    <url>http://www.example.com</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>3.1.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.hive/hive-exec -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugin versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <!-- clean lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#clean_Lifecycle -->
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.1.0</version>
                </plugin>
                <!-- default lifecycle, jar packaging: see https://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.8.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.22.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
                <!-- site lifecycle, see https://maven.apache.org/ref/current/maven-core/lifecycles.html#site_Lifecycle -->
                <plugin>
                    <artifactId>maven-site-plugin</artifactId>
                    <version>3.7.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-project-info-reports-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
3.2. Create the custom UDF
First, create a new class that extends UDF; it contains one or more evaluate methods, as follows:
The WorkDay class
package org.example;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.text.SimpleDateFormat;
import java.util.*;

public final class WorkDay extends UDF {

    // Returns the working days of a month, excluding weekends only;
    // statutory holidays are handled later in evaluate().
    public static List<Date> getWorkDays(int year, int month) {
        // Collects the working days of the month
        List<Date> dates = new ArrayList<Date>();
        Calendar cal = Calendar.getInstance();
        // Set the year
        cal.set(Calendar.YEAR, year);
        // Set the month (Calendar months are 0-based)
        cal.set(Calendar.MONTH, month - 1);
        // Start at the first day of the month
        cal.set(Calendar.DATE, 1);
        // Iterate until the calendar rolls into the next month;
        // the year check covers December rolling over into January
        while (cal.get(Calendar.YEAR) == year && cal.get(Calendar.MONTH) == month - 1) {
            // Which day of the week is the current day?
            int day = cal.get(Calendar.DAY_OF_WEEK);
            // Keep the date if it is neither Saturday nor Sunday
            if (!(day == Calendar.SUNDAY || day == Calendar.SATURDAY)) {
                dates.add((Date) cal.getTime().clone());
            }
            // Advance by one day
            cal.add(Calendar.DATE, 1);
        }
        // Working days of the month, weekends excluded
        return dates;
    }

    /**
     * @param dateList dates to format
     * @return the dates as yyyy-MM-dd strings
     */
    public static List<String> getDateString(List<Date> dateList) {
        // Collects the date strings
        List<String> dateString = new ArrayList<>();
        SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd");
        for (Date date : dateList) {
            dateString.add(simpleDateFormat.format(date));
        }
        return dateString;
    }

    public static String evaluate(int year, int month) {
        // Working days with weekends excluded
        List<Date> workDays = getWorkDays(year, month);
        // The same days as yyyy-MM-dd strings
        List<String> dateString = getDateString(workDays);
        // Statutory holiday adjustments read from HDFS
        List<String> holidays = getHolidays();
        // Apply the adjustments: '+' adds a makeup working day,
        // '-' removes a statutory holiday
        for (String item : holidays) {
            String date = item.substring(1); // strip the leading sign
            if (item.charAt(0) == '+') {
                // Makeup working day: count it only when it falls in the
                // requested year and month and is not already present
                if (!dateString.contains(date)
                        && Integer.parseInt(date.substring(0, 4)) == year
                        && Integer.parseInt(date.substring(5, 7)) == month) {
                    dateString.add(date);
                }
            } else {
                // Statutory holiday: remove it (a no-op if it is absent)
                dateString.remove(date);
            }
        }
        System.out.println("Work Day! " + dateString.size()); // debug output
        return String.valueOf(dateString.size());
    }

    public static void main(String[] args) {
        System.out.println(evaluate(2020, 5));
    }

    /**
     * @return the statutory holiday adjustment lines
     */
    public static List<String> getHolidays() {
        return ReadHdfsTxt.readTxtFile("/lib/Holidays.txt");
        // return ReadTxt.readTxtFile("d:\\Holidays.txt"); // local testing
    }
}
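As a quick sanity check of the weekend logic, here is a small JUnit test (WorkDayTest is our own addition, using the junit dependency already declared in the pom). May 2020 has 31 days, 10 of which fall on weekends:
package org.example;
import static org.junit.Assert.assertEquals;
import org.junit.Test;
public class WorkDayTest {
    @Test
    public void mayHas21Weekdays() {
        // 31 days in May 2020 minus 5 Saturdays and 5 Sundays
        assertEquals(21, WorkDay.getWorkDays(2020, 5).size());
    }
}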
The Timeiter class
package org.example;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
public class Timeiter {

    // public static void main(String[] args) {
    //     System.out.println(getDays("2018-06-28", "2018-07-03"));
    // }

    /**
     * Returns every date between two dates, inclusive.
     *
     * @param startTime start date, yyyy-MM-dd
     * @param endTime   end date, yyyy-MM-dd
     * @return the dates in between as yyyy-MM-dd strings
     */
    public static List<String> getDays(String startTime, String endTime) {
        // Dates to return
        List<String> days = new ArrayList<String>();
        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
        try {
            Date start = dateFormat.parse(startTime);
            Date end = dateFormat.parse(endTime);
            Calendar tempStart = Calendar.getInstance();
            tempStart.setTime(start);
            Calendar tempEnd = Calendar.getInstance();
            tempEnd.setTime(end);
            tempEnd.add(Calendar.DATE, 1); // one past the end, so the end date is included
            while (tempStart.before(tempEnd)) {
                days.add(dateFormat.format(tempStart.getTime()));
                tempStart.add(Calendar.DAY_OF_YEAR, 1);
            }
        } catch (ParseException e) {
            e.printStackTrace();
        }
        return days;
    }
}
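Note that WorkDay does not actually call Timeiter; it is a standalone helper for expanding a date range. A quick illustration of its behavior (the demo class is our own, not part of the original project):
package org.example;
import java.util.List;
public class TimeiterDemo {
    public static void main(String[] args) {
        // Inclusive on both ends: prints [2020-12-30, 2020-12-31, 2021-01-01]
        List<String> days = Timeiter.getDays("2020-12-30", "2021-01-01");
        System.out.println(days);
    }
}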
The ReadHdfsTxt class
package org.example;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * @author jizhuang.wang
 * @version V1.0
 * @date 2020-12-18 14:44
 */
public class ReadHdfsTxt {

    /**
     * Reads a text file from HDFS, one line per list entry.
     */
    public static List<String> readTxtFile(String remoteFilePath) {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://node1.bankbigdata.cn:8020");
        if (StringUtils.isEmpty(remoteFilePath)) {
            remoteFilePath = "/lib/Holidays.txt"; // default HDFS path
        }
        List<String> holidays = new ArrayList<>();
        try {
            FileSystem fs = FileSystem.get(conf);
            Path remotePath = new Path(remoteFilePath);
            FSDataInputStream in = fs.open(remotePath);
            BufferedReader d = new BufferedReader(new InputStreamReader(in));
            String lineTxt = null;
            while ((lineTxt = d.readLine()) != null) {
                holidays.add(lineTxt);
            }
            d.close();
            in.close();
            fs.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return holidays;
    }

    /**
     * Main entry point for a quick manual check.
     */
    public static void main(String[] args) {
        String remoteFilePath = "/lib/Holidays.txt"; // HDFS path
        try {
            System.out.println("Reading file: " + remoteFilePath);
            ReadHdfsTxt.readTxtFile(remoteFilePath);
            System.out.println("\nDone reading");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
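One caveat with the code above: FileSystem.get() returns a JVM-wide cached instance, so fs.close() can break other code in the same process that shares it, and the streams leak if an exception is thrown mid-read. A safer variant of the same read, as a sketch of our own (not the original): it uses the non-deprecated fs.defaultFS key, closes the streams with try-with-resources, and deliberately leaves the cached FileSystem open.
package org.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
public class ReadHdfsTxtSafe {
    public static List<String> readTxtFile(String remoteFilePath) throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://node1.bankbigdata.cn:8020");
        List<String> lines = new ArrayList<>();
        // The cached, shared FileSystem is intentionally not closed here
        FileSystem fs = FileSystem.get(conf);
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(remoteFilePath)), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        }
        return lines;
    }
}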
The ReadTxt class
package org.example;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
/**
 * @author jizhuang.wang
 * @version V1.0
 * @date 2020-12-18 12:28
 */
public class ReadTxt {

    /**
     * Reads a local text file, one line per list entry.
     * Steps: get a handle on the file, wrap its byte stream in a reader
     * with the right encoding, then read it line by line with readLine().
     * Note: error handling matters here.
     *
     * @param filePath path of the file to read
     */
    public static List<String> readTxtFile(String filePath) {
        List<String> holidays = new ArrayList<>();
        try {
            String encoding = "GBK";
            File file = new File(filePath);
            if (file.isFile() && file.exists()) { // does the file exist?
                InputStreamReader read = new InputStreamReader(
                        new FileInputStream(file), encoding); // honor the file encoding
                BufferedReader bufferedReader = new BufferedReader(read);
                String lineTxt = null;
                while ((lineTxt = bufferedReader.readLine()) != null) {
                    holidays.add(lineTxt);
                }
                read.close();
            } else {
                System.out.println("The specified file was not found");
            }
        } catch (Exception e) {
            System.out.println("Error reading the file contents");
            e.printStackTrace();
        }
        return holidays;
    }
}
3.3. Package the project
Run mvn clean install (or Maven -> clean -> install in the IDE).
The built jar appears under target/ (xxxx.jar).
4. Upload, invoke, and verify
4.1. Upload the jar to HDFS
Upload the jar to HDFS so it can be referenced in distributed execution.
Contents of Holidays.txt (a leading + marks a weekend day that is a makeup working day; a leading - marks a weekday that is a statutory holiday):
+2020-01-26
-2020-01-02
-2020-01-03
-2020-03-05
+2020-12-19
+2020-12-26
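To make the convention concrete: each line is a sign character followed by a yyyy-MM-dd date, which is exactly how evaluate() above picks it apart. A tiny parse sketch (HolidayLineDemo is our own illustration, not part of the project):
package org.example;
public class HolidayLineDemo {
    public static void main(String[] args) {
        String line = "+2020-12-19";
        boolean makeupWorkday = line.charAt(0) == '+'; // '-' would mean a statutory holiday
        String date = line.substring(1);               // "2020-12-19"
        int year = Integer.parseInt(date.substring(0, 4));  // 2020
        int month = Integer.parseInt(date.substring(5, 7)); // 12
        System.out.println(makeupWorkday + " " + year + " " + month + " " + date);
    }
}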
4.2. Create a temporary function
Open DBeaver (or any SQL client connected to Hive).
-- add the jar and the holiday file
add jar hdfs://node1.bankbigdata.cn:8020/lib/hive-udf-1.0-SNAPSHOT.jar;
add file hdfs://node1.bankbigdata.cn:8020/lib/Holidays.txt;
-- create the temporary function
CREATE TEMPORARY FUNCTION work_hdfs_days AS 'org.example.WorkDay';
-- invoke the function
SELECT work_hdfs_days(2020, 12);
With the Holidays.txt above, the call returns 25: December 2020 has 23 plain weekdays, plus the two makeup working days 2020-12-19 and 2020-12-26.
Done!