目录
Presto-udf开源项目地址:
https://github.com/qubole/presto-udfs
presto推荐jdk版本:Zulu OpenJDK11,相比较openjdk,oracle jdk,jvm占用内存更少。Github的版本也是基于Zulu OpenJDK开发。
Zulu OpenJDK11下载地址:
presto官方文档:
https://prestosql.io/docs/current/index.html
presto function文档:
https://prestosql.io/docs/current/develop/functions.html
Presto函数大致有三种:
1.scalar函数
标量函数,类似于Java中的静态方法,无状态
scalar函数常用注解
@Description() //函数的描述
@ScalarFunction()// 函数名称
@SqlType() //指定数据类型,修饰方法或参数
@SqlNullable //允许为null 修饰方法或参数
最近在将hive里面的自定义udf函数转换成presto里面的自定义udf。
举例:有如下需求
说明:该函数的作用是将输入格式的日期转化为输出格式的日期。
写法:dateformat(输入日期字段值, 输入日期格式, 输出日期格式)
hive->presto
POST格式表示某个日期到1900年1月1日的天数
UNIX格式表示某个时间到1970年1月1日0点0分0秒纪元(UTC)经过的秒数
需求:
1、输入格式POST、UNIX类型,输出yyyyMMdd格式
SELECT UDF_DATEFORMAT(42351,'POST','yyyyMMdd'); --输出结果:20151214
SELECT UDF_DATEFORMAT(1503417600,'UNIX','yyyyMMdd'); --输出结果:20170823
select udf_dateformat(1503417600,'UNIX','yyyyMMddHHmmss');
2、输入格式DATE、TIMESTAMP类型,输出yyyyMMdd格式
SELECT UDF_DATEFORMAT('2015-12-30 12:12:12','DATE','yyyyMMdd'); --输出结果:20151230
SELECT UDF_DATEFORMAT('2015-12-30 12:12:12.203','TIMESTAMP','yyyyMMdd'); --输出结果:20151230
3、输入格式yyyyMMdd、yyyy-MM-dd类型,输出POST,UNIX格式
SELECT UDF_DATEFORMAT('20151214','yyyyMMdd','POST'); --输出结果:42351
SELECT UDF_DATEFORMAT('2017-08-23','yyyy-MM-dd','UNIX'); --输出结果:1503417600
select udf_dateformat(1503417600,'UNIX','yyyyMMddHHmmss');
等同于(输出结果一样)hive中:select from_unixtime(1503417600,'yyyyMMdd'); --输出结果:20170823
代码实现:
import io.airlift.slice.Slice;
import io.prestosql.spi.function.Description;
import io.prestosql.spi.function.ScalarFunction;
import io.prestosql.spi.function.SqlType;
import io.prestosql.spi.type.StandardTypes;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import static io.airlift.slice.Slices.utf8Slice;
public class UdfDateformat {
private static final String POST_BASE_DAY = "19000101";//hive的post日期以19000101为基准
private static final String JAVA_BASE_DAY = "19700101";//Java日期以19700101为基准
private static final long DAY_OF_TIMESTAMP = 86400000;//一天的毫秒值
private static final String INPUT_TYPE_POST = "POST";
private static final String INPUT_TYPE_UNIX = "UNIX";
private static final String INPUT_TYPE_DATE = "DATE";
private static final String INPUT_TYPE_TIMESTAMP = "TIMESTAMP";
private static final String OUTPUT_TYPE_POST = "POST";
private static final String OUTPUT_TYPE_UNIX = "UNIX";
private static final String FORMAT_TYPE_YMDHMS = "yyyyMMddHHmmss";
private static final String FORMAT_TYPE_YMD = "yyyyMMdd";
private static final String FORMAT_TYPE_Y_M_D = "yyyy-MM-dd";
@Description("converts the date of the input format to the date of the output format")
@ScalarFunction("udf_dateformat")
@SqlType(StandardTypes.VARCHAR)
public static Slice evaluate(@SqlType(StandardTypes.VARCHAR) Slice dateVal, @SqlType(StandardTypes.VARCHAR) Slice inputFmtStr, @SqlType(StandardTypes.VARCHAR) Slice outputFmtStr){
String returnDt = "";
if ((dateVal == null) || (inputFmtStr == null) || (outputFmtStr == null)) {
return null;
}
if (("".equals(dateVal.toStringUtf8()) || ("".equals(inputFmtStr.toStringUtf8())) || ("".equals(outputFmtStr.toStringUtf8())))) {
return null;
}
if(INPUT_TYPE_DATE.equals(inputFmtStr.toStringUtf8()) || INPUT_TYPE_TIMESTAMP.equals(inputFmtStr.toStringUtf8())){
returnDt = InputFormatDateOrTimestamp(dateVal.toStringUtf8(),inputFmtStr.toStringUtf8(),outputFmtStr.toStringUtf8());
}else if(OUTPUT_TYPE_POST.equals(outputFmtStr.toStringUtf8()) || OUTPUT_TYPE_UNIX.equals(outputFmtStr.toStringUtf8())){
returnDt = OutputFormatPostOrUnix(dateVal.toStringUtf8(),inputFmtStr.toStringUtf8(),outputFmtStr.toStringUtf8());
}
return utf8Slice(returnDt);
}
@Description("converts the date of the input format to the date of the output format")
@ScalarFunction("udf_dateformat")
@SqlType(StandardTypes.VARCHAR)
public static Slice evaluate(@SqlType(StandardTypes.BIGINT) long dateVal, @SqlType(StandardTypes.VARCHAR) Slice inputFmtStr, @SqlType(StandardTypes.VARCHAR) Slice outputFmtStr){
String returnDt;
if ((inputFmtStr == null) || (outputFmtStr == null)) {
return null;
}
if (("".equals(inputFmtStr.toStringUtf8())) || ("".equals(outputFmtStr.toStringUtf8()))) {
return null;
}
returnDt = InputFormatPostOrUnix((int) dateVal,inputFmtStr.toStringUtf8(),outputFmtStr.toStringUtf8());
if(returnDt != null){
return utf8Slice(returnDt);
}
return null;
}
private static String InputFormatPostOrUnix(int dateVal, String inputFmtStr,String outputFmtStr) {
String returnDt = "";
try {
SimpleDateFormat formatYMD = new SimpleDateFormat(FORMAT_TYPE_YMD);//默认格式方式
SimpleDateFormat format = new SimpleDateFormat(outputFmtStr);//输出日期格式
Date date;
if ((INPUT_TYPE_POST.equals(inputFmtStr))) {
date = formatYMD.parse(POST_BASE_DAY);
Calendar ca = Calendar.getInstance();
ca.setTime(date);
ca.add(Calendar.DAY_OF_YEAR, dateVal);
date = ca.getTime();
if(date.after(formatYMD.parse(JAVA_BASE_DAY))){
date.setTime(date.getTime() - DAY_OF_TIMESTAMP);
}
//输入默认按照yyyyMMdd格式化 若输入格式为其他则按照对应方式格式化输入日期值
if(!(FORMAT_TYPE_YMD.equals(outputFmtStr))){
returnDt = format.format(date);
}else{
returnDt = formatYMD.format(date);
}
}else if ((INPUT_TYPE_UNIX.equals(inputFmtStr))){
if(FORMAT_TYPE_YMDHMS.equals(outputFmtStr)){
format = new SimpleDateFormat(FORMAT_TYPE_YMD);
}
date = new Date(Long.parseLong(dateVal+"000"));//秒转换为毫秒
returnDt = format.format(date);
}else {
return "";
}
} catch (ParseException parseException) {
System.out.println(parseException.getMessage());
}
return returnDt;
}
private static String OutputFormatPostOrUnix(String dateVal, String inputFmtStr, String outputFmtStr) {
String returnDt = "";
try {
SimpleDateFormat formatYMD = new SimpleDateFormat(FORMAT_TYPE_YMD);
SimpleDateFormat format = new SimpleDateFormat(inputFmtStr);
Date date ;
if ((OUTPUT_TYPE_POST.equals(outputFmtStr))) {
//输入默认按照yyyyMMdd格式化 若输入格式为其他则按照对应方式格式化输入日期值
if(!(FORMAT_TYPE_YMD.equals(inputFmtStr))){
date = format.parse(dateVal);
}else{
date = formatYMD.parse(dateVal);
}
Date basedate = formatYMD.parse(POST_BASE_DAY);
if(date.after(formatYMD.parse(JAVA_BASE_DAY))){
date.setTime(date.getTime() + DAY_OF_TIMESTAMP);
}
returnDt = String.valueOf((date.getTime() - basedate.getTime()) / DAY_OF_TIMESTAMP);
}else if ((OUTPUT_TYPE_UNIX.equals(outputFmtStr))){
date = format.parse(dateVal);
returnDt =String.valueOf(date.getTime()/1000);//毫秒转换为秒
}
} catch (ParseException parseException) {
System.out.println(parseException.getMessage());
}
return returnDt;
}
private static String InputFormatDateOrTimestamp(String dateVal, String inputFmtStr, String outputFmtStr) {
String returnDt = "";
if (UdfDateformat.INPUT_TYPE_DATE.equals(inputFmtStr) || UdfDateformat.INPUT_TYPE_TIMESTAMP.equals(inputFmtStr)) {
try {
SimpleDateFormat formatY_M_D = new SimpleDateFormat(FORMAT_TYPE_Y_M_D);
SimpleDateFormat format = new SimpleDateFormat(outputFmtStr);
Date date = formatY_M_D.parse(dateVal);
returnDt = format.format(date);
} catch (ParseException e) {
System.out.println(e.getMessage());
}
}
return returnDt;
}
}
插件注册:
public class UdfPlugin implements Plugin
{
@Override
public Set<Class<?>> getFunctions()
{
return ImmutableSet.<Class<?>>builder()
.add(UdfDateformat.class)
.build();
}
}
加载plugin资源 :
在目录src/main/resources/META-INF/services 中新建名为 com.facebook.presto.spi.Plugin 的文本文件,内容为插件注册类的全类名,如com.xxxx.xxx.xxx.UdfPlugin
pom文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- if this lives as a subdirectory of your own project, then you'll need to do do the following workaround:
1. Create a file: src/main/resources/META-INF/services/io.prestosql.spi.Plugin
2. Put the value presto.udfs.UdfPlugin in it
-->
<properties>
<presto.version>346</presto.version>
<slice.version>0.36</slice.version>
<guava.version>26.0-jre</guava.version>
<junit.version>4.12</junit.version>
<javax.inject.version>1</javax.inject.version>
<io.airlift.log.version>0.183</io.airlift.log.version>
<jodatime.version>2.8.1</jodatime.version>
<hive.version>2.3.2</hive.version>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-release-plugin</artifactId>
<version>2.5.1</version>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<artifactSet>
<!-- dont need to add airlift.slice, presto provides it to all plugins -->
<includes>
<include>com.google.guava:guava</include>
<include>io.airlift:log</include>
<include>joda-time:joda-time</include>
</includes>
</artifactSet>
<relocations>
<relocation>
<pattern>org.joda</pattern>
<shadedPattern>org.joda.shaded</shadedPattern>
</relocation>
<relocation>
<pattern>com.google</pattern>
<shadedPattern>com.google.shaded</shadedPattern>
</relocation>
<relocation>
<pattern>io.airlift.log</pattern>
<shadedPattern>io.airlift.log.shaded</shadedPattern>
</relocation>
</relocations>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
<groupId>com.cplc.presto</groupId>
<artifactId>udfs</artifactId>
<version>3.0.0</version>
<name>PrestoUDFs</name>
<description>Common Functions for the Facebook Presto SQL Engine</description>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-exec -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>${hive.version}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>${junit.version}</version>
</dependency>
<dependency>
<groupId>io.prestosql</groupId>
<artifactId>presto-spi</artifactId>
<version>${presto.version}</version>
</dependency>
<dependency>
<groupId>io.prestosql</groupId>
<artifactId>presto-main</artifactId>
<version>${presto.version}</version>
</dependency>
<dependency>
<groupId>io.airlift</groupId>
<artifactId>slice</artifactId>
<version>${slice.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>javax.inject</groupId>
<artifactId>javax.inject</artifactId>
<version>${javax.inject.version}</version>
</dependency>
<dependency>
<groupId>io.airlift</groupId>
<artifactId>log</artifactId>
<version>${io.airlift.log.version}</version>
</dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>${jodatime.version}</version>
</dependency>
</dependencies>
</project>
maven打包项目放到集群就可以跑起来了。
2.aggregation函数
聚合函数,如avg函数,count函数,它是一个需要累积状态的函数,开发的难点在于维护状态
3.window函数
窗口函数