hive UDF函数取最新分区
1.pom文件
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.hive/hive-exec -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.6</version>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.10</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
<!-- 注意:仅写在 pluginManagement 中的插件不会被执行,还需在 <build><plugins> 中显式声明这些插件,否则 shade 打包不会生效 -->
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.2</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>
2.代码
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Hive UDF that returns the latest partition of a table by listing the
 * table's warehouse directory on HDFS instead of scanning the table, so it
 * answers in seconds where {@code SELECT MAX(dt)} may take hours.
 *
 * Input:  "schema.table_name"
 * Output: the greatest partition value found (lexicographic order), or
 *         {@code null} on bad input, missing partitions, or any I/O error.
 */
public class latest_partition extends UDF {
    // Warehouse root used to build the table path.
    // NOTE(review): assumes the default hive.metastore.warehouse.dir — confirm on the cluster.
    private static final String WAREHOUSE_ROOT = "/user/hive/warehouse/";

    /**
     * Resolves the HDFS directory of {@code tableName} and returns its latest partition.
     *
     * @param tableName fully qualified name, "schema.table_name"
     * @return latest partition value, or {@code null} if the input is malformed,
     *         the table has no partition directories, or an error occurs
     */
    public String evaluate(String tableName) {
        // Guard malformed input: the original split()[1] would throw
        // ArrayIndexOutOfBoundsException outside the try block and fail the query.
        if (tableName == null) {
            return null;
        }
        String[] parts = tableName.split("\\.");
        if (parts.length < 2) {
            return null;
        }
        // Path layout: /user/hive/warehouse/<schema>.db/<table>
        String fileName = WAREHOUSE_ROOT + parts[0] + ".db/" + parts[1];
        String latestPartition = null;
        try {
            latestPartition = getFileList(fileName);
        } catch (Exception e) {
            // Best-effort UDF: never fail the whole query; report and return null.
            System.out.println("获取结果异常" + e.getMessage());
        }
        return latestPartition;
    }

    /**
     * Lists the children of {@code path} on HDFS and returns the greatest
     * partition value among directories named "col=value".
     *
     * @param path HDFS directory of the table
     * @return latest partition value, or {@code null} when no partition dir exists
     * @throws Exception on HDFS access failure
     */
    public static String getFileList(String path) throws Exception {
        // Configuration(false) skips loading default resources on purpose;
        // only the namenode address below is needed.
        Configuration conf = new Configuration(false);
        // NOTE(review): "fs.default.name" is deprecated in favor of "fs.defaultFS",
        // and the cluster name is hard-coded — confirm it matches the target cluster.
        conf.set("fs.default.name", "hdfs://hacluster/");
        // Do NOT close this FileSystem: FileSystem.get returns a JVM-wide cached
        // instance that may be shared by other tasks in the same container.
        FileSystem hdfs = FileSystem.get(URI.create(path), conf);
        FileStatus[] statuses = hdfs.listStatus(new Path(path));
        List<String> partitions = new ArrayList<>();
        for (Path p : FileUtil.stat2Paths(statuses)) {
            String s = p.toString();
            // The table dir may also hold non-partition files; partition
            // directories look like ".../dt=20240101".
            if (s.contains("=")) {
                partitions.add(s.split("=")[1]);
            }
        }
        // Lexicographic max is correct for fixed-width values such as yyyyMMdd.
        return partitions.isEmpty() ? null : Collections.max(partitions);
    }
}
大表查询最新分区往往由于各种原因,可能需要几个小时,使用该函数可以实现秒级返回数据。性能可大范围提升。
-- 优化前sql查询语句(耗时特别久,全表扫描)
SELECT MAX(dt) as latest_dt FROM table_name;
-- 优化后(通过文件系统查询,数秒返回结果;假设已将上述 latest_partition 类注册为函数 LST_DT)
SELECT LST_DT('schema.table_name');