[Hive] A UDF for Getting the Latest Partition of a Hive Table

In an offline data warehouse you often hit the case where a fact table only needs to join against the latest partition of a dimension table. A UDF that returns a table's latest partition handles this neatly; the code below finds it by listing the table's partition directories on HDFS.
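The lookup relies on Hive's standard warehouse layout, in which every partition of a table is a subdirectory named key=value under the table's directory. For the example table used later, the layout might look like this (the dt key and its values are made up for illustration):

/user/hive/warehouse-3.1.1/temp.db/temp_partition1_tb/dt=2021-06-01
/user/hive/warehouse-3.1.1/temp.db/temp_partition1_tb/dt=2021-06-02
/user/hive/warehouse-3.1.1/temp.db/temp_partition1_tb/dt=2021-06-03

Splitting each directory name on "=" and taking the lexicographic maximum of the values yields 2021-06-03 as the latest partition.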

First, the Maven setup. The relevant pom.xml fragments:

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>aliyun</id>
            <url>https://maven.aliyun.com/repository/public</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
    </repositories>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <hadoop.version>2.7.6</hadoop.version>
        <hive.version>2.1.1</hive.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive.version}</version>
        </dependency>
    </dependencies>
The UDF implementation (the package name matches the class registered in Hive below):

package com.fuyun.udf;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class GetNewestPartition extends UDF {

    /**
     * Takes a table name of the form "db.table" and returns the table's
     * latest partition value, found by listing its directory on HDFS.
     */
    public Text evaluate(Text tableName) {
        String[] parts = tableName.toString().split("\\.");
        String dbName = parts[0];
        String tblName = parts[1];
        // Build the table's warehouse path,
        // e.g. /user/hive/warehouse-3.1.1/temp.db/temp_partition1_tb
        String tablePath = "/user/hive/warehouse-3.1.1/" + dbName + ".db/" + tblName;
        String newestPartition = null;
        try {
            newestPartition = getFileList(tablePath);
        } catch (Exception e) {
            System.out.println("Failed to fetch the latest partition: " + e.getMessage());
        }
        return newestPartition == null ? null : new Text(newestPartition);
    }

    public static String getFileList(String path) throws Exception {
        String res = null;

        Configuration conf = new Configuration(false);
        // NameNode address; adjust to your cluster
        conf.set("fs.defaultFS", "hdfs://192.168.235.66:8020/");
        FileSystem hdfs = FileSystem.get(URI.create(path), conf);
        FileStatus[] fileStatuses = hdfs.listStatus(new Path(path));
        Path[] listPath = FileUtil.stat2Paths(fileStatuses);

        // Each partition is a subdirectory named key=value; collect the values
        List<String> list = new ArrayList<>();
        for (Path p : listPath) {
            String s = p.toString();
            if (!s.contains("=")) {
                continue; // skip entries that are not key=value partition directories
            }
            list.add(s.split("=")[1]);
        }
        if (!list.isEmpty()) {
            // Partition values sort lexicographically, so max() is the latest
            res = Collections.max(list);
        }
        return res;
    }
}

Then package the project with Maven and upload the jar to HDFS:

hdfs dfs -put hive-UDF-1.0.0-1.0-SNAPSHOT.jar /user/hive/udf/

Log in to the Hive client and create the function from the jar on HDFS:

create function getnewest_partition as 'com.fuyun.udf.GetNewestPartition' using jar 'hdfs:/user/hive/udf/hive-UDF-1.0.0-1.0-SNAPSHOT.jar';

select getnewest_partition('temp.temp_partition1_tb');
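With the function registered, the fact-to-dimension join from the introduction no longer needs a hardcoded partition value. A minimal sketch; fact_tb, dim_tb, the temp database, and the join and partition columns are hypothetical names standing in for a real schema:

select f.*
from fact_tb f
join dim_tb d
  on f.dim_id = d.id
 and d.dt = getnewest_partition('temp.dim_tb');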
