Hive UDF to Get the Maximum Partition Date


1. For business reasons we need to get the latest partition date of a table. In this example the partition dates are in the 20191011 format; the 2019-10-11 format appears to work as well. For example:

select maxpt('default.table1');

2. The pom.xml used is as follows:

	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<hive.version>3.1.0</hive.version>
		<hadoop.version>3.1.0</hadoop.version>
	</properties>

	<dependencies>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-client</artifactId>
			<version>${hadoop.version}</version>
			<exclusions>
				<exclusion>
					<groupId>log4j</groupId>
					<artifactId>log4j</artifactId>
				</exclusion>
				<exclusion>
					<groupId>org.slf4j</groupId>
					<artifactId>slf4j-log4j12</artifactId>
				</exclusion>
			</exclusions>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.hadoop</groupId>
			<artifactId>hadoop-common</artifactId>
			<version>${hadoop.version}</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.hive</groupId>
			<artifactId>hive-exec</artifactId>
			<version>${hive.version}</version>
			<scope>provided</scope>
		</dependency>
	</dependencies>

3. The implementation code:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;

import java.net.URI;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public final class MaxPtProd extends UDF {

    public Text evaluate(final Text s) {
        if (s == null) {
            return null;
        }

        // Strip quotes and '=' so that maxpt('default.table1') and maxpt("default.table1") both work
        String newStr = s.toString().replace("'", "").replace("\"", "").replace("=", "");
        System.out.println(newStr);

        // Split "db.table" into database and table name and build the table's warehouse directory
        String dbName = newStr.split("\\.")[0];
        String tableName = newStr.split("\\.")[1];
        String fileName = "/warehouse/tablespace/managed/hive/" + dbName + ".db/" + tableName;
        System.out.println(fileName);

        String result = null;
        try {
            result = getFileList(fileName);
        } catch (Exception e) {
            System.out.println("Failed to read the partition list: " + e.getMessage());
        }
        return result == null ? null : new Text(result);
    }

    // HDFS HA client configuration, see https://www.jianshu.com/p/ac75432d811b
    public static String getFileList(String path) throws Exception {
        String res = null;

        // Do not load default resources; the NameNode HA settings of the cluster are set explicitly below
        Configuration conf = new Configuration(false);
        String nameservices = "gtdata";
        String[] namenodesAddr = {"data02:8020", "data03:8020"};
        String[] namenodes = {"nn1", "nn2"};

        conf.set("fs.defaultFS", "hdfs://" + nameservices);
        conf.set("dfs.nameservices", nameservices);
        conf.set("dfs.ha.namenodes." + nameservices, namenodes[0] + "," + namenodes[1]);
        conf.set("dfs.namenode.rpc-address." + nameservices + "." + namenodes[0], namenodesAddr[0]);
        conf.set("dfs.namenode.rpc-address." + nameservices + "." + namenodes[1], namenodesAddr[1]);
        conf.set("dfs.client.failover.proxy.provider." + nameservices,
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

        // List the partition directories under the table directory
        FileSystem hdfs = FileSystem.get(URI.create(path), conf);
        FileStatus[] fs = hdfs.listStatus(new Path(path));
        Path[] listPath = FileUtil.stat2Paths(fs);

        // Each partition path ends with the date, e.g. .../table1/dt=20191011,
        // so the last 8 characters are parsed as the partition date
        List<Integer> list = new ArrayList<>();
        for (Path p : listPath) {
            System.out.println(p);
            String str = p.toString();
            String number = str.substring(str.length() - 8);
            try {
                list.add(Integer.parseInt(number));
            } catch (Exception e) {
                System.out.println("Skipping path without a date suffix: " + number);
            }
        }
        if (!list.isEmpty()) {
            res = Collections.max(list).toString();
        }
        return res;
    }
}
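After the class above is packaged into a jar (for example with mvn clean package), it can be registered in Hive and called as shown in step 1. This is a minimal sketch; the jar path and file name below are placeholders and should be replaced with your actual build output:

-- the jar path and name here are placeholders, adjust to your build output
ADD JAR /path/to/maxpt-udf.jar;
CREATE TEMPORARY FUNCTION maxpt AS 'MaxPtProd';

-- returns the largest partition date under the table's warehouse directory, e.g. 20191011
select maxpt('default.table1');

CREATE TEMPORARY FUNCTION only lasts for the current session; if the jar is uploaded to HDFS, a permanent function can be created with CREATE FUNCTION ... USING JAR instead.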

4. Alternatively, the latest partition can be obtained from the Hive metastore MySQL database by querying the TBLS and PARTITIONS tables, as sketched below.
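A minimal sketch of such a query, assuming the standard metastore schema (DBS, TBLS, PARTITIONS) and a single date partition key; table and column names may differ between Hive versions:

-- PART_NAME is stored as e.g. 'dt=20191011'; take the part after '=' and return the maximum
SELECT MAX(SUBSTRING_INDEX(p.PART_NAME, '=', -1)) AS max_pt
FROM PARTITIONS p
JOIN TBLS t ON p.TBL_ID = t.TBL_ID
JOIN DBS d ON t.DB_ID = d.DB_ID
WHERE d.NAME = 'default'
  AND t.TBL_NAME = 'table1';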
