Using Java to query Hive data and upload it to HDFS, with Maven packaging of a local jar dependency (source code included)


Background

A big-data platform needs a runnable executable jar. The requirement: the jar queries a Hive table and uploads the result to HDFS.

Components

JDK 8 + Hive + HDFS

Source code

https://gitee.com/acelee723/acelee-hive-hdfs-main-jar

Code

1. Hive client class

import org.mortbay.util.ajax.JSON;

import java.sql.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Hive client utility class
 *
 * @author Ace Lee
 * @date 2019/8/27 14:18
 * @version 1.0
 **/
public class HiveClientUtils {

    private static String driverName = "com.cloudera.hive.jdbc4.HS2Driver";

    //HiveServer2 JDBC URL: the Hive host/port and database configured for the cluster
    private static String Url = "jdbc:hive2://10.10.10.10:10000/hbzfw";

    private static Connection conn;

    private static PreparedStatement ps;

    private static ResultSet rs;

    //create a JDBC connection to HiveServer2
    public static Connection getConnnection() {
        try {
            Class.forName(driverName);
            //the user supplied here must have permission to operate on HDFS,
            //otherwise the program will fail with a "permission denied" exception
            conn = DriverManager.getConnection(Url, "", "");
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
            System.exit(1);
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return conn;
    }

    public static PreparedStatement prepare(Connection conn, String sql) {
        PreparedStatement ps = null;
        try {
            ps = conn.prepareStatement(sql);
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return ps;
    }

    public static List<Map<String,Object>> getResult(String sql) {
        List<Map<String,Object>> rowDatas = new ArrayList<Map<String, Object>>();
        System.out.println(sql);
        conn = getConnnection();

        try {
            ps = prepare(conn, sql);
            rs = ps.executeQuery();
            ResultSetMetaData md = rs.getMetaData();
            int columnCount = md.getColumnCount();

            //map each row to <columnName, value>
            while (rs.next()) {
                Map<String,Object> rowData = new HashMap<String, Object>();
                for (int i = 1; i <= columnCount; i++) {
                    rowData.put(md.getColumnName(i), rs.getObject(i));
                }
                rowDatas.add(rowData);
            }
            System.out.println(JSON.toString(rowDatas));
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            //release JDBC resources
            try {
                if (rs != null) rs.close();
                if (ps != null) ps.close();
                if (conn != null) conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        return rowDatas;
    }

}
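
A quick way to exercise this class on its own is a small standalone main method. This is only a sketch: the query below reuses the example table from the main class in section 3 and must exist in your Hive database.

import java.util.List;
import java.util.Map;

public class HiveClientUtilsDemo {

    public static void main(String[] args) {
        //placeholder query; replace the table with one that exists in your database
        List<Map<String, Object>> rows =
                HiveClientUtils.getResult("select name,id_card from hbzfw.t_user limit 10");
        for (Map<String, Object> row : rows) {
            System.out.println(row);
        }
    }
}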

2. HDFS client class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

import java.io.*;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * HDFS client utility class
 *
 * @author Ace Lee
 * @date 2019/8/27 14:18
 * @version 1.0
 **/
public class HdfsFileSystem {

    /**
     * HDFS NameNode address (cluster entry point)
     */
    private static final String hdfsPath = "hdfs://10.10.10.10:8020";

    public static void copyFileToHDFSByName(Configuration conf,String localFileName, String remoteFileName) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(hdfsPath), conf);
        fs.copyFromLocalFile(new Path(localFileName), new Path(remoteFileName));
        System.out.println("copy from local file:" + localFileName + " to HDFS file:" + remoteFileName + " done.");
        fs.close();
    }

    public static void copyFileToHDFSByFileObj(byte[] bytes, String prefix, String fileName) throws IOException {
        InputStream in = null;
        try {
            Configuration conf = new Configuration();
            FileSystem fileSystem = FileSystem.get(URI.create(hdfsPath), conf);
            FSDataOutputStream out = fileSystem.create(new Path(buildPath(hdfsPath, "/document", prefix, fileName)));
            in = new ByteArrayInputStream(bytes);
            IOUtils.copyBytes(in, out, 4096, false);
            out.hsync();
            out.close();
        } finally {
            IOUtils.closeStream(in);
        }
        return;
    }

    public static void copyFileToHDFSByFileObj(String filename,String hdfsUri) throws IOException {
        if (null == hdfsUri || hdfsUri.isEmpty()) {
            System.err.println("copyFileToHDFSByFileObj: hdfsUri is required");
            return;
        }
        String localPath = HdfsFileSystem.class.getResource("").getPath();
        String localFile = localPath+filename;

        InputStream in = new FileInputStream(localFile);
        try {
            Configuration conf = new Configuration();
            FileSystem fileSystem = FileSystem.get(URI.create(hdfsPath), conf);
            FSDataOutputStream out = fileSystem.create(new Path(hdfsPath+hdfsUri+filename));
            IOUtils.copyBytes(in, out, 4096, false);
            out.hsync();
            out.close();
        } finally {
            IOUtils.closeStream(in);
        }
        return;
    }

    public static void copyFileToHDFSByFileObj(InputStream in, String prefix, String fileName) throws IOException {
        try {
            Configuration conf = new Configuration();
            FileSystem fileSystem = FileSystem.get(URI.create(hdfsPath), conf);
            FSDataOutputStream out = fileSystem.create(new Path(buildPath(hdfsPath, "/document", prefix, fileName)));
            IOUtils.copyBytes(in, out, 4096, false);
            out.hsync();
            out.close();
        } finally {
            IOUtils.closeStream(in);
        }
        return;
    }
    public static void copyFileToHDFSByFileObj(File localPath) throws IOException {
        InputStream in = null;
        if (null == localPath) {
            System.out.println("copyFileToHDFSByFileObj: localPath is required");
            return;
        }
        try {
            Configuration conf = new Configuration();
            FileSystem fileSystem = FileSystem.get(URI.create(hdfsPath), conf);
            //upload to the cluster root, keeping the local file's name as the target file name
            FSDataOutputStream out = fileSystem.create(new Path(buildPath(hdfsPath, localPath.getName())));

            in = new BufferedInputStream(new FileInputStream(localPath));
            IOUtils.copyBytes(in, out, 4096, false);
            out.hsync();
            out.close();
        } finally {
            IOUtils.closeStream(in);
        }
    }

    /*
     * Download hdfs file in URI to local file
     */
    public static void downloadFromHDFS(Configuration conf, String uri, String remoteFileName, String localFileName) throws IOException {
        Path path = new Path(remoteFileName);
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        fs.copyToLocalFile(path, new Path(localFileName));
        fs.close();
        System.out.println("downloading file from " + remoteFileName + " to " + localFileName + " succeed");
        return;
    }


    /*
     * Download hdfs file in URI to local file
     */
    public static void downloadFromHDFS(String uri, String HDFSFileName, OutputStream localFileOutPut) throws IOException {
        Configuration config = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), config);
        InputStream is = fs.open(new Path(uri + "/" + HDFSFileName));
        IOUtils.copyBytes(is, localFileOutPut, 4096, true);//close in and out stream via this API itself.
        System.out.println("downloading HDFS file " + HDFSFileName + " succeed");
        fs.close();
        return;
    }

    public static InputStream downloadFromHDFS(String uri, String HDFSFileName) throws IOException {
        Configuration config = new Configuration();
        FileSystem fs = FileSystem.get(URI.create(uri), config);
        InputStream is = fs.open(new Path(uri + HDFSFileName));
        if (is == null) {
            System.out.println("hdfs inputStream is null");
        }
        return is;
    }

    /*
     * check whether the HDFS file exists in given URI
     */
    public static boolean exists(String HDFSUri, String HDFSFileName) {
        Configuration conf = new Configuration();
        boolean fileExists = false;
        try {
            FileSystem fileSystem = FileSystem.get(URI.create(HDFSUri), conf);
            fileExists = fileSystem.exists(new Path(HDFSUri + "/" + HDFSFileName));
        } catch (IOException e) {
            System.out.println("hdfs:exist() exception occurs. exception:" + e.getMessage());
            return fileExists;
        }

        System.out.println("HDFS URI:" + HDFSUri + ", fileName:" + HDFSFileName + " exists ? " + fileExists);
        return fileExists;
    }

    /**
     * List the files under a directory
     *
     * @param uri
     * @param folder
     * @throws IOException
     */
    public static void ls(String uri, String folder) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(folder);
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        FileStatus[] list = fs.listStatus(path);
        System.out.println("ls: " + folder);
        System.out.println("==========================================================");
        for (FileStatus f : list) {
            System.out.printf("name: %s, folder: %s, size: %d\n", f.getPath(), f.isDirectory(), f.getLen());
        }
        System.out.println("==========================================================");
        fs.close();
    }

    /**
     * Delete a file or directory
     * @param uri
     * @param filePath
     * @throws IOException
     */
    public static void delete(String uri, String filePath) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path(filePath);
        FileSystem fs = FileSystem.get(URI.create(uri), conf);
        //deleteOnExit() takes effect when the FileSystem is closed below
        fs.deleteOnExit(path);
        System.out.println("Delete: " + filePath);
        fs.close();
    }


    public static String getCurrentDatePath(){
        return new SimpleDateFormat("yyyy/MM/dd/").format(new Date());
    }

    public static String getCurrentDateTime(){
        return new SimpleDateFormat("yyyyMMddHHmmssSSS").format(new Date());
    }

    public static String buildPath(String... paths) {
        StringBuilder buffer = new StringBuilder(paths.length > 0 ? paths[0] : "");
        if (paths.length >= 2) {
            for (int i = 1; i < paths.length; i++) {
                if (paths[i] == null || paths[i].length() == 0) {
                    continue;
                }
                //join segments with exactly one "/" between them
                if (paths[i - 1].endsWith("/")) {
                    if (paths[i].startsWith("/")) {
                        buffer.append(paths[i].substring(1));
                    } else {
                        buffer.append(paths[i]);
                    }
                } else {
                    if (paths[i].startsWith("/")) {
                        buffer.append(paths[i]);
                    } else {
                        buffer.append("/").append(paths[i]);
                    }
                }
            }
        }
        return buffer.toString();
    }


    public static String writeFile(String data) throws Exception {
        //note: getResource("") resolves to the classpath location of this class;
        //it must be a writable directory in the runtime environment
        String localPath = HdfsFileSystem.class.getResource("").getPath();
        String localFilename = getCurrentDateTime() + ".txt";
        localPath += localFilename;
        File file = new File(localPath);

        //if the file doesn't exist, create it
        if (!file.exists()) {
            file.createNewFile();
        }

        //write to the full path (not just file.getName()), so the upload and delete methods can find it later
        FileWriter fileWritter = new FileWriter(file);
        BufferedWriter bufferWritter = new BufferedWriter(fileWritter);
        bufferWritter.write(data);
        bufferWritter.close();
        fileWritter.close();
        System.out.println(localPath + " [write] done");

        return localFilename;
    }

    public static void deleteFile(String filename) throws Exception{
        String localPath = HdfsFileSystem.class.getResource("").getPath();
        File file = new File(localPath+filename);
        if(file.isFile() && file.exists()){
            boolean delete = file.delete();
            System.out.println(file.getPath()+" [delete] "+delete);
        }
    }
}
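
Besides the upload path used by the main program, the check/list/download helpers can be exercised the same way. This is only a sketch; the NameNode address, directory, and file name below are placeholders and must match what actually exists on your cluster.

import java.io.FileOutputStream;

public class HdfsFileSystemDemo {

    public static void main(String[] args) throws Exception {
        String hdfsUri = "hdfs://10.10.10.10:8020";                          //placeholder NameNode address
        String resultFile = "data/result/2019/08/27/20190827141800000.txt";  //placeholder result file

        //list what the main program wrote for that day
        HdfsFileSystem.ls(hdfsUri, "/data/result/2019/08/27/");

        //check for the file and, if present, pull it back to local disk
        if (HdfsFileSystem.exists(hdfsUri, resultFile)) {
            HdfsFileSystem.downloadFromHDFS(hdfsUri, resultFile, new FileOutputStream("result.txt"));
        }
    }
}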

3. Main entry class

  • Arguments are passed directly on the command line when the jar is executed, e.g. java -jar xx.jar arg1 arg2 ...
  • When packaging, point mainClass in the pom file at this class

import org.apache.commons.collections.CollectionUtils;
import java.util.List;
import java.util.Map;

public class QueryDatasApplication {

    public static void main(String[] args) {
        //**********read the SQL passed on the command line**************
        //e.g. java -jar XXX.jar "sql"
        if (args.length < 1) {
            System.err.println("Usage: java -jar xxx.jar \"<hive sql>\"");
            return;
        }
        String sql = args[0];

        try {
            //query Hive
//            String sql = "select name,id_card from hbzfw.t_user";
            List<Map<String, Object>> result = HiveClientUtils.getResult(sql);
            if (CollectionUtils.isEmpty(result)) {
                System.out.println("--------------query hive returned no rows");
                return;
            }

            //write the result to HDFS under a date-based path, e.g. /data/result/2019/08/27/
            String resUri = "/data/result/" + HdfsFileSystem.getCurrentDatePath();

            String content = JacksonUtil.writeValueAsString(result);
            //generate a local temp file
            String filename = HdfsFileSystem.writeFile(content);
            //upload it to HDFS
            HdfsFileSystem.copyFileToHDFSByFileObj(filename, resUri);
            System.out.println("--------------send data to hdfs success");
            //delete the local temp file
            HdfsFileSystem.deleteFile(filename);

        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
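
The main class also calls JacksonUtil.writeValueAsString, a helper that is not listed in this post (it is part of the linked repository). A minimal sketch based on the jackson-databind dependency declared in the pom below could look like this; the actual implementation in the repository may differ.

import com.fasterxml.jackson.databind.ObjectMapper;

public class JacksonUtil {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    //serialize any object (here: the List<Map<String, Object>> query result) to a JSON string
    public static String writeValueAsString(Object value) throws Exception {
        return MAPPER.writeValueAsString(value);
    }
}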

4. Maven configuration for packaging with a local jar dependency

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.aceleeyy</groupId>
    <artifactId>acelee-hive-hdfs-main-jar</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
            <exclusions>
                <exclusion>
                    <artifactId>slf4j-log4j12</artifactId>
                    <groupId>org.slf4j</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-databind -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
            <version>2.9.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-core -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-core</artifactId>
            <version>2.9.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.fasterxml.jackson.core/jackson-annotations -->
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-annotations</artifactId>
            <version>2.9.3</version>
        </dependency>
        <dependency>
            <groupId>com.cloudera</groupId>
            <artifactId>HiveJDBC4</artifactId>
            <version>1.4</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/src/lib/HiveJDBC4.jar</systemPath>
        </dependency>
    </dependencies>


    <build>
        <plugins>
            <!-- maven jar in main class and dependencies start -->
            <plugin>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>QueryDatasApplication</mainClass>
                        </manifest>
                    </archive>
                    <!--<descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>-->
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id> <!-- this is used for inheritance merges -->
                        <phase>package</phase> <!--  bind to the packaging phase  -->
                        <goals>
                            <goal>single</goal>
                        </goals>
                        <!-- additional configuration -->
                        <configuration>
                            <!-- path to the assembly.xml descriptor -->
                            <descriptors>
                                <descriptor>src/assembly/assembly.xml</descriptor>
                            </descriptors>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!--  maven jar in main class and dependencies end -->
        </plugins>
    </build>


</project>

assembly.xml

<assembly>
    <id>jar-with-dependencies</id>
    <formats>
        <format>jar</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <!-- default dependency set -->
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <unpack>true</unpack>
            <scope>runtime</scope>
        </dependencySet>
        <!-- additional dependency set for system-scope dependencies (the local HiveJDBC4 jar) -->
        <dependencySet>
            <outputDirectory>/</outputDirectory>
            <useProjectArtifact>true</useProjectArtifact>
            <unpack>true</unpack>
            <scope>system</scope>
        </dependencySet>
    </dependencySets>
</assembly>

 
