Spring Boot Integration with Hive JDBC
Tags (space separated): hive
Implemented with reference to "Spring Boot with Apache Hive".
Supports multiple data sources.
YARN Configuration
Hive JDBC uses Spark as the execution engine and runs its jobs in YARN mode.
YARN uses the FAIR scheduler. Each Hive job needs at least 2 GB of memory and 2 vCPUs, and the hive queue is capped at 4096 MB / 4 vcores below, so it can run at most 2 jobs concurrently; the maximum connection count in the Druid pool configuration later on is set to match this limit.
fair-scheduler.xml is configured as follows:
<?xml version="1.0" encoding="UTF-8" ?>
<allocations>
    <queue name="default">
        <maxResources>9216 mb,9 vcores</maxResources>
        <maxRunningApps>1</maxRunningApps>
        <minResources>2048 mb,2 vcores</minResources>
        <schedulingPolicy>fair</schedulingPolicy>
        <weight>2</weight>
    </queue>
    <!-- To run multiple jobs, the queue needs slightly more than the minimum resources -->
    <queue name="hive">
        <maxResources>4096 mb,4 vcores</maxResources>
        <maxRunningApps>2</maxRunningApps>
        <minResources>2048 mb,2 vcores</minResources>
        <schedulingPolicy>fair</schedulingPolicy>
        <weight>1</weight>
    </queue>
</allocations>
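For reference, the FAIR scheduler itself is selected in yarn-site.xml and pointed at this allocation file; a minimal sketch (the file path below is an assumption, adjust it to your cluster):

<property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
    <name>yarn.scheduler.fair.allocation.file</name>
    <!-- Assumed location of the fair-scheduler.xml shown above -->
    <value>/etc/hadoop/conf/fair-scheduler.xml</value>
</property>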
Spring Project Configuration
pom.xml
Since the project uses Logback for logging, exclude log4j and the related logging dependencies pulled in by hive-jdbc.
Add spring-boot-starter-jdbc and druid; tomcat-jdbc can also be used as the connection pool.
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>2.3.2</version>
    <exclusions>
        <exclusion>
            <groupId>org.eclipse.jetty.aggregate</groupId>
            <artifactId>*</artifactId>
        </exclusion>
        <exclusion>
            <artifactId>log4j</artifactId>
            <groupId>log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-1.2-api</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-api</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-core</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-slf4j-impl</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>slf4j-log4j12</artifactId>
            <groupId>org.slf4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-web</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
</dependency>
application.yml
hive:
  url: jdbc:hive2://dashuju214:10000,dashuju112:10000,dashuju113:10000/default
  # url: jdbc:hive2://dashuju214:10000/default
  driver-class-name: org.apache.hive.jdbc.HiveDriver
  type: com.alibaba.druid.pool.DruidDataSource
  user: hadoop
  # password: Pure@123
  # Maximum time to wait for a connection, in milliseconds
  maxWait: 60000
  # Interval between checks for idle connections that should be closed, in milliseconds
  timeBetweenEvictionRunsMillis: 60000
  # Minimum time a connection may sit idle in the pool, in milliseconds
  minEvictableIdleTimeMillis: 30000
  # Using count(1) also verifies that the Spark context behind Hive still exists
  validationQuery: select count(1)
  testWhileIdle: true
  # Costs some performance, but prevents query failures caused by dropped Hive connections
  testOnBorrow: true
  testOnReturn: false
  # Enable PSCache and set its size per connection
  poolPreparedStatements: true
  maxPoolPreparedStatementPerConnectionSize: 20
  hiveQueue:
    queue: hive
    # Supplementary pool settings, applied to the data source above
    # Initial, minimum, and maximum pool sizes; these match the YARN queue configuration
    initialSize: 1
    minIdle: 1
    maxActive: 2
In testing, with testOnBorrow enabled, queries consistently execute successfully.
HiveQueueConfig.java
package com.bwjf.schedule.config;

import lombok.Getter;
import lombok.Setter;

@Getter
@Setter
public class HiveQueueConfig {

    private String queue;
    private int initialSize;
    private int minIdle;
    private int maxActive;
}
HiveDruidConfig.java
package com.bwjf.schedule.config;

import com.alibaba.druid.pool.DruidDataSource;
import lombok.Getter;
import lombok.Setter;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.jdbc.core.JdbcTemplate;

import javax.sql.DataSource;
import java.util.Properties;

@Getter
@Setter
@Configuration
@ConfigurationProperties(prefix = "hive")
public class HiveDruidConfig {

    private String url;
    private String user;
    private String password;
    private String driverClassName;
    private int maxWait;
    private int timeBetweenEvictionRunsMillis;
    private int minEvictableIdleTimeMillis;
    private String validationQuery;
    private boolean testWhileIdle;
    private boolean testOnBorrow;
    private boolean testOnReturn;
    private boolean poolPreparedStatements;
    private int maxPoolPreparedStatementPerConnectionSize;
    private HiveQueueConfig hiveQueue;

    @Bean(name = "hiveQueueDataSource")
    public DataSource hiveQueueDataSource() {
        return hiveDruidDatasource(hiveQueue);
    }

    private DruidDataSource hiveDruidDatasource(HiveQueueConfig hiveQueueConfig) {
        DruidDataSource datasource = new DruidDataSource();
        datasource.setUrl(url);
        datasource.setUsername(user);
        if (StringUtils.isNotEmpty(password)) {
            datasource.setPassword(password);
        }
        datasource.setDriverClassName(driverClassName);
        // pool configuration
        datasource.setInitialSize(hiveQueueConfig.getInitialSize());
        datasource.setMinIdle(hiveQueueConfig.getMinIdle());
        datasource.setMaxActive(hiveQueueConfig.getMaxActive());
        Properties properties = new Properties();
        properties.setProperty("hiveconf:spark.yarn.queue", hiveQueueConfig.getQueue());
        datasource.setConnectProperties(properties);
        datasource.setMaxWait(maxWait);
        datasource.setTimeBetweenEvictionRunsMillis(timeBetweenEvictionRunsMillis);
        datasource.setMinEvictableIdleTimeMillis(minEvictableIdleTimeMillis);
        datasource.setValidationQuery(validationQuery);
        datasource.setTestWhileIdle(testWhileIdle);
        datasource.setTestOnBorrow(testOnBorrow);
        datasource.setTestOnReturn(testOnReturn);
        datasource.setPoolPreparedStatements(poolPreparedStatements);
        datasource.setMaxPoolPreparedStatementPerConnectionSize(maxPoolPreparedStatementPerConnectionSize);
        return datasource;
    }

    @Bean(name = "hiveQueueTemplate")
    public JdbcTemplate hiveQueueTemplate(@Qualifier("hiveQueueDataSource") DataSource dataSource) {
        return new JdbcTemplate(dataSource);
    }
}
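The class above exposes a single queue-bound data source, but the same pattern extends to multiple data sources, one per YARN queue. A minimal sketch, assuming a hypothetical second queue block named defaultQueue is added under hive in application.yml; the bean names below are illustrative, not part of the original code:

    // Hypothetical additions inside HiveDruidConfig for a second queue-bound data source.
    // Requires a matching `defaultQueue:` block under `hive:` in application.yml.
    private HiveQueueConfig defaultQueue;

    @Bean(name = "defaultQueueDataSource")
    public DataSource defaultQueueDataSource() {
        // Same shared Druid settings, but jobs are routed to the "default" YARN queue
        return hiveDruidDatasource(defaultQueue);
    }

    @Bean(name = "defaultQueueTemplate")
    public JdbcTemplate defaultQueueTemplate(@Qualifier("defaultQueueDataSource") DataSource dataSource) {
        return new JdbcTemplate(dataSource);
    }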
Usage
@Autowired
@Qualifier("hiveQueueTemplate")
private JdbcTemplate hiveQueueTemplate;
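A query through the queue-bound template then looks like any other JdbcTemplate call; for example (the table and columns here are hypothetical):

// Executed through the hiveQueueDataSource pool, i.e. on the "hive" YARN queue
List<Map<String, Object>> rows = hiveQueueTemplate.queryForList(
        "SELECT dt, count(1) AS cnt FROM ods_event GROUP BY dt");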
## Issues Encountered
BeanPropertyRowMapper mapping fails
By default, Hive returns column names prefixed with the table name (e.g. t.id instead of id), so BeanPropertyRowMapper cannot match them to bean properties; the Hive configuration has to be changed.
See: showing column names without the table name prefix in Hive (hive cli) and Hue.
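One way to get unqualified column names is Hive's hive.resultset.use.unique.column.names property, which can be set to false globally in hive-site.xml. Assuming the same hiveconf: connect-property mechanism used above for the YARN queue also works for this setting, the data source can request it per connection in hiveDruidDatasource():

// Hypothetical extra connect property: ask Hive for plain column names (no table prefix),
// so that BeanPropertyRowMapper can match them to bean properties.
properties.setProperty("hiveconf:hive.resultset.use.unique.column.names", "false");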