Spring boot 集成 Hive JDBC

Spring boot 集成 Hive JDBC

标签(空格分隔): hive


参考Spring boot with Apache Hive 实现
支持多数据源

YARN 配置

Hive JDBC 使用 Spark 引擎,使用 YARN 模式执行任务
YARN 使用 FAIR 调度,Hive 队列最少需要 2G 内存 + 2CPU,因此 Hive 队列只能执行 2 个任务,在后续Druid连接池配置中的最大连接数和此配置对应
fair-scheduler.xml配置如下:

<?xml version="1.0" encoding="UTF-8" ?>
<allocations>
  <queue name="default">
    <maxResources>9216 mb,9 vcores</maxResources>
    <maxRunningApps>1</maxRunningApps>
    <minResources>2048 mb,2 vcores</minResources>
    <schedulingPolicy>fair</schedulingPolicy>
    <weight>2</weight>
  </queue>
  <!-- 运行多个任务时,需要比最小资源多一点 -->
  <queue name="hive">
    <maxResources>4096 mb,4 vcores</maxResources>
    <maxRunningApps>2</maxRunningApps>
    <minResources>2048 mb,2 vcores</minResources>
    <schedulingPolicy>fair</schedulingPolicy>
    <weight>1</weight>
  </queue>
</allocations>

Spring 项目配置

pom.xml

因为公司项目使用logback作为日志插件,去掉log4j等相关引用
引入spring-boot-starter-jdbc和druid,连接池也可以使用tomcat-jdbc

<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>2.3.2</version>
    <exclusions>
        <exclusion>
            <groupId>org.eclipse.jetty.aggregate</groupId>
            <artifactId>*</artifactId>
        </exclusion>
        <exclusion>
            <artifactId>log4j</artifactId>
            <groupId>log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-1.2-api</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-api</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-core</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-slf4j-impl</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>slf4j-log4j12</artifactId>
            <groupId>org.slf4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-web</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
    </exclusions>
</dependency>

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
</dependency>

application.yml

hive:
  url: jdbc:hive2://dashuju214:10000,dashuju112:10000,dashuju113:10000/default
#  url: jdbc:hive2://dashuju214:10000/default
  driver-class-name: org.apache.hive.jdbc.HiveDriver
  type: com.alibaba.druid.pool.DruidDataSource
  user: hadoop
#  password: Pure@123
  # 配置获取连接等待超时的时间
  maxWait: 60000
  # 配置间隔多久才进行一次检测,检测需要关闭的空闲连接,单位是毫秒
  timeBetweenEvictionRunsMillis: 60000
  # 配置一个连接在池中最小生存的时间,单位是毫秒
  minEvictableIdleTimeMillis: 30000
  #使用count(1)可以检查hive中Spark Content是否存在
  validationQuery: select count(1)
  testWhileIdle: true
  # 开启后影响性能,可以防止hive丢失连接导致的查询失败
  testOnBorrow: true
  testOnReturn: false
  # 打开PSCache,并且指定每个连接上PSCache的大小
  poolPreparedStatements: true
  maxPoolPreparedStatementPerConnectionSize: 20
  hiveQueue:
    queue: hive
    # 下面为连接池的补充设置,应用到上面所有数据源中
    # 初始化大小,最小,最大,和yarn中的配置对应
    initialSize: 1
    minIdle: 1
    maxActive: 2

经过测试,testOnBorrow开启后,能够保证查询一定执行成功。

HiveQueueConfig.java

package com.bwjf.schedule.config;

import lombok.Getter;
import lombok.Setter;

@Getter
@Setter
public class HiveQueueConfig {
    // Name of the YARN queue that Hive/Spark jobs from this pool are submitted to
    private String queue;
    // Druid pool sizing for this queue; should mirror the YARN fair-scheduler
    // limits (maxRunningApps / minResources) so the pool never requests more
    // concurrent sessions than the queue can run
    private int initialSize;
    private int minIdle;
    private int maxActive;
}

HiveDruidConfig.java

package com.bwjf.schedule.config;

import com.alibaba.druid.pool.DruidDataSource;
import lombok.Getter;
import lombok.Setter;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.jdbc.core.JdbcTemplate;

import javax.sql.DataSource;
import java.util.Properties;

/**
 * Binds the {@code hive.*} section of application.yml and exposes a Druid
 * connection pool plus a matching {@link JdbcTemplate} for the Hive YARN queue.
 *
 * <p>Pool sizing comes from the nested {@code hive.hiveQueue} block and is
 * expected to stay in sync with the YARN fair-scheduler configuration, since
 * each pooled connection can hold a running Spark application on the queue.
 */
@Getter
@Setter
@Configuration
@ConfigurationProperties(prefix = "hive")
public class HiveDruidConfig {

    // --- JDBC connection settings (hive.url, hive.user, ...) ---
    private String url;
    private String user;
    private String password;
    private String driverClassName;

    // --- Druid pool tuning shared by all queues ---
    private int maxWait;
    private int timeBetweenEvictionRunsMillis;
    private int minEvictableIdleTimeMillis;
    private String validationQuery;
    private boolean testWhileIdle;
    private boolean testOnBorrow;
    private boolean testOnReturn;
    private boolean poolPreparedStatements;
    private int maxPoolPreparedStatementPerConnectionSize;

    // Per-queue settings (hive.hiveQueue.*): queue name and pool sizing
    private HiveQueueConfig hiveQueue;

    /** DataSource whose sessions submit Spark jobs to the configured Hive queue. */
    @Bean(name = "hiveQueueDataSource")
    public DataSource hiveQueueDataSource() {
        return buildDruidDataSource(hiveQueue);
    }

    /**
     * Assembles a Druid pool for one YARN queue, combining the shared JDBC and
     * validation settings with the queue-specific sizing.
     */
    private DruidDataSource buildDruidDataSource(HiveQueueConfig queueConfig) {
        final DruidDataSource ds = new DruidDataSource();

        // Connection settings
        ds.setUrl(url);
        ds.setUsername(user);
        // Hive is often run without authentication; only set a password when configured
        if (StringUtils.isNotEmpty(password)) {
            ds.setPassword(password);
        }
        ds.setDriverClassName(driverClassName);

        // Pool sizing — keep consistent with the YARN fair-scheduler queue limits
        ds.setInitialSize(queueConfig.getInitialSize());
        ds.setMinIdle(queueConfig.getMinIdle());
        ds.setMaxActive(queueConfig.getMaxActive());

        // Route each session's Spark jobs to the target YARN queue via a hiveconf override
        final Properties connectProps = new Properties();
        connectProps.setProperty("hiveconf:spark.yarn.queue", queueConfig.getQueue());
        ds.setConnectProperties(connectProps);

        // Validation and idle-eviction behaviour
        ds.setMaxWait(maxWait);
        ds.setTimeBetweenEvictionRunsMillis(timeBetweenEvictionRunsMillis);
        ds.setMinEvictableIdleTimeMillis(minEvictableIdleTimeMillis);
        ds.setValidationQuery(validationQuery);
        ds.setTestWhileIdle(testWhileIdle);
        ds.setTestOnBorrow(testOnBorrow);
        ds.setTestOnReturn(testOnReturn);

        // PreparedStatement cache (PSCache)
        ds.setPoolPreparedStatements(poolPreparedStatements);
        ds.setMaxPoolPreparedStatementPerConnectionSize(maxPoolPreparedStatementPerConnectionSize);
        return ds;
    }

    /** JdbcTemplate bound to the Hive-queue data source; inject via @Qualifier("hiveQueueTemplate"). */
    @Bean(name = "hiveQueueTemplate")
    public JdbcTemplate hiveQueueTemplate(@Qualifier("hiveQueueDataSource") DataSource dataSource) {
        return new JdbcTemplate(dataSource);
    }

}

调用

    @Autowired
    @Qualifier("hiveQueueTemplate")
    private JdbcTemplate hiveQueueTemplate;

## 遇到的问题

BeanPropertyRowMapper映射失败

因为默认hive查询返回的列名中包含表名导致映射失败,需要修改hive配置
Hue中hive(hive cli)中显示列名,不带表名

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值