Spring Boot Integration with Hive JDBC
Tags (space separated): hive
Implemented with reference to "Spring Boot with Apache Hive".
Supports multiple data sources.
YARN Configuration
Hive JDBC uses Spark as the execution engine and runs its jobs in YARN mode.
YARN uses the FAIR scheduler. Each Hive job needs at least 2 GB of memory and 2 vCPUs, and the hive queue is capped at 4096 MB / 4 vcores below, so it can run at most 2 jobs concurrently; the maximum connection count in the Druid pool configuration later on is set to match this limit.
fair-scheduler.xml is configured as follows:
<?xml version="1.0" encoding="UTF-8" ?>
<allocations>
    <queue name="default">
        <maxResources>9216 mb,9 vcores</maxResources>
        <maxRunningApps>1</maxRunningApps>
        <minResources>2048 mb,2 vcores</minResources>
        <schedulingPolicy>fair</schedulingPolicy>
        <weight>2</weight>
    </queue>
    <!-- To run multiple jobs, the queue needs slightly more than the minimum resources -->
    <queue name="hive">
        <maxResources>4096 mb,4 vcores</maxResources>
        <maxRunningApps>2</maxRunningApps>
        <minResources>2048 mb,2 vcores</minResources>
        <schedulingPolicy>fair</schedulingPolicy>
        <weight>1</weight>
    </queue>
</allocations>
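For reference, the FAIR scheduler itself is selected in yarn-site.xml and pointed at this allocation file; a minimal sketch (the file path below is an assumption, adjust it to your cluster):

<property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<property>
    <name>yarn.scheduler.fair.allocation.file</name>
    <!-- Assumed location of the fair-scheduler.xml shown above -->
    <value>/etc/hadoop/conf/fair-scheduler.xml</value>
</property>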
Spring Project Configuration
pom.xml
Since the project uses Logback for logging, exclude log4j and the related logging dependencies pulled in by hive-jdbc.
Add spring-boot-starter-jdbc and druid; tomcat-jdbc can also be used as the connection pool.
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>2.3.2</version>
    <exclusions>
        <exclusion>
            <groupId>org.eclipse.jetty.aggregate</groupId>
            <artifactId>*</artifactId>
        </exclusion>
        <exclusion>
            <artifactId>log4j</artifactId>
            <groupId>log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-1.2-api</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-api</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-core</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-slf4j-impl</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>slf4j-log4j12</artifactId>
            <groupId>org.slf4j</groupId>
        </exclusion>
        <exclusion>
            <artifactId>log4j-web</artifactId>
            <groupId>org.apache.logging.log4j</groupId>
        </exclusion>
    </exclusions>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>druid</artifactId>
</dependency>
application.yml
hive:
  url: jdbc:hive2://dashuju214:10000,dashuju112:10000,dashuju113:10000/default
  # url: jdbc:hive2://dashuju214:10000/default
  driver-class-name: org.apache.hive.jdbc.HiveDriver
  type: com.alibaba.druid.pool.DruidDataSource
  user: hadoop
  # password: Pure@123
  # Maximum time to wait for a connection, in milliseconds
  maxWait: 60000
  # Interval between checks for idle connections that should be closed, in milliseconds
  timeBetweenEvictionRunsMillis: 60000
  # Minimum time a connection may sit idle in the pool, in milliseconds
  minEvictableIdleTimeMillis: 30000
  # Using count(1) also verifies that the Spark context behind Hive still exists
  validationQuery: select count(1)
  testWhileIdle: true
  # Costs some performance, but prevents query failures caused by dropped Hive connections
  testOnBorrow: true
  testOnReturn: false
  # Enable PSCache and set its size per connection
  poolPreparedStatements: true
  maxPoolPreparedStatementPerConnectionSize: 20
  hiveQueue:
    queue: hive
    # Supplementary pool settings, applied to the data source above
    # Initial, minimum, and maximum pool sizes; these match the YARN queue configuration
    initialSize: 1
    minIdle: 1
    maxActive: 2
In testing, with testOnBorrow enabled, queries consistently execute successfully.
HiveQueueConfig.java
package com.bwjf.schedule.config;

import lombok.Getter;
import lombok.Setter;

@Getter
@Setter
public class HiveQueueConfig {

    private String queue;
    private int initialSize;
    private int minIdle;
    private int maxActive;
}
HiveDruidConfig.java
package com.bwjf.schedule.config;

import com.alibaba.druid.pool.DruidDataSource;
import lombok.Getter;
import lombok.Setter;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.jdbc.core.JdbcTemplate;

import javax.sql.DataSource;
import java.util.Properties;

@Getter
@Setter
@Configuration
@ConfigurationProperties(prefix = "hive")
public class HiveDruidConfig {

    private String url;
    private String user;
    private String password;
    private String driverClassName;
    private int maxWait;
    private int timeBetweenEvictionRunsMillis;
    private int minEvictableIdleTimeMillis;
    private String validationQuery;
    private boolean testWhileIdle;
    private boolean testOnBorrow;
    private boolean testOnReturn;
    private boolean poolPreparedStatements;
    private int maxPoolPreparedStatementPerConnectionSize;
    private HiveQueueConfig hiveQueue;

    @Bean(name = "hiveQueueDataSource")
    public DataSource hiveQueueDataSource() {
        return hiveDruidDatasource(hiveQueue);
    }

    private DruidDataSource hiveDruidDatasource(HiveQueueConfig hiveQueueConfig) {
        DruidDataSource datasource = new DruidDataSource();
        datasource.setUrl(url);
        datasource.setUsername(user);
        if (StringUtils.isNotEmpty(password)) {
            datasource.setPassword(password);
        }
        datasource.setDriverClassName(driverClassName);
        // pool configuration
        datasource.setInitialSize(hiveQueueConfig.getInitialSize());
        datasource.setMinIdle(hiveQueueConfig.getMinIdle());
        datasource.setMaxActive(hiveQueueConfig.getMaxActive());
        Properties properties = new Properties();
        properties.setProperty("hiveconf:spark.yarn.queue", hiveQueueConfig.getQueue());
        datasource.setConnectProperties(properties);
        datasource.setMaxWait(maxWait);
        datasource.setTimeBetweenEvictionRunsMillis(timeBetweenEvictionRunsMillis);
        datasource.setMinEvictableIdleTimeMillis(minEvictableIdleTimeMillis);
        datasource.setValidationQuery(validationQuery);
        datasource.setTestWhileIdle(testWhileIdle);
        datasource.setTestOnBorrow(testOnBorrow);
        datasource.setTestOnReturn(testOnReturn);
        datasource.setPoolPreparedStatements(poolPreparedStatements);
        datasource.setMaxPoolPreparedStatementPerConnectionSize(maxPoolPreparedStatementPerConnectionSize);
        return datasource;
    }

    @Bean(name = "hiveQueueTemplate")
    public JdbcTemplate hiveQueueTemplate(@Qualifier("hiveQueueDataSource") DataSource dataSource) {
        return new JdbcTemplate(dataSource);
    }
}
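The class above exposes a single queue-bound data source, but the same pattern extends to multiple data sources, one per YARN queue. A minimal sketch, assuming a hypothetical second queue block named defaultQueue is added under hive in application.yml; the bean names below are illustrative, not part of the original code:

    // Hypothetical additions inside HiveDruidConfig for a second queue-bound data source.
    // Requires a matching `defaultQueue:` block under `hive:` in application.yml.
    private HiveQueueConfig defaultQueue;

    @Bean(name = "defaultQueueDataSource")
    public DataSource defaultQueueDataSource() {
        // Same shared Druid settings, but jobs are routed to the "default" YARN queue
        return hiveDruidDatasource(defaultQueue);
    }

    @Bean(name = "defaultQueueTemplate")
    public JdbcTemplate defaultQueueTemplate(@Qualifier("defaultQueueDataSource") DataSource dataSource) {
        return new JdbcTemplate(dataSource);
    }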
Usage
@Autowired
@Qualifier("hiveQueueTemplate")
private JdbcTemplate hiveQueueTemplate;
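A query through the queue-bound template then looks like any other JdbcTemplate call; for example (the table and columns here are hypothetical):

// Executed through the hiveQueueDataSource pool, i.e. on the "hive" YARN queue
List<Map<String, Object>> rows = hiveQueueTemplate.queryForList(
        "SELECT dt, count(1) AS cnt FROM ods_event GROUP BY dt");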
## Issues Encountered
BeanPropertyRowMapper mapping fails
By default, Hive returns column names prefixed with the table name (e.g. t.id instead of id), so BeanPropertyRowMapper cannot match them to bean properties; the Hive configuration has to be changed.
See: showing column names without the table name prefix in Hive (hive cli) and Hue.
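One way to get unqualified column names is Hive's hive.resultset.use.unique.column.names property, which can be set to false globally in hive-site.xml. Assuming the same hiveconf: connect-property mechanism used above for the YARN queue also works for this setting, the data source can request it per connection in hiveDruidDatasource():

// Hypothetical extra connect property: ask Hive for plain column names (no table prefix),
// so that BeanPropertyRowMapper can match them to bean properties.
properties.setProperty("hiveconf:hive.resultset.use.unique.column.names", "false");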