文章目录
前言
CDH6 + IMPALA
流程:本地数据 -> 上传到 HDFS -> 导入 IMPALA
Project
Application.yml
# Import settings; the keys must be nested under "import-config" so that
# @ConfigurationProperties(prefix = "import-config") can bind them.
import-config:
  csv-tmp-path: D:/test/tmp
  impala-url: jdbc:impala://x.x.x.x:21050/default
  impala-user: hue
  impala-password: hue
  hdfs-uri: hdfs://x.x.x.x:8020
  hdfs-user: hue
  hdfs-tmp-path: /home/data/tmp
Core-code
配置类
/**
* <b><code>ImportConfig</code></b>
* <p/>
* Settings for the data-import feature, bound by Spring from the
* configuration properties that start with the {@code import-config} prefix.
* Accessors are generated by Lombok's {@code @Data}.
* <p/>
* <b>Creation Time:</b> 2019/6/12 16:58.
*
* @author Hu-Weihui
*/
@Component
@Data
@ConfigurationProperties(prefix = "import-config")
public class ImportConfig {
// Local temporary directory for CSV files
// (presumably where uploads are stored before the HDFS copy - TODO confirm against saveToLocal).
private String csvTmpPath;
// JDBC URL used to connect to Impala.
private String impalaUrl;
// User name for the Impala JDBC connection.
private String impalaUser;
// Password for the Impala JDBC connection.
private String impalaPassword;
// User name under which the HDFS FileSystem is obtained.
private String hdfsUser;
// URI of the HDFS file system.
private String hdfsUri;
// HDFS directory that receives the uploaded file before it is loaded into Impala.
private String hdfsTmpPath;
}
数据导入
关键点:
1.APPEND/OVERWRITE我是自定义的枚举类用于对应IMPALA的追加OR覆盖
2.执行完LOAD DATA 命令后,IMPALA一定要执行REFRESH [TABLE]操作
3.很多坑爹博客会让你用HIVE执行,注意用HIVE-JDBC执行REFRESH [TABLE]会报错,无法识别
4.去CDH下载IMPALA的JDBC驱动包
/**
 * Imports a client-uploaded file into an Impala table.
 * <p>
 * Steps: (1) store the upload on the local disk via {@code saveToLocal},
 * (2) copy it into the configured HDFS temporary directory,
 * (3) run {@code LOAD DATA INPATH} followed by {@code REFRESH} through the Impala JDBC driver.
 *
 * @param tableName     target table; only {@code table} or {@code db.table} made of letters,
 *                      digits and underscores is accepted, because the name is concatenated
 *                      into the SQL text
 * @param updateMethod  APPEND/OVERWRITE code; the table is overwritten only when the value
 *                      equals {@code UpdateMethod.OVERRIDE.getCode()}, any other value
 *                      (including {@code null}) appends
 * @param multipartFile the file uploaded by the client
 * @throws DataManagementException if the table name or file path is unsafe, the HDFS upload
 *                                 fails, or the Impala statements fail
 */
@Override
public void importImpalaData(String tableName, String updateMethod, MultipartFile multipartFile) {
    // Identifiers cannot be bound as JDBC parameters, so the client-supplied table name is
    // whitelisted before it is ever placed into a statement.
    if (tableName == null || !tableName.matches("[A-Za-z0-9_]+(\\.[A-Za-z0-9_]+)?")) {
        log.error("illegal impala table name : " + tableName);
        throw new DataManagementException("导入数据失败");
    }

    // 1. Save the CSV on the local machine / local server.
    File localFile = saveToLocal(multipartFile);
    String hdfsDstPath = importConfig.getHdfsTmpPath() + "/" + localFile.getName();

    // The path ends up inside a single-quoted SQL literal; refuse characters that could
    // terminate or escape that literal.
    if (hdfsDstPath.indexOf('\'') >= 0 || hdfsDstPath.indexOf('\\') >= 0) {
        log.error("illegal hdfs path for LOAD DATA : " + hdfsDstPath);
        throw new DataManagementException("导入数据失败");
    }

    // 2. Upload the local file to HDFS.
    copyLocalFileToHdfs(localFile.getPath(), hdfsDstPath);

    // 3. Load the HDFS file into Impala. Constant-first equals avoids an NPE on a null updateMethod.
    boolean overwrite = UpdateMethod.OVERRIDE.getCode().equals(updateMethod);
    loadHdfsFileIntoImpala(hdfsDstPath, tableName, overwrite);
}

/**
 * Copies a local file into HDFS, creating the configured temporary directory when it is missing.
 *
 * @param localFilePath path of the file on the local disk
 * @param hdfsDstPath   destination path of the file in HDFS
 * @throws DataManagementException if the HDFS URI is malformed or the copy fails
 */
private void copyLocalFileToHdfs(String localFilePath, String hdfsDstPath) {
    Path srcPath = new Path(localFilePath);
    Path dstPath = new Path(hdfsDstPath);
    Path hdfsDir = new Path(importConfig.getHdfsTmpPath());
    FileSystem fileSystem = null;
    try {
        Configuration configuration = new Configuration();
        URI hdfsUri = new URI(importConfig.getHdfsUri());
        fileSystem = FileSystem.get(hdfsUri, configuration, importConfig.getHdfsUser());
        if (!fileSystem.exists(hdfsDir)) {
            fileSystem.mkdirs(hdfsDir);
        }
        fileSystem.copyFromLocalFile(srcPath, dstPath);
    } catch (URISyntaxException e) {
        log.error("the uri have some error :", e);
        throw new DataManagementException("上传到数据失败");
    } catch (IOException e) {
        log.error("con not get FileSystem :", e);
        throw new DataManagementException("上传到数据失败");
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers further up the stack can still observe it.
        Thread.currentThread().interrupt();
        log.error("InterruptedException :", e);
        throw new DataManagementException("上传到数据失败");
    } finally {
        // Release the handle obtained above; a close failure is logged but does not
        // turn a successful upload into an error.
        // NOTE(review): assumes this FileSystem instance is not shared with other callers - confirm.
        if (fileSystem != null) {
            try {
                fileSystem.close();
            } catch (IOException e) {
                log.error(" can not close FileSystem: ", e);
            }
        }
    }
}

/**
 * Runs {@code LOAD DATA INPATH} for the given HDFS file and then {@code REFRESH} on the table,
 * using the Impala JDBC driver and the configured connection settings.
 *
 * @param hdfsDstPath HDFS path of the file to load (already checked for quote characters)
 * @param tableName   target table (already validated by the caller)
 * @param overwrite   {@code true} to use {@code OVERWRITE INTO TABLE}, {@code false} to append
 * @throws DataManagementException if the driver cannot be loaded or a statement fails
 */
private void loadHdfsFileIntoImpala(String hdfsDstPath, String tableName, boolean overwrite) {
    Connection connection = null;
    Statement statement = null;
    try {
        String url = importConfig.getImpalaUrl();
        String user = importConfig.getImpalaUser();
        String password = importConfig.getImpalaPassword();
        Class.forName("com.cloudera.impala.jdbc41.Driver");
        connection = DriverManager.getConnection(url, user, password);

        // load data from hdfs
        String loadSql = "LOAD DATA INPATH '" + hdfsDstPath + "' "
                + (overwrite ? "OVERWRITE " : "")
                + "INTO TABLE " + tableName;
        statement = connection.createStatement();
        statement.execute(loadSql);

        // refresh the impala table
        statement.execute("REFRESH " + tableName);
    } catch (ClassNotFoundException e) {
        log.error("load impala driver class fail :", e);
        throw new DataManagementException("导入数据失败");
    } catch (SQLException e) {
        log.error("can not to load hdfs data into impala :", e);
        throw new DataManagementException("导入数据失败");
    } finally {
        // Best-effort cleanup: close failures are logged, never rethrown.
        if (statement != null) {
            try {
                statement.close();
            } catch (SQLException e) {
                log.error(" can not close statement: ", e);
            }
        }
        if (connection != null) {
            try {
                connection.close();
            } catch (SQLException e) {
                log.error(" can not close connection: ", e);
            }
        }
    }
}
爬坑日志
下载IMPALA驱动包
去官方网站
CDH官网:https://www.cloudera.com/
DOWNLOAD-> 下拉找到Database Drivers -> Impala JDBC Driver Downloads
下载地址:https://www.cloudera.com/downloads/connectors/impala/jdbc/2-6-12.html
引入外部JAR包,并在打包时把外部依赖JAR包一并打入
<!-- Create a lib directory in the project root and copy the driver jar into it. -->
<!-- NOTE(review): systemPath below resolves to ../lib relative to this module's basedir; confirm it matches where lib was created. -->
<dependency>
<groupId>com.cloudera.impala</groupId>
<artifactId>jdbc</artifactId>
<version>2.6.12</version>
<scope>system</scope>
<systemPath>${project.basedir}/../lib/ImpalaJDBC41.jar</systemPath>
</dependency>
<plugin>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-maven-plugin</artifactId>
    <configuration>
        <fork>true</fork>
        <finalName>venus-gzzc-bi-be</finalName>
        <mainClass>com.richstonedt.ht.gzzc.Application</mainClass>
        <!-- Key setting: packages system-scope (external) dependencies into the jar -->
        <includeSystemScope>true</includeSystemScope>
    </configuration>
    <executions>
        <execution>
            <goals>
                <goal>repackage</goal>
            </goals>
        </execution>
    </executions>
</plugin>
使用IMPALA-URL连接而不是HIVE
// Connect with the Impala JDBC driver and a jdbc:impala:// URL rather than the Hive JDBC driver;
// per the notes in this article, REFRESH is not recognized when executed through Hive JDBC.
Class.forName("com.cloudera.impala.jdbc41.Driver");
String url = "jdbc:impala://x.x.x.x:21050/default";
connection = DriverManager.getConnection(url, user, password);
生产环境请看
集成了kerberos并踩了很多坑
https://blog.csdn.net/HuHui_/article/details/94741104
Author
作者:HuHui
转载:欢迎一起讨论web和大数据问题,转载请注明作者和原文链接,感谢