Maven pom.xml dependencies
<!-- Shared build properties (compiler level, encoding, dependency versions) -->
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.8</scala.version>
<spark.version>2.2.0</spark.version>
<hadoop.version>2.7.1</hadoop.version>
<scala.compat.version>2.11</scala.compat.version>
</properties>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>1.2.1</version>
</dependency>
<dependency>
<groupId>com.cloudera</groupId>
<artifactId>ImpalaJDBC4</artifactId>
<version>2.5.42</version>
</dependency>
</dependencies>
</project>
Spark code
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.{SparkSession}
/** Reads a small sample from an Impala table over JDBC using Spark and prints it.
  *
  * Requires the Cloudera Impala JDBC4 driver on the classpath (see the pom
  * fragment above). Replace the placeholder host/port and credentials before
  * running. NOTE(review): if the hive-jdbc and ImpalaJDBC4 artifacts clash,
  * adjust the versions in the pom — presumably that is what the original
  * "jdbc version conflict" note referred to; confirm against your cluster.
  */
object spark2impala {
  def main(args: Array[String]): Unit = {
    // Silence Spark's verbose INFO logging; keep warnings and errors.
    Logger.getLogger("org").setLevel(Level.WARN)

    // Local SparkSession for this standalone example.
    val spark = SparkSession.builder()
      .appName("spark2impala")
      .master("local[*]")
      .getOrCreate()

    try {
      // Impala JDBC endpoint; AuthMech=3 selects user/password authentication.
      // "ip:端口" is a placeholder for the real host:port.
      val impalaUrl = "jdbc:impala://ip:端口;AuthMech=3"
      val impalaDriver = "com.cloudera.impala.jdbc4.Driver"

      // Register a dialect that leaves identifiers unquoted. Spark's default
      // dialect quotes column names, which the Impala server rejects when the
      // JDBC client and server versions differ — queries then fail.
      JdbcDialects.registerDialect(new JdbcDialect() {
        override def canHandle(url: String): Boolean =
          url.startsWith("jdbc:impala") || url.contains("impala")
        override def quoteIdentifier(colName: String): String = colName
      })

      // "dbtable" also accepts a derived table: "(subquery) alias".
      // This pushes the LIMIT down to Impala instead of pulling the whole table.
      val query = "(select id from stg_data.a1 limit 20) a"

      val frame = spark.read.format("jdbc")
        .option("url", impalaUrl)
        .option("driver", impalaDriver)
        .option("user", "用户")       // placeholder: real username
        .option("password", "密码")   // placeholder: real password
        .option("dbtable", query)
        .load()

      frame.show(10)
      // frame.write.option("header", "true").csv("test.csv")
    } finally {
      // Always release Spark resources, even if the JDBC read fails.
      spark.stop()
    }
  }
}