一、场景:Spark操作ES数据
环境版本:Spark2.0.*、elasticsearch5.0.0、Scala2.11.*
二、代码实现
工程需添加spark依赖包及elasticsearch-spark-20_2.11-5.0.0.jar;
elasticsearch-spark插件包可以从https://www.elastic.co/downloads/hadoop下载;
package cn.com.git.scala.spark.es
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.log4j.Level
import org.apache.log4j.Logger
object SparkSqlHandleES {

  /**
   * Loads the Elasticsearch "customer/user" index/type as a DataFrame via the
   * elasticsearch-spark connector, queries it with Spark SQL, and prints the
   * results to the console.
   *
   * @param args unused command-line arguments
   */
  def main(args: Array[String]): Unit = {
    // Silence noisy framework logging so only our output reaches the console.
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Initialise the SparkContext in local mode.
    val sc = new SparkContext("local", "es", System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_TEST_JAR")))
    val sqlContext = new SQLContext(sc)

    try {
      // Connection settings for the ES cluster. "pushdown" lets the connector
      // translate Spark SQL filters into native Elasticsearch queries.
      val options = Map("pushdown" -> "true", "es.nodes" -> "10.100.85.198", "es.port" -> "9600")

      // Load the data identified by "index/type" (customer/user).
      val sparkDF = sqlContext.read.format("org.elasticsearch.spark.sql").options(options).load("customer/user")

      // registerTempTable is deprecated since Spark 2.0; use the replacement.
      sparkDF.createOrReplaceTempView("user")
      val userTable = sqlContext.sql("select * from user")
      userTable.show()

      sparkDF.select("name", "phone").collect().foreach(println)
    } finally {
      // Always release the Spark runtime, even if the ES read fails.
      sc.stop()
    }
  }
}
三、执行结果
17/08/02 15:57:48 INFO Version: Elasticsearch Hadoop v5.0.0 [b3b70377fb]
17/08/02 15:57:51 INFO ScalaEsRowRDD: Reading from [customer/user]
[Stage 1:> (0 + 1) / 4]
[Stage 1:==============> (1 + 1) / 4]
[Stage 1:=============================> (2 + 1) / 4]
[Stage 1:============================================> (3 + 1) / 4]
+---+--------+-----------+---+
|age| name| phone|sex|
+---+--------+-----------+---+
| 20|zhangsan|13379873453| M|
| 20| lisi|13379434343| M|
+---+--------+-----------+---+
17/08/02 15:57:55 INFO ScalaEsRowRDD: Reading from [customer/user]
[Stage 2:> (0 + 1) / 5]
[Stage 2:===========> (1 + 1) / 5]
[Stage 2:=======================> (2 + 1) / 5]
[Stage 2:===================================> (3 + 1) / 5]
[Stage 2:===============================================> (4 + 1) / 5]
[zhangsan,13379873453]
[lisi,13379434343]