SparkSQL reading Elasticsearch table data
1 Versions
Spark 2.3.2, Elasticsearch 5.3.3, Scala 2.11
2 Pom.xml dependencies (partial)
<dependency>
  <groupId>org.elasticsearch</groupId>
  <artifactId>elasticsearch-spark-20_2.11</artifactId>
  <version>5.3.3</version>
</dependency>
<dependency>
  <groupId>org.scala-lang</groupId>
  <artifactId>scala-library</artifactId>
  <version>${scala.version}</version>
</dependency>
<dependency>
  <groupId>junit</groupId>
  <artifactId>junit</artifactId>
  <version>4.11</version>
  <scope>test</scope>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_${scala.spark.version}</artifactId>
  <version>${spark.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.11</artifactId>
  <version>${spark.version}</version>
</dependency>
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming_2.11</artifactId>
  <version>${spark.version}</version>
  <exclusions>
    <exclusion>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
    </exclusion>
    <exclusion>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
    </exclusion>
  </exclusions>
</dependency>
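The pom above references Maven properties that are not shown here. A possible <properties> block matching the versions in section 1 could look like the following (the property names and exact patch versions are assumptions):
<properties>
  <scala.version>2.11.12</scala.version>
  <scala.spark.version>2.11</scala.spark.version>
  <spark.version>2.3.2</spark.version>
</properties>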
3 Reading and writing ES with Spark SQL
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark.sql._

object App extends App {
  println("Hello World!")
  val conf = new SparkConf().setAppName("app1").setMaster("local[2]")
  conf.set("es.index.auto.create", "true") // let the connector create the ES index automatically
  conf.set("es.nodes", "192.168.220.128")  // ES host(s) Spark connects to
  conf.set("es.port", "9200")              // ES HTTP port
  conf.set("es.nodes.wan.only", "true")    // talk only to the declared nodes (WAN / restricted network)
  val spark = SparkSession.builder().config(conf).getOrCreate()

  // read the Elasticsearch session table as a DataFrame; the schema is inferred from the index mapping
  val essessionDataFrame = spark.read
    .format("org.elasticsearch.spark.sql")
    .load("index/type")
  essessionDataFrame.createOrReplaceTempView("sessionTable")
  essessionDataFrame.show()

  // write the DataFrame back to Elasticsearch
  essessionDataFrame.saveToEs("index/type")
  spark.stop()
  println("over....")
}
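Because the DataFrame is registered as the temp view sessionTable, it can also be queried with plain Spark SQL. A minimal sketch, placed inside the same App object before spark.stop():
// query the temporary view registered above
val first10 = spark.sql("SELECT * FROM sessionTable LIMIT 10")
first10.show()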
4 Reading and writing Elasticsearch with RDDs
Reference: 阿粒_lxf, https://www.jianshu.com/p/8cf8b89b06e6 (source: Jianshu)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.elasticsearch.spark._

object LoadElasticsearchData {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf()
        .setAppName("e2e.computing.test")
        .setMaster("local[*]")
        .set("spark.cassandra.connection.host", "192.168.14.141")
        //.set("es.nodes", "192.168.14.140")
        //192.168.7.130:9200
        .set("es.nodes", "192.168.7.130")
        .set("es.port", "9200")
        .set("es.index.auto.create", "true")
        .set("es.mapping.date.rich", "false")
    )
    // read ES as an RDD; query holds the query string, and elasticsearch-spark reads all documents by default
    val query =
      s"""
         |{
         |  "query": {
         |    "match_all": {}
         |  }
         |}
      """.stripMargin
    val esRdd = sc.esRDD(s"index/type", query)
  }
}
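esRDD returns an RDD of (documentId, fieldMap) pairs. A minimal sketch of inspecting the result inside the same main method (purely illustrative):
// each element is (document _id, Map of field name -> value)
esRdd.take(10).foreach { case (id, fields) =>
  println(s"doc $id -> $fields")
}
println(s"documents read: ${esRdd.count()}")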
Writing to ES from a Spark RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

object SaveElasticsearch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf()
        .setAppName("e2e.computing.test")
        .setMaster("local[*]")
        .set("spark.cassandra.connection.host", "192.168.14.141")
        //.set("es.nodes", "192.168.14.140")
        //192.168.7.130:9200
        .set("es.nodes", "192.168.7.130")
        .set("es.port", "9200")
        .set("es.index.auto.create", "true")
        .set("es.mapping.date.rich", "false")
    )
    val airports = Map("OTP" -> "Otopeni", "SFO" -> "San Fran")
    // the index/type is created automatically if it does not exist (es.index.auto.create=true)
    sc.makeRDD(Seq(airports)).saveToEs("index/type")
  }
}
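Besides maps, an RDD of case classes can be written the same way. A minimal sketch, assuming a hypothetical Trip case class and using its id field as the document _id via es.mapping.id (the fields and the "index/type" resource are placeholders):
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

// hypothetical case class used only for this example
case class Trip(id: String, departure: String, arrival: String)

object SaveCaseClasses {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf()
        .setAppName("save-case-classes")
        .setMaster("local[*]")
        .set("es.nodes", "192.168.7.130")
        .set("es.port", "9200")
        .set("es.index.auto.create", "true")
    )
    val trips = Seq(Trip("1", "OTP", "SFO"), Trip("2", "MUC", "OTP"))
    // es.mapping.id tells the connector to use the "id" field as the Elasticsearch document _id
    sc.makeRDD(trips).saveToEs("index/type", Map("es.mapping.id" -> "id"))
  }
}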