Add the Maven dependencies

All of the Spark artifacts below are built for Scala 2.11 against Spark 2.4.3, and elasticsearch-spark-20_2.11 likewise targets Spark 2.x with Scala 2.11, so keep the Scala version consistent across them.
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-mllib_2.11</artifactId>
    <version>2.4.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.58</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.elasticsearch/elasticsearch-spark-20 -->
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-spark-20_2.11</artifactId>
    <version>7.2.0</version>
</dependency>
Reading data from Elasticsearch into Spark
import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

object ESToSpark {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("hello world").setMaster("local[*]")
    conf.set("es.index.auto.create", "true")
    conf.set("es.nodes", "127.0.0.1")
    conf.set("es.port", "9200")
    val sc = new SparkContext(conf)

    // Term query: match documents whose name field is exactly "鲁仲连"
    val query: String =
      """{
        "query": {
          "term": {
            "name": {
              "value": "鲁仲连"
            }
          }
        }
      }"""

    // esRDD returns an RDD of (documentId, Map[fieldName, value]) pairs
    val rdd = sc.esRDD("phonebills", query)
    rdd.collect().foreach(println)
    println(s"count: ${rdd.count()}")
    sc.stop()
  }
}
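The same read can also be expressed through the DataFrame API: elasticsearch-spark registers a Spark SQL data source. A minimal sketch, assuming the same local cluster and the phonebills index (the filter on name is just for illustration):

import org.apache.spark.sql.SparkSession

object ESToSparkSQL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("es-dataframe")
      .config("es.nodes", "127.0.0.1")
      .config("es.port", "9200")
      .getOrCreate()

    // Load the index as a DataFrame via the elasticsearch-spark SQL data source
    val df = spark.read.format("org.elasticsearch.spark.sql").load("phonebills")
    df.filter(df("name") === "鲁仲连").show()
    spark.stop()
  }
}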
Writing data from Spark to Elasticsearch
1. First create the index in ES and define its fields. The table below shows how Spark types map to ES field types:
Spark type | ES type
---|---
String | text / keyword
Long | long
Integer | integer
String / Long | date
Double | double
Int | integer
Note that when writing a date field to ES, you must create the index up front and specify the format in the mapping; otherwise the value will be indexed as text:
PUT xxx
{
  "mappings": {
    "properties": {
      "a": { "type": "keyword" },
      "b": { "type": "keyword" },
      "c": { "type": "long" },
      "time": {
        "type": "date",
        "format": "yyyy-MM-dd HH:mm:ss"
      },
      "d": { "type": "text" },
      "e": { "type": "keyword" },
      "f": { "type": "text" },
      "g": { "type": "keyword" }
    }
  }
}
2. Then build a DataFrame and write it to the index:

import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark.sql._

// Record type matching the mapping above (fields a..g plus time)
case class DbLog(a: String, b: String, c: Long, time: String,
                 d: String, e: String, f: String, g: String)

val spark = SparkSession.builder()
  .master("local[8]")
  .config("es.index.auto.create", "true")
  .config("es.nodes", "127.0.0.1")
  .config("es.port", "9200")
  .appName("log")
  .getOrCreate()
val sc = spark.sparkContext

// In practice, parse the source file into DbLog records, e.g.:
// val log = sc.textFile(path).map(line => DbLog(...))
// A single hypothetical record for illustration:
val log = sc.makeRDD(Seq(DbLog("a1", "b1", 1L, "2019-07-01 12:00:00", "d1", "e1", "f1", "g1")))

val rlog = spark.createDataFrame(log)
rlog.saveToEs("dblog") // saveToEs on a DataFrame comes from org.elasticsearch.spark.sql._
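For completeness, an RDD can also be written directly, without building a DataFrame first: with org.elasticsearch.spark._ in scope, saveToEs is available on RDDs of Maps (or case classes). A minimal sketch, assuming the same local cluster and a hypothetical rdd_demo index:

import org.apache.spark.{SparkConf, SparkContext}
import org.elasticsearch.spark._

object RDDToES {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("rdd-to-es").setMaster("local[*]")
    conf.set("es.index.auto.create", "true")
    conf.set("es.nodes", "127.0.0.1")
    conf.set("es.port", "9200")
    val sc = new SparkContext(conf)

    // Each Map becomes one document; its keys become field names
    val docs = Seq(
      Map("a" -> "a1", "c" -> 1L),
      Map("a" -> "a2", "c" -> 2L)
    )
    sc.makeRDD(docs).saveToEs("rdd_demo")
    sc.stop()
  }
}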