一 需求描述
使用Spark读取本地Json文件,将读出的电影数据写入到Es中
按照电影的Id进行分组统计,统计出每个电影的平均分,升序
二 依赖
<!--es 相关依赖开始-->
<dependency>
<groupId>io.searchbox</groupId>
<artifactId>jest</artifactId>
<version>6.3.1</version>
</dependency>
<dependency>
<groupId>net.java.dev.jna</groupId>
<artifactId>jna</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>commons-compiler</artifactId>
<version>2.7.8</version>
</dependency>
三 手动指定MovieMapping
PUT movie_info
{
"mappings": {
"_doc":{
"properties": {
"movie":{
"type": "keyword"
},
"rate":{
"type": "long"
},
"timeStamp":{
"type": "long"
},
"uid":{
"type": "keyword"
}
}
}
}
}
四 写数据到Es中
4.1 写入Es工具类
package com.gc.util
import io.searchbox.client.{JestClient, JestClientFactory}
import io.searchbox.client.config.HttpClientConfig
import io.searchbox.core.{Bulk, Index}
/**
* Es Utils
*
*/
object MyEsUtils {
  // Address of the Elasticsearch cluster.
  val esUrl: String = "http://hadoop102:9200"
  // Shared factory that produces JestClient instances.
  val factory: JestClientFactory = new JestClientFactory()
  // HTTP connection settings applied to the factory once, at object init.
  val config: HttpClientConfig = new HttpClientConfig.Builder(esUrl)
    .multiThreaded(true)
    .maxTotalConnection(20)
    .connTimeout(10000)
    .readTimeout(10000)
    .build()
  factory.setHttpClientConfig(config)

  /** Returns a client from the shared factory; the caller is responsible for closing it. */
  def getClient(): JestClient = {
    factory.getObject
  }

  /** Indexes one document into `indexName` with a caller-managed client (client is NOT closed here). */
  def insertIntoEs(jestClient: JestClient, indexName: String, source: Any) = {
    val action: Index = new Index.Builder(source).index(indexName).`type`("_doc").build()
    jestClient.execute(action)
  }

  /** Indexes one document into `indexName`, opening and closing a client for this single call. */
  def insertIntoEs(indexName: String, source: Any) = {
    val client: JestClient = factory.getObject
    val action: Index = new Index.Builder(source).index(indexName).`type`("_doc").build()
    client.execute(action)
    client.close()
  }

  /**
   * Bulk-inserts one RDD partition's worth of documents into `indexName`.
   *
   * Elements shaped as a (id: String, doc) pair are indexed under the supplied id;
   * any other element is indexed with an auto-generated id.
   *
   * @param indexName target index (also used as the bulk default index)
   * @param sources   the documents (or (id, document) pairs) to index
   */
  def insertBulk(indexName: String, sources: Iterator[Any]) = {
    val client: JestClient = factory.getObject
    val bulkBuilder: Bulk.Builder = new Bulk.Builder().defaultIndex(indexName).defaultType("_doc")
    sources.foreach {
      case (id: String, doc) =>
        // Caller supplied an id — index the document under it.
        bulkBuilder.addAction(new Index.Builder(doc).id(id).build())
      case doc =>
        // No id supplied — let Elasticsearch generate one.
        bulkBuilder.addAction(new Index.Builder(doc).build())
    }
    client.execute(bulkBuilder.build())
    client.close()
  }
}
4.2 测试
package com.gc.util
import com.alibaba.fastjson.JSON
import io.searchbox.client.JestClient
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object EsDemo {
  import scala.util.control.NonFatal

  /**
   * Inserts the RDD's elements into ES one document at a time.
   * One client is obtained per partition and always closed when the partition finishes.
   */
  def insertSingleIntoEs(rdd: RDD[Any]) = {
    rdd.foreachPartition(it => {
      val jestClient: JestClient = MyEsUtils.getClient() // one client per partition
      try {
        it.foreach(movie => MyEsUtils.insertIntoEs(jestClient, "movie_info", movie))
      } finally {
        jestClient.close() // release the client even if an insert throws
      }
    })
  }

  /**
   * Bulk-inserts the RDD into ES, capping each partition at roughly 10,000 records.
   *
   * Fixes two defects in the original: `rdd.repartition(...)` returned a NEW RDD
   * whose result was discarded (the cap never took effect), and `rdd.count()`
   * was triggered twice, launching two separate Spark jobs.
   */
  def insertBatchIntoEs(rdd: RDD[Any]) = {
    val total = rdd.count() // single count job, reused below
    val batched =
      if (total > 10000L * 10) rdd.repartition((total / 10000).toInt)
      else rdd
    batched.foreachPartition(it => MyEsUtils.insertBulk("movie_info", it))
  }

  def main(args: Array[String]): Unit = {
    val dataPath = "E:\\hadoop-need-kenow\\mrdata\\json"
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("EsDemo")
    val spark: SparkContext = new SparkContext(conf)
    // Parse each line into a Movie; DROP unparseable lines instead of keeping
    // them (the original catch branch returned Unit, so every bad line became
    // a () element in the RDD and was sent to ES as garbage).
    val rdd: RDD[Any] = spark.textFile(dataPath).flatMap { line =>
      try Some(JSON.parseObject(line, classOf[Movie]): Any)
      catch { case NonFatal(_) => None } // NonFatal: don't swallow OOM/interrupts
    }
    // insertSingleIntoEs(rdd) // row-by-row insert
    insertBatchIntoEs(rdd) // bulk insert, as the original comment intended
    spark.stop()
  }
}
// Document model matching the movie_info index mapping.
case class Movie(movie: String, rate: Int, timeStamp: Long, uid: String)
五 从es中查询数据
按照电影的Id进行分组统计,统计出每个电影的平均分,升序
package com.gc.util
import java.util
import io.searchbox.client.JestClient
import io.searchbox.core.search.aggregation.TermsAggregation
import io.searchbox.core.{Search, SearchResult}
object EsQueryDemo {
  /**
   * Queries the movie_info index: pages through hits and aggregates the
   * average rate per movie, ordered ascending by that average.
   */
  def main(args: Array[String]): Unit = {
    // Explicit .asScala via JavaConverters instead of the deprecated
    // implicit scala.collection.JavaConversions wildcard import.
    import scala.collection.JavaConverters._
    val client: JestClient = MyEsUtils.getClient()
    val currentPage = 1
    val pageSize = 10
    // Search DSL: from/size paging plus a terms aggregation on "movie",
    // ordered by the nested avg sub-aggregation on "rate" (ascending).
    val dsl =
      s"""
         |{
         | "from": ${(currentPage - 1) * pageSize},
         | "size": ${pageSize},
         | "aggs":{
         |   "groupby_movie":{
         |     "terms":{
         |       "field": "movie",
         |       "order":{
         |         "avg_rate":"asc"
         |       }
         |     },
         |     "aggs": {
         |       "avg_rate": {
         |         "avg": {
         |           "field": "rate"
         |         }
         |       }
         |     }
         |   }
         | }
         |}
       """.stripMargin
    val search: Search = new Search.Builder(dsl).addIndex("movie_info").addType("_doc").build()
    val resultSet: SearchResult = client.execute(search)
    // Raw hits for the requested page.
    val hits: util.List[SearchResult#Hit[util.HashMap[String, Any], Void]] =
      resultSet.getHits(classOf[util.HashMap[String, Any]])
    for (hit <- hits.asScala) {
      println(hit.source)
    }
    println("总数:" + resultSet.getTotal)
    // Aggregation result: one bucket per movie, carrying the avg_rate sub-aggregation.
    val buckets: util.List[TermsAggregation#Entry] =
      resultSet.getAggregations.getTermsAggregation("groupby_movie").getBuckets
    for (bucket <- buckets.asScala) {
      // (movie key, doc count, average rate for this movie)
      println((bucket.getKey, bucket.getCount, bucket.getAvgAggregation("avg_rate").getAvg))
    }
    client.close()
  }
}
结果如下图所示: