使用 ES 的初衷就是快速检索,它默认单次查询最多只返回一万条结果,并不适合大数据量的处理计算。要是需要导出数据,或者将取出的数据再做二次处理的话,就需要换用其他组件。
首先需要将 Hive 中的数据写入 ES;如果涉及经纬度,需要先拼成 "lat,lng" 格式的字符串,再放到 location 字段里面(ES 的 geo_point 类型支持该字符串格式)。
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.elasticsearch.spark.rdd.EsSpark
/**
* Created by yinyi on 2019/9/3.
*/
object BigWide_V2{
// NOTE(review): this object and its main() continue past the visible chunk —
// the closing braces and any later transformations/ES write are not shown here.
def main(args: Array[String]) {
// Build a Hive-enabled SparkSession configured for the elasticsearch-hadoop
// connector; per the notes above, the intent is to read a Hive wide table and
// push it to ES (EsSpark is imported at the top of the file).
val spark = SparkSession
.builder()
.appName("TestES")
.config("spark.sql.warehouse.dir", Utils.HIVEWAREHOUSELOCATION)
.config("es.nodes", "10.0.4.67 ,0.0.4.235,0.0.4.161,0.0.4.90 ,0.0.4.111") // ES nodes, comma-separated. NOTE(review): the list has stray spaces and "0.0.4.x" addresses that look like typos for "10.0.4.x"; harmless only because the duplicate setting below overrides it — remove one of the two.
.config("es.index.auto.create", "true") // let the connector create missing indices automatically
.config("index.refresh_interval", "-1") // intended to disable index refresh during bulk load. NOTE(review): this is an ES index-level setting, not an es-hadoop/Spark config key, so setting it here is presumably a no-op — confirm it is applied on the index itself.
.config("es.nodes",Utils.HOST_ES) // ES nodes, comma-separated. NOTE(review): duplicate key — this silently overrides the hard-coded node list above.
// .config("es.write.operation","upsert")// would update documents whose id already exists instead of failing/duplicating
.config("es.mapping.date.rich","false") // keep date fields as plain strings rather than rich date objects
.config("spark.sql.shuffle.partitions","6000") // shuffle parallelism for the Hive query below
.enableHiveSupport()
.getOrCreate()
// Pull the label columns from the Hive wide table as an RDD[Row]:
// - location is built as "lat,lng" via concat_ws, with missing coordinates
//   defaulted to '0.0' (string format accepted by ES geo_point)
// - mobile must match a mainland-China mobile pattern; the four backslashes in
//   the Scala literal become \\ in the SQL text, which Hive's string parser
//   reduces to \d for RLIKE
// - rows with out-of-range lng/lat are dropped
val los = spark.sql("select name,mobile ,province,city,county,concat_ws(',',nvl(lat,'0.0'),nvl(lng,'0.0')) as location,child_english_label," +
" chusan_gaosan_label,vippl_label,xinyongka_label,cos_med_label,age_label,sex_label,edu_label," +
" marry_status_label,income_level_label,consume_level_label,month_consume_pinci_label " +
" from precisionmarketing.result_190905_sugang_jingqing_big_wide_label "+
" where mobile rlike '^1[3456789]\\\\d{9}$' " +
" and lng >-180 and lng<180 and lat>-90 and lat<90 " +
"").rdd
// NOTE(review): "//lter" below looks truncated — likely the start of a
// .filter(...) step whose remainder is outside this chunk.
//lter