package com.lz

import org.apache.spark.sql.SparkSession
import org.elasticsearch.hadoop.cfg.ConfigurationOptions

import scala.collection.Map

object Es2Hive {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("es2file by spark")
      .config("spark.sql.parquet.binaryAsString", "true")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.sql.autoBroadcastJoinThreshold", "-1") // disable broadcast joins
      .config("spark.io.compression.codec", "snappy")
      .config("spark.dynamicAllocation.enabled", "true")
      .getOrCreate()

    import org.elasticsearch.spark.sql._

    // Connection and read-side tuning for elasticsearch-hadoop.
    val cfg = Map(
      ConfigurationOptions.ES_NODES -> "ip", // ES node address(es)
      ConfigurationOptions.ES_PORT -> "9200",
      ConfigurationOptions.ES_SCROLL_SIZE -> "100", // docs returned per scroll request
      ConfigurationOptions.ES_MAX_DOCS_PER_PARTITION -> "10000", // cap docs per Spark partition
      ConfigurationOptions.ES_HTTP_TIMEOUT -> "15m",
      ConfigurationOptions.ES_SCROLL_KEEPALIVE -> "10m",
      ConfigurationOptions.ES_BATCH_SIZE_BYTES -> "5mb", // bulk-write tuning; unused by this read-only job
      ConfigurationOptions.ES_BATCH_SIZE_ENTRIES -> "10",
      "es.internal.spark.sql.pushdown" -> "true") // push SQL filters down to ES

    // Load the ES index/type as a DataFrame.
    val frame = spark.esDF("article1/article", cfg)

    // Note: "header" is a CSV-only write option with no effect on Parquet, so it is omitted.
    frame
      .repartition(5)
      .write
      .mode("overwrite")
      .parquet("/zhtmp/test03")

    spark.stop()
  }
}
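Before reaching for parquet-tools, the field names can also be recovered from Spark itself. A minimal sketch, assuming the same output path as above (the object name InspectParquet is an illustrative placeholder):

import org.apache.spark.sql.SparkSession

object InspectParquet {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("inspect parquet schema").getOrCreate()
    // Print the field names/types of the exported files; these feed the Hive DDL.
    spark.read.parquet("/zhtmp/test03").printSchema()
    spark.stop()
  }
}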
Inspect the output file to extract the field names, then create a Hive table that maps onto the data:
parquet-tools meta part-00000-c0b052bc-d65d-4643-85df-62d01c7f28fb-c000.snappy.parquet
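With the field names in hand, an external Hive table can be pointed at the output directory. A sketch via spark.sql, assuming Hive support is available; the table name article1_snapshot and the columns id/title are hypothetical placeholders to be replaced with the fields reported by parquet-tools meta:

import org.apache.spark.sql.SparkSession

object CreateHiveMapping {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("map parquet into hive")
      .enableHiveSupport() // required so spark.sql can reach the Hive metastore
      .getOrCreate()

    // Hypothetical schema: substitute the actual fields from parquet-tools meta.
    spark.sql(
      """CREATE EXTERNAL TABLE IF NOT EXISTS article1_snapshot (
        |  id    STRING,
        |  title STRING
        |)
        |STORED AS PARQUET
        |LOCATION '/zhtmp/test03'""".stripMargin)

    spark.stop()
  }
}

The same DDL can equally be run in the Hive CLI or beeline; issuing it through spark.sql simply keeps the whole export-and-map pipeline in one codebase.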