package es

import net.minidev.json.parser.JSONParser
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

import scala.collection.Map

/**
  * Created by Administrator on 2019/3/11.
  * args(0) path  – path of the log files
  * args(1) index – Elasticsearch index name
  */
object es {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("es")
    val session: SparkSession = SparkSession.builder().config(conf)
      .config("es.index.auto.create", "true")
      .config("es.nodes", "192.168.1.203")
      .config("es.port", "9200")
      .getOrCreate()
    import session.implicits._
    import org.elasticsearch.spark._

    val Array(path: String, index: String) = args

    // Read the logs as JSON and turn each row back into a JSON string.
    val json: Dataset[String] = session.read.json(path).toJSON

    // Drop records Spark could not parse (marked with "_corrupt_record"),
    // as well as anything JSONParser itself fails to parse.
    val f1 = json.filter(x => {
      try {
        val jsonParser = new JSONParser()
        val parse: AnyRef = jsonParser.parse(x)
        !parse.toString.contains("_corrupt_record")
      } catch {
        case _: Exception => false
      }
    })

    // Rewrite "yyyy-MM-dd HH:mm:ss.SSS" into ISO-8601 "yyyy-MM-ddTHH:mm:ss.SSSZ"
    // so Elasticsearch can map the field as a date: replace the space after the
    // date with "T" and insert "Z" before the value's closing quote.
    val r1 = f1.map(x => {
      val tsIndex: Int = x.indexOf("timestamp")
      val tmp = x.substring(0, tsIndex + 22) + "T" + x.substring(tsIndex + 23)
      tmp.substring(0, tsIndex + 35) + "Z" + tmp.substring(tsIndex + 35)
    })

    val rdd: RDD[String] = r1.rdd
    val map = Map(
      "es.mapping.timestamp" -> "timestamp",
      "es.batch.size.bytes" -> "10mb",
      "es.field.read.validate.presence" -> "strict"
    )
    rdd.saveJsonToEs(index + "/log360", map)
    session.stop()
  }
}
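The timestamp rewrite relies on fixed character offsets relative to the "timestamp" key. A minimal, Spark-free sketch below shows what the substring arithmetic produces; the sample line and its millisecond-precision "yyyy-MM-dd HH:mm:ss.SSS" format are assumptions for illustration, not taken from the source logs.

// Standalone sketch of the timestamp rewrite used in r1 above.
// Assumption (not from the source): each log line carries a field like
// "timestamp":"2019-03-11 08:15:42.123" with millisecond precision.
object TimestampRewriteSketch {
  def main(args: Array[String]): Unit = {
    val line = """{"timestamp":"2019-03-11 08:15:42.123","level":"INFO"}"""
    val i = line.indexOf("timestamp")
    // i + 22 lands on the space between date and time; replace it with "T".
    val withT = line.substring(0, i + 22) + "T" + line.substring(i + 23)
    // i + 35 lands on the closing quote of the value; insert "Z" before it.
    val withZ = withT.substring(0, i + 35) + "Z" + withT.substring(i + 35)
    println(withZ)
    // {"timestamp":"2019-03-11T08:15:42.123Z","level":"INFO"}
  }
}

The job itself is started with two arguments, the log path and the target Elasticsearch index, which are unpacked by val Array(path, index) = args.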
Invalid JSON lines are not dropped by Spark automatically: when a record fails to parse, Spark keeps it and adds a "_corrupt_record" field instead, so those records have to be filtered out explicitly:
val f1 = json.filter(x => {
  try {
    val jsonParser = new JSONParser()
    val parse: AnyRef = jsonParser.parse(x)
    !parse.toString.contains("_corrupt_record")
  } catch {
    case _: Exception => false
  }
})
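If the goal is only to discard unparseable lines, Spark's JSON reader can also do this at read time. The sketch below uses the standard "mode" read option (PERMISSIVE / DROPMALFORMED / FAILFAST) with a hypothetical input path; it is an alternative approach, not what the code above does.

import org.apache.spark.sql.{Dataset, SparkSession}

// Alternative sketch: DROPMALFORMED tells Spark's JSON reader to silently
// discard lines that fail to parse, so no "_corrupt_record" marker is added
// and no second pass with JSONParser is needed.
object DropMalformedSketch {
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder().appName("dropMalformed").getOrCreate()
    val json: Dataset[String] = session.read
      .option("mode", "DROPMALFORMED")   // default is PERMISSIVE
      .json("/path/to/logs")             // hypothetical path
      .toJSON
    json.show(5, truncate = false)
    session.stop()
  }
}

The trade-off is that the filter in the original code keeps explicit control over what counts as invalid, while DROPMALFORMED delegates that decision entirely to Spark's JSON parser.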