Preface: I needed to join four MySQL tables in one query and load the result into Elasticsearch (ES).
Required dependencies:
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>${scala.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>${spark.version}</version>
    <!--<scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql_2.11 -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
    <!--<scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
<dependency>
    <groupId>commons-codec</groupId>
    <artifactId>commons-codec</artifactId>
    <version>1.10</version>
</dependency>
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>21.0</version>
</dependency>
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-spark-20_2.11</artifactId>
    <version>5.5.3</version>
</dependency>
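One dependency the list above omits: the code below loads com.mysql.cj.jdbc.Driver, which ships with MySQL Connector/J (this class name is used by the 6.x/8.x series), so you will also need something along these lines — the version shown is an assumption, pick the one matching your server:
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.11</version>
</dependency>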
Code:
package com.doctorai.dpnice.testdemo

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.elasticsearch.spark.sql.EsSparkSQL

/**
  * @author DPn!ce date 2018-03-13 7:25 PM
  */
object SparkToES {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("SparkToES")
      .setMaster("local[5]")
    // Tune default parallelism. It only takes effect for shuffle (reduce-side)
    // stages; in effect it is the number of hash partitions after a shuffle.
    sparkConf.set("spark.default.parallelism", "5")
    // If GC takes more than ~1/10 of total task time, lower the
    // spark.storage.memoryFraction ratio or use a serialized storage level.
    //spark.shuffle.memoryFraction
    // sparkConf.set("spark.storage.memoryFraction", "0.6")
    // Number of partitions used by Spark SQL shuffles.
    sparkConf.set("spark.sql.shuffle.partitions", "5")
    /*
     Compressed oops (compressed ordinary object pointers) is a JVM optimization.
     It can shrink heap usage from roughly 3.6 GB to 3.1 GB, i.e. free up about
     14% of the heap. Enabling it has no real downside, and knowing such
     JVM details helps in writing efficient code.
     */
    sparkConf.set("spark.executor.extraJavaOptions",
      """
        |-XX:+UseG1GC
        |-XX:+UseCompressedOops
      """.stripMargin)
    sparkConf.set("spark.logConf", "false")
    // Use Kryo for serialization
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    // Enable speculative execution
    sparkConf.set("spark.speculation", "true")
    // Note: spark.driver.memory must be set before the driver JVM starts
    // (e.g. via spark-submit); setting it here has no effect in local mode.
    sparkConf.set("spark.driver.memory", "4g")
    sparkConf.set("spark.executor.memory", "5g")
    //total-executor-cores 24 --executor-cores 4
    // sparkConf.set("spark.total.executor.cores", "16")
    // sparkConf.set("spark.executor.cores", "4")
    sparkConf.set("spark.eventLog.enabled", "false")
    // sparkConf.set("spark.local.dir", "F:\\spark_local_dir")
    // pushdown only matters when reading through the ES data source
    sparkConf.set("pushdown", "true")
    sparkConf.set("es.port", "9200")
    sparkConf.set("es.nodes", "192.168.0.201")
    // sparkConf.set("es.net.ssl", "true")
    // sparkConf.set("es.net.ssl.keystore.location", "C:\\Users\\Administrator\\Desktop\\jks\\node-0-keystore.jks")
    // sparkConf.set("es.net.ssl.keystore.pass", "changeit")
    // sparkConf.set("es.net.ssl.keystore.type", "JKS")
    // sparkConf.set("es.net.ssl.truststore.location", "C:\\Users\\Administrator\\Desktop\\jks\\truststore.jks") // when running on Linux, use a file:/// absolute path here
    // sparkConf.set("es.net.ssl.truststore.pass", "changeit")
    // sparkConf.set("es.net.ssl.cert.allow.self.signed", "false")
    // sparkConf.set("es.net.ssl.protocol", "TLSv1.2") // the docs say TLS, but the version number is required or it throws an error
    sparkConf.set("es.index.auto.create", "true")
    // sparkConf.set("es.net.http.auth.user", "elastic") // ES username
    // sparkConf.set("es.net.http.auth.pass", "changeme") // ES password
    // sparkConf.set("es.nodes.wan.only", "true")
    // sparkConf.set("es.index.read.missing.as.empty", "true")
    // sparkConf.set("es.scroll.size", "10000000")
    sparkConf.set("es.nodes.discovery", "true")
    // sparkConf.set("es.read.field.include", "fields")
    val spark = SparkSession
      .builder()
      .config(sparkConf)
      .getOrCreate()
    val reader = spark.read.format("jdbc")
    reader.option("url", "jdbc:mysql://192.168.0.200:3306/quality_control?characterEncoding=utf-8&useSSL=false") // JDBC URL
    reader.option("driver", "com.mysql.cj.jdbc.Driver")
    reader.option("user", "和谐")     // redacted in the original post
    reader.option("password", "和谐") // redacted in the original post
    val sql =
      """
        |(SELECT
        |  qce.id qce_id,
        |  qce.qc_content_id qc_content_id,
        |  qc.qc_order_id qc_order_id,
        |  qce.emr_id emr_id,
        |  mr.main_diagnosis,
        |  qce.department,
        |  qo.qc_source qc_source,
        |  qo.qc_project qc_project,
        |  qc.qc_type qc_type,
        |  qc.qc_content,
        |  qc.create_time create_time,
        |  qc.update_time update_time
        |FROM
        |  t_quality_content qc
        |  LEFT JOIN t_quality_order qo ON qc.qc_order_id = qo.qc_order_id
        |  LEFT JOIN t_quality_content_emr qce ON qc.id = qce.qc_content_id
        |  LEFT JOIN t_medical_record mr ON mr.emr_id = qce.emr_id
        |) AS quality_control
      """.stripMargin
    reader.option("dbtable", sql) // table name, or a parenthesized SQL query with an alias
    val load: DataFrame = reader.load()
    EsSparkSQL.saveToEs(load, "quality_control/my_type")
    spark.stop()
  }
}
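To sanity-check the write, you can read the index back through the same connector. A minimal sketch, placed before spark.stop() — this is an optional verification step, not part of the original job; the index/type name is taken from the code above:
val check: DataFrame = spark.read
  .format("org.elasticsearch.spark.sql")
  .load("quality_control/my_type")
check.show(5)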
Note: some people assume reader.option("dbtable", sql) only accepts a table name. In fact it accepts a SQL statement as well, but the statement must be wrapped in parentheses and given an alias that acts as the table name, as shown below.
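For example, using the tables from the query above, both of these forms are valid:
// Either a bare table name…
reader.option("dbtable", "t_quality_content")
// …or a parenthesized query; the alias (here "qc") is the mandatory "table name":
reader.option("dbtable", "(SELECT id, qc_type FROM t_quality_content) AS qc")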
The ES mapping can be defined in advance or generated dynamically; when generated dynamically, the field types are derived from the MySQL column types.
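If you prefer to fix the mapping up front instead of relying on dynamic mapping, one way is a plain HTTP PUT against the index before the job runs. A minimal sketch using only the Scala/Java standard library; the two field types shown are assumptions — extend the properties to match your own columns:
import java.net.{HttpURLConnection, URL}

object CreateMapping {
  def main(args: Array[String]): Unit = {
    // ES 5.x index-creation body: mappings keyed by type name ("my_type" here)
    val body =
      """{
        |  "mappings": {
        |    "my_type": {
        |      "properties": {
        |        "main_diagnosis": { "type": "text" },
        |        "create_time":    { "type": "date" }
        |      }
        |    }
        |  }
        |}""".stripMargin
    val conn = new URL("http://192.168.0.201:9200/quality_control")
      .openConnection().asInstanceOf[HttpURLConnection]
    conn.setDoOutput(true)
    conn.setRequestMethod("PUT")
    conn.setRequestProperty("Content-Type", "application/json")
    val os = conn.getOutputStream
    os.write(body.getBytes("UTF-8"))
    os.close()
    println(s"HTTP ${conn.getResponseCode}") // 200 on success
  }
}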