Before we begin, the environment used in this post:
IntelliJ IDEA, sbt project
Spark 2.2.0
CDH 6.0.1
Elasticsearch 7.2.0
step1. Required sbt dependencies
name := "biz_xy_diy"

version := "0.1"

scalaVersion := "2.11.8"

resolvers ++= Seq(
  "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cdh-releases-rcs/",
  "Elasticsearch Repository" at "https://s3.amazonaws.com/download.elasticsearch.org/lucenesnapshots/83f9835"
)

libraryDependencies += "org.apache.spark" % "spark-core_2.11" % "2.2.0-cdh6.0.1"
libraryDependencies += "org.apache.spark" % "spark-sql_2.11" % "2.2.0-cdh6.0.1"
libraryDependencies += "org.apache.spark" % "spark-streaming_2.11" % "2.2.0-cdh6.0.1"
libraryDependencies += "org.apache.spark" % "spark-streaming-kafka-0-10_2.11" % "2.2.0-cdh6.0.1"
libraryDependencies += "org.apache.spark" % "spark-mllib_2.11" % "2.2.0-cdh6.0.1"
libraryDependencies += "org.apache.spark" % "spark-hive_2.11" % "2.2.0-cdh6.0.1"

libraryDependencies += "org.apache.hbase" % "hbase-client" % "2.0.0-cdh6.0.1"
libraryDependencies += "org.apache.hbase" % "hbase-common" % "2.0.0-cdh6.0.1"
libraryDependencies += "org.apache.hbase" % "hbase-server" % "2.0.0-cdh6.0.1"
libraryDependencies += "org.apache.hbase" % "hbase-mapreduce" % "2.0.0-cdh6.0.1"
libraryDependencies += "org.apache.hbase" % "hbase-spark" % "2.0.0-cdh6.0.1"

libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.6"

// https://mvnrepository.com/artifact/org.apache.hbase/hbase
libraryDependencies += "org.apache.hbase" % "hbase" % "2.0.0-cdh6.0.1" pomOnly()

// https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common
libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "3.0.0-cdh6.0.1"

// Pin Jackson to the version Spark 2.2 ships with, to avoid runtime conflicts
libraryDependencies += "com.fasterxml.jackson.core" % "jackson-core" % "2.6.5"
dependencyOverrides += "com.fasterxml.jackson.core" % "jackson-databind" % "2.6.5"
dependencyOverrides += "com.fasterxml.jackson.module" % "jackson-module-scala_2.11" % "2.6.5"

libraryDependencies += "net.sf.json-lib" % "json-lib" % "2.3" from "http://repo1.maven.org/maven2/net/sf/json-lib/json-lib/2.3/json-lib-2.3-jdk15.jar"

libraryDependencies += "org.elasticsearch.client" % "elasticsearch-rest-high-level-client" % "7.2.0"
libraryDependencies += "org.elasticsearch" % "elasticsearch" % "7.2.0"

libraryDependencies += "junit" % "junit" % "4.12" % Test

libraryDependencies += "org.apache.logging.log4j" % "log4j-core" % "2.12.0"
// https://mvnrepository.com/artifact/log4j/log4j
libraryDependencies += "log4j" % "log4j" % "1.2.17"
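Before wiring Spark in, it may be worth confirming that the ES client dependency resolves and the cluster is reachable. A minimal connectivity check (a sketch of my own, assuming the same yourhostname:9200 endpoint used in step 3):

import org.apache.http.HttpHost
import org.elasticsearch.client.{RequestOptions, RestClient, RestHighLevelClient}

object EsPingCheck {
  def main(args: Array[String]): Unit = {
    // Hypothetical host/port; replace with your cluster's address
    val client = new RestHighLevelClient(
      RestClient.builder(new HttpHost("yourhostname", 9200, "http")))
    try {
      // ping returns true if the cluster responds
      println(s"ES reachable: ${client.ping(RequestOptions.DEFAULT)}")
    } finally {
      client.close()
    }
  }
}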
step2. Wrap a utility class: ESUtils (Java)
package cms;

import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;

public class ESUtils {

    /**
     * Build a bulk listener. The callbacks are left empty here; fill them in
     * to log or react to bulk successes and failures.
     * @return listener passed to the BulkProcessor
     */
    public static BulkProcessor.Listener getBulkListener() {
        BulkProcessor.Listener listener = new BulkProcessor.Listener() {
            @Override
            public void beforeBulk(long executionId, BulkRequest request) {
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
            }

            @Override
            public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
            }
        };
        return listener;
    }

    /**
     * Build a BulkProcessor from a configured builder.
     * @param client      the high-level REST client
     * @param listener    bulk lifecycle listener
     * @param bulkcnumber number of actions per bulk flush
     * @return the bulk processor
     */
    public static BulkProcessor getBulkprocessor(RestHighLevelClient client,
                                                 BulkProcessor.Listener listener,
                                                 int bulkcnumber) {
        BulkProcessor.Builder builder = BulkProcessor.builder(
                (request, bulkListener) -> client.bulkAsync(request, RequestOptions.DEFAULT, bulkListener),
                listener);
        builder.setBulkActions(bulkcnumber);                         // flush every N requests...
        builder.setBulkSize(new ByteSizeValue(1L, ByteSizeUnit.MB)); // ...or every 1 MB...
        builder.setFlushInterval(TimeValue.timeValueSeconds(10L));   // ...or every 10 s
        builder.setConcurrentRequests(0);                            // 0 = flushes execute synchronously
        builder.setBackoffPolicy(BackoffPolicy
                .constantBackoff(TimeValue.timeValueSeconds(1L), 3)); // retry rejected bulks up to 3 times
        return builder.build();
    }
}
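The listener callbacks above are intentionally empty. As a rough sketch (in Scala, to match step 3; the logger name and messages are my own, not from the original post), a listener that reports failures could look like this:

import org.apache.log4j.Logger
import org.elasticsearch.action.bulk.{BulkProcessor, BulkRequest, BulkResponse}

object LoggingBulkListener {
  // Returns a listener that logs bulk outcomes instead of swallowing them
  def apply(): BulkProcessor.Listener = new BulkProcessor.Listener {
    private val log = Logger.getLogger("bulk")

    override def beforeBulk(executionId: Long, request: BulkRequest): Unit =
      log.info(s"bulk $executionId: sending ${request.numberOfActions()} actions")

    override def afterBulk(executionId: Long, request: BulkRequest, response: BulkResponse): Unit =
      if (response.hasFailures) log.error(s"bulk $executionId: ${response.buildFailureMessage()}")

    override def afterBulk(executionId: Long, request: BulkRequest, failure: Throwable): Unit =
      log.error(s"bulk $executionId failed entirely", failure)
  }
}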
step3. Write from MySQL into ES (asynchronous bulk submission)
package cms

import java.util
import java.util.Properties
import java.util.concurrent.TimeUnit

import org.apache.http.HttpHost
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.elasticsearch.action.index.IndexRequest
import org.elasticsearch.client.{RestClient, RestHighLevelClient}

class ReadMsql2EsAsync extends Serializable {

  Logger.getLogger("org").setLevel(Level.ERROR) // quiet Spark's own logging

  val HOST = "yourhostname"
  val PORT = 9200
  val HTTP = "http"
  val MODEL_LOCAL = "local[*]" // yarn | local[*]

  def readMysql: DataFrame = {
    val sparkSession = SparkSession
      .builder()
      .appName("testwES")
      .master(MODEL_LOCAL)
      .getOrCreate()
    val pro = new Properties()
    pro.setProperty("user", "username")
    pro.setProperty("password", "*****")
    sparkSession.read.jdbc("jdbc:mysql://yourhostname:3306/test", "vendor2", pro)
  }

  def w2es(): Unit = {
    val df = readMysql
    val indexName = "test_index" // index name
    val indexType = "cms_su_dtl" // type name (deprecated in ES 7, but still accepted)
    val start = System.currentTimeMillis()
    df.foreachPartition(par => {
      // one client and one bulk processor per partition
      val listener = ESUtils.getBulkListener
      val client = new RestHighLevelClient(RestClient.builder(new HttpHost(HOST, PORT, HTTP)))
      // get the bulk processor, flushing every 400 requests
      val bulkprocessor = ESUtils.getBulkprocessor(client, listener, 400)
      // reuse one map per partition; IndexRequest.source copies it on each call
      val mapf = new util.HashMap[String, Any]()
      if (!par.isEmpty) {
        par.foreach(x => {
          mapf.put("id", x.get(x.fieldIndex("id")))
          mapf.put("name", x.get(x.fieldIndex("name")))
          mapf.put("serialNumber", x.get(x.fieldIndex("serialNumber")))
          mapf.put("price", x.get(x.fieldIndex("price")))
          mapf.put("stock_number", x.get(x.fieldIndex("stock_number")))
          // mapf.put("create_time", x.get(x.fieldIndex("create_time")))
          mapf.put("venName", x.get(x.fieldIndex("venName")))
          val id = mapf.get("id").toString
          val request = new IndexRequest(indexName, indexType)
          request.id(id)
          request.timeout("3m")
          request.source(mapf)
          bulkprocessor.add(request) // buffered; flushed in bulk by the processor
        })
      }
      // awaitClose flushes any pending requests and closes the processor
      bulkprocessor.awaitClose(30L, TimeUnit.SECONDS)
      client.close()
    })
    val end = System.currentTimeMillis()
    val spend = (end - start) / 1000
    Logger.getLogger("spend").info(indexName + " : " + spend + " s")
  }
}
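To run step 3 end to end, a small driver is all that is needed. A minimal sketch (the object name is my own; the original post does not show one):

object ReadMsql2EsAsyncApp {
  def main(args: Array[String]): Unit = {
    // reads the vendor2 table from MySQL and bulk-indexes it into test_index
    new ReadMsql2EsAsync().w2es()
  }
}

With MODEL_LOCAL set to local[*] this runs straight from IDEA; for a cluster, change it to yarn and submit the assembled jar with spark-submit.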