<repositories>
<repository>
<id>central</id>
<url>https://maven.aliyun.com/nexus/content/groups/public/</url>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<spark.version>2.3.3</spark.version>
<elasticsearch.version>5.6.9</elasticsearch.version>
</properties>
<dependencies>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-spark-20_2.11</artifactId>
<version>${elasticsearch.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-yarn_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.6.0</version>
<configuration>
<encoding>${project.build.sourceEncoding}</encoding><!-- source file encoding -->
<source>${maven.compiler.source}</source><!-- JDK version of the source files -->
<target>${maven.compiler.target}</target><!-- JDK version of the generated class files -->
</configuration>
</plugin>
</plugins>
</build>
Spark code
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import java.util.HashMap;
import java.util.Map;
/**
 * Copies documents from one Elasticsearch index to another via Spark SQL:
 * reads {@code /goods_old_index/docs}, derives a composite document key
 * ({@code goods_code + "_" + shop_code}), and appends the rows to
 * {@code /goods_new_index/docs}.
 */
public class SparkEsToEs {

    public static void main(String[] args) {
        // DEBUG on the root logger is extremely verbose; intended for local debugging only.
        Logger.getRootLogger().setLevel(Level.DEBUG);
        SparkConf sparkConf = new SparkConf()
                .setMaster("local[1]")
                .setAppName("SparkEsToEs");
        sparkConf.set("spark.sql.warehouse.dir", "file:///D://test");
        SparkSession sparkSession = SparkSession.builder().config(sparkConf).getOrCreate();
        try {
            // Use the same explicit connector name for read and write ("es" is just an alias).
            Dataset<Row> esDataset = sparkSession.read()
                    .format("org.elasticsearch.spark.sql")
                    .options(initEsSourceOptions())
                    .load("/goods_old_index/docs");
            // New document id: "<goods_code>_<shop_code>".
            Dataset<Row> esDatasetNew = esDataset.withColumn(
                    "new_data_key",
                    functions.concat_ws("_", esDataset.col("goods_code"), esDataset.col("shop_code")));
            // Write into the target index/type, appending to existing documents.
            esDatasetNew.write()
                    .format("org.elasticsearch.spark.sql")
                    .options(initEsDestOptions())
                    .mode(SaveMode.Append)
                    .save("/goods_new_index/docs");
        } finally {
            // Always release the Spark context, even if the job fails.
            sparkSession.stop();
        }
    }

    /** Connector options for the source index; documents are keyed by {@code goods_code}. */
    private static Map<String, String> initEsSourceOptions() {
        return baseEsOptions("goods_code");
    }

    /** Connector options for the destination index; documents are keyed by {@code new_data_key}. */
    private static Map<String, String> initEsDestOptions() {
        return baseEsOptions("new_data_key");
    }

    /**
     * Shared ES connection settings for both indices.
     *
     * @param mappingId field used as the Elasticsearch document id ({@code es.mapping.id})
     * @return mutable option map suitable for {@code DataFrameReader/Writer.options(...)}
     */
    private static Map<String, String> baseEsOptions(String mappingId) {
        Map<String, String> options = new HashMap<>(6);
        options.put("es.nodes", "127.0.0.1");
        options.put("es.port", "9200");
        options.put("es.mapping.id", mappingId);
        // Depending on the deployment you may also want:
        //   es.write.operation=upsert, es.index.auto.create=true, es.nodes.wan.only=true
        return options;
    }
}