本文使用的Spark版本是1.4.0,Elasticsearch版本是1.5.2
1.Elasticsearch对Spark的支持详见官网:https://www.elastic.co/guide/en/elasticsearch/hadoop/current/spark.html 官网的文档主要涉及Spark1.2中的RDD API、Spark1.3以上版本的Data Frame API,语言分Java、Scala。具体应用程序的开发应参考自己使用的Spark版本的API,比如使用Spark 1.4以上版本的话写程序的时候跟官网文档中的代码还是有些出入的。
2.确保Elasticsearch集群中的每一个节点的配置文件 config/elasticsearch.yml 中Network And HTTP部分里的http.enabled配置项为true
3.因为我采用的是Spark官网的安装包,它是在Scala2.10上编译的,所以我的Maven工程的dependency如下(elastic官网文档中是2.11):
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-spark_2.10</artifactId>
<version>2.1.0</version>
</dependency>
4.根据我的应用场景,需要实现DBA的存储过程的业务逻辑,demo如下:
package com.elastic.spark;
import java.lang.Double;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF4;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.SparkConf;
import org.elasticsearch.spark.sql.api.java.JavaEsSparkSQL;
public class Test {

    /**
     * Demo that re-implements a DBA stored procedure on top of Spark SQL and
     * Elasticsearch: reads tablespace utilization rows and per-tablespace
     * warning thresholds from ES, flags tablespaces whose utilization exceeds
     * either the default threshold (85%) or an explicitly configured one, and
     * writes the resulting warning rows back to ES.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("elastic-spark");
        // Comma-separated ES nodes; es.port must match the HTTP port in
        // elasticsearch.yml (http.enabled must be true on every node).
        conf.set("es.nodes", "9.115.42.77,9.115.42.89,9.115.42.95");
        conf.set("pushdown", "true"); // let the connector push filters down to ES
        conf.set("es.port", "9200");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            SQLContext sql = new SQLContext(jsc);
            // "concat" UDF: joins two strings, a numeric value and a trailing
            // string into one message (used to build the RESULT column below).
            sql.udf().register("concat", new UDF4<String, String, Double, String, String>() {
                public String call(String str1, String str2, Double str3, String str4) throws Exception {
                    return str1 + str2 + str3.toString() + str4;
                }
            }, DataTypes.StringType);

            // Load each ES index/type as a DataFrame and expose it to SQL.
            DataFrame cdl_server_register = JavaEsSparkSQL.esDF(sql, "tbsp_compute/cdl_server_register");
            cdl_server_register.registerTempTable("cdl_server_register");
            DataFrame cdl_tbs_warn_exception = JavaEsSparkSQL.esDF(sql, "tbsp_compute/cdl_tbs_warn_exception");
            cdl_tbs_warn_exception.registerTempTable("cdl_tbs_warn_exception");
            DataFrame cdl_tbs_utilization = JavaEsSparkSQL.esDF(sql, "tbsp_compute/cdl_tbs_utilization_2015-09-21");
            cdl_tbs_utilization.registerTempTable("cdl_tbs_utilization");

            // Attach the configured threshold (if any) to every utilization row.
            // FIX: the original selected "85 as d.THD", which is not a legal
            // column alias and fails to parse; the next query filters
            // "where THD is null", so the joined column is selected as THD.
            // NOTE(review): "d.THD is null" inside the ON clause restricts the
            // join to exception rows without a threshold — confirm this matches
            // the stored procedure's intent.
            DataFrame whole_set = sql.sql("SELECT c.HOST_NAME,c.INST_NAME,"
                + "c.DB_NAME,c.TBSP_NAME,c.TBSP_UTILIZATION_PERCENT,c.LOADED_TIMESTAMP,c.TBSP_TYPE,d.THD as THD "
                + "FROM cdl_tbs_utilization c LEFT OUTER JOIN cdl_tbs_warn_exception d on c.HOST_NAME=d.HOST_NAME and d.THD is null and "
                + "c.INST_NAME=d.INST_NAME and c.DB_NAME=d.DB_NAME and c.TBSP_NAME=d.TBS_NAME order by c.TBSP_NAME");
            whole_set.show();
            whole_set.registerTempTable("whole_set");
            // Cache each count in a local: DataFrame.count() runs a full Spark
            // job, so calling it twice recomputes the whole ES-read lineage.
            long wholeCount = whole_set.count();
            System.out.println("the sum of the whole set is " + (int) wholeCount);

            // Tablespaces without a configured threshold: warn above the 85% default.
            DataFrame utilization_normal_warning = sql.sql("SELECT HOST_NAME,INST_NAME,DB_NAME,TBSP_NAME,TBSP_UTILIZATION_PERCENT,"
                + "LOADED_TIMESTAMP,TBSP_TYPE,85 as THD,concat(TBSP_NAME,' used ',TBSP_UTILIZATION_PERCENT,'% space!') as RESULT FROM whole_set where THD is null and TBSP_UTILIZATION_PERCENT>85 order by TBSP_NAME");
            long normalCount = utilization_normal_warning.count();
            utilization_normal_warning.show((int) normalCount);
            utilization_normal_warning.registerTempTable("utilization_normal");
            System.out.println("the sum of the normal set is " + (int) normalCount);

            // Tablespaces with an explicit threshold: warn above that value.
            DataFrame utilization_excep_warning = sql.sql("SELECT c.HOST_NAME,c.INST_NAME,"
                + "c.DB_NAME,c.TBSP_NAME,c.TBSP_UTILIZATION_PERCENT,c.LOADED_TIMESTAMP,c.TBSP_TYPE,d.THD,concat(c.TBSP_NAME,' used ',c.TBSP_UTILIZATION_PERCENT,'% space!') as RESULT "
                + "FROM cdl_tbs_utilization c JOIN cdl_tbs_warn_exception d on c.HOST_NAME=d.HOST_NAME and "
                + "c.INST_NAME=d.INST_NAME and c.DB_NAME=d.DB_NAME and c.TBSP_NAME=d.TBS_NAME where c.TBSP_UTILIZATION_PERCENT>d.THD order by c.TBSP_NAME");
            long excepCount = utilization_excep_warning.count();
            utilization_excep_warning.show((int) excepCount);
            utilization_excep_warning.registerTempTable("utilization_excp");
            System.out.println("the sum of the excep set is " + (int) excepCount);

            // Persist both warning sets into the same ES index/type.
            JavaEsSparkSQL.saveToEs(utilization_normal_warning, "tbsp_compute/warning_data_20150921");
            JavaEsSparkSQL.saveToEs(utilization_excep_warning, "tbsp_compute/warning_data_20150921");
        } finally {
            jsc.stop(); // release Spark resources even if a query fails
        }
    }
}
5.demo运行结果: