- 在spark程序中引入elasticsearch
- 引入elasticsearch的依赖,将elasticsearch-hadoop上传到集群中,这里scope范围为provided即可。
<dependencies>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>2.4.0</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.10</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.10</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
</exclusion>
<exclusion>
<groupId>cascading</groupId>
<artifactId>cascading-hadoop</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
<repositories>
<repository>
<id>cloudera-repos</id>
<name>Cloudera Repos</name>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
<repository>
<id>akka-repository</id>
<url>http://repo.akka.io/releases</url>
</repository>
<repository>
<id>jboss</id>
<url>http://repository.jboss.org/nexus/content/groups/public-jboss</url>
</repository>
<repository>
<id>sonatype-snapshots</id>
<url>http://oss.sonatype.org/content/repositories/snapshots/</url>
</repository>
<repository>
<id>sonatype-oss</id>
<url>http://oss.sonatype.org/content/repositories/snapshots</url>
<snapshots><enabled>true</enabled></snapshots>
</repository>
</repositories>
- 在代码中使用elasticsearch
import org.elasticsearch.spark.sql._
def main(args: Array[String]): Unit = {
  // Configure Spark with the Elasticsearch connector settings
  // (picked up by elasticsearch-hadoop via the "es.*" keys).
  val conf = new SparkConf()
  conf.setAppName("Spark Action ElasticSearch")
  // Create the target index automatically if it does not exist yet.
  conf.set("es.index.auto.create", "true")
  conf.set("es.nodes","192.168.1.11")
  conf.set("es.port","9200")
  val sc: SparkContext = new SparkContext(conf)
  val sqlContext = new HiveContext(sc)
  val df: DataFrame = sqlContext.sql("select * from info limit 50")
  // Save the DataFrame to ES under index/type "myindex/info".
  df.saveToEs("myindex/info")
  // Read the data back from ES through the connector's data source.
  // (Original had this note as a bare line, which would not compile.)
  val esdf = sqlContext.read.format("org.elasticsearch.spark.sql").load("myindex/info")
  esdf.count
  sc.stop()
}
- 在spark-shell中引入elasticsearch,以cdh为例。
- 去maven中央仓库下载elasticsearch-hadoop的jar包,将jar包上传到目录:/opt/cloudera/parcels/CDH-5.5.0-1.cdh5.5.0.p0.8/jars/中,在/opt/cloudera/parcels/CDH/lib/spark/conf/classpath.txt(spark的classpath配置文件)文件中最后添加如下内容:
/opt/cloudera/parcels/CDH-5.5.0-1.cdh5.5.0.p0.8/jars/elasticsearch-hadoop-2.4.0.jar
- 启动spark-shell,命令如下:
spark-shell --master yarn --conf spark.es.nodes=192.168.1.11 --conf spark.es.port=9200 --conf spark.es.index.auto.create=true
- 使用jdbc连接elasticsearch查询
- 引入maven依赖
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>org.nlpcn</groupId>
<artifactId>elasticsearch-sql</artifactId>
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>druid</artifactId>
<version>1.0.15</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.35</version>
</dependency>
- 使用代码查询
/**
 * Runs an SQL query against Elasticsearch through the elasticsearch-sql
 * JDBC bridge and prints every matching document.
 *
 * Uses try-with-resources so the connection, statement and result set are
 * closed even when the query throws; the original closed them only on the
 * success path and leaked them on any exception.
 */
public static void query(){
    // ES-SQL convention: "index/type" stands in for the table name.
    String sql = "select * from bigdata/student where usertype > 5 limit 5";
    try (Connection connection = getConnection();
         PreparedStatement ps = connection.prepareStatement(sql);
         ResultSet rs = ps.executeQuery()) {
        while (rs.next()) {
            System.out.println(rs.getString("_id") + " " + rs.getString("recordtime")
                + " " + rs.getInt("area") + " " + rs.getInt("usertype") + " " + rs.getInt("count"));
        }
    } catch (Exception e) {
        // Keep the original best-effort behavior: log and continue.
        e.printStackTrace();
    }
}
/**
* 获取 ES jdbc连接
*/
/**
 * Opens a JDBC connection to Elasticsearch via the elasticsearch-sql
 * Druid data-source factory.
 *
 * NOTE(review): the URL targets port 9300 — presumably the ES transport
 * port rather than the HTTP port 9200 used elsewhere in this document;
 * confirm this matches the driver's expectation.
 */
public static Connection getConnection() throws Exception {
    Properties props = new Properties();
    props.put("url", "jdbc:elasticsearch://192.168.1.11:9300");
    DruidDataSource dataSource =
        (DruidDataSource) ElasticSearchDruidDataSourceFactory.createDataSource(props);
    return dataSource.getConnection();
}
官网参考资料:Elasticsearch for Apache Hadoop