Reading data from ES and writing it to the local filesystem (ES running locally)
The program consists of two files: a job driver file and a mapper file that processes the data read from ES.
Because this is a Maven project, a pom file is also required; it is shown below.
pom file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>ES2hadoop</artifactId>
<version>1.0-SNAPSHOT</version>
<repositories>
<repository>
<id>huaweicloudsdk</id>
<url>https://mirrors.huaweicloud.com/repository/maven/huaweicloudsdk/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<!-- Aliyun mirror -->
<repository>
<id>alimaven</id>
<name>aliyun maven</name>
<url>https://maven.aliyun.com/repository/public</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<properties>
<java.version>1.8</java.version>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.1.1-hw-ei-312005</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch-hadoop</artifactId>
<version>7.16.2</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.28</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.1.1-hw-ei-312005</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>3.1.1-hw-ei-312005</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.2.4</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<artifactSet>
<excludes>
<exclude>com.google.code.findbugs:jsr305</exclude>
<exclude>org.slf4j:*</exclude>
<exclude>log4j:*</exclude>
<exclude>org.apache.hadoop:*</exclude>
</excludes>
</artifactSet>
<filters>
<filter>
<!-- Do not copy the signatures in the META-INF folder.
Otherwise, this might cause SecurityExceptions when using the JAR. -->
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
<transformers combine.children="append">
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer">
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
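With this pom in place, the jar is built with the standard Maven command:
mvn clean package
The maven-shade-plugin runs in the package phase and bundles elasticsearch-hadoop into the resulting jar; the Hadoop artifacts are excluded from shading because they are expected to be provided by the runtime classpath.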
The mapper program is as follows:
package com.es_hadoop_example;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
class E2HMapper extends Mapper<Text, LinkedMapWritable, Text, LinkedMapWritable> {
private static final Logger LOG = LoggerFactory.getLogger(E2HMapper.class);
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
}
/**
 * key: the id of the ES document
 * value: the content of one document
 */
@Override
protected void map(Text key, LinkedMapWritable value, Context context)
throws IOException, InterruptedException {
LOG.info("key {} value {}", key, value);
context.write(key, value); // pass the data through unchanged
}
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
super.cleanup(context);
}
}
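The mapper above passes every document through unchanged. Since LinkedMapWritable implements Map<Writable, Writable>, individual fields can be looked up by name when some processing is needed. A minimal sketch of such a map method, assuming the documents contain a "name" field (the field name is hypothetical, not part of the original program):
// Hypothetical variant of map(): emit only documents that contain a "name" field.
// Requires: import org.apache.hadoop.io.Writable;
@Override
protected void map(Text key, LinkedMapWritable value, Context context)
        throws IOException, InterruptedException {
    Writable name = value.get(new Text("name")); // LinkedMapWritable is a Map<Writable, Writable>
    if (name != null) { // "name" is an assumed field; adjust to your index mapping
        context.write(key, value);
    }
}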
The job driver file is as follows:
package com.es_hadoop_example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.elasticsearch.hadoop.mr.EsInputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * @Description: Reads data from ES and stores it locally; for now the data is not processed in any way.
 * @Author: zhangjiwei
 * @Date: -
 */
public class E2HJob {
private static final Logger LOG = LoggerFactory.getLogger(E2HJob.class);
public static void main(String[] args) {
try {
Configuration conf = new Configuration();
// Disable speculative execution, which would launch duplicate copies of the same task
// and cause the same data to be processed more than once
conf.setBoolean("mapreduce.map.speculative", false);
conf.setBoolean("mapreduce.reduce.speculative", false);
conf.set("es.nodes", "127.0.0.1:9200"); // Elasticsearch node
conf.set("es.resource", "ecommerce/product"); // Elasticsearch source: index/type
// conf.set("es.resource", "user/_doc");
// conf.set("es.resource", "kibana_sample_data_ecommerce/_doc"); // could not get this to run successfully???
/**
 * es.nodes.wan.only (default: false)
 * Whether the connector is used against an Elasticsearch instance in a cloud/restricted
 * environment over the WAN, such as Amazon Web Services. In this mode the connector
 * disables node discovery and only connects through the declared es.nodes for all
 * operations, including reads and writes. Note that performance is significantly
 * affected in this mode.
 */
conf.set("es.nodes.wan.only", "true"); // Disable automatic discovery of other nodes and force the connector to use only the "es.nodes" addresses; by default it would also try to connect to localhost.
/**
 * In the debugger the next line may appear to throw
 * "Method threw 'java.lang.IllegalStateException' exception. Cannot evaluate org.apache.hadoop.mapreduce.Job.toString()".
 * This is because the job has not been submitted yet while the debugger tries to call job.toString().
 * It is harmless; the exception is handled automatically.
 */
Job job = Job.getInstance(conf, "JOBE2H"); // build the job object
job.setJarByClass(E2HJob.class); // main class of the jar
job.setInputFormatClass(EsInputFormat.class); // input format class
job.setMapperClass(E2HMapper.class); // mapper class
job.setMapOutputKeyClass(Text.class); // map output key class
job.setMapOutputValueClass(LinkedMapWritable.class); // map output value class
FileSystem fs = FileSystem.get(conf);
Path outPath = new Path("D:\\test\\es_data");
if(fs.exists(outPath)) {
fs.delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath); // output path
System.out.println(job.waitForCompletion(true)); // prints true if the job succeeded
} catch (Exception e) {
LOG.error(e.getMessage(), e);
}
}
}
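As written, the job reads the whole ecommerce/product resource. elasticsearch-hadoop also honors the es.query setting, which limits the read to documents matching a URI query or full query DSL, so only a subset is transferred. A minimal sketch (the field and value are made up for illustration):
conf.set("es.query", "?q=category:shoes"); // URI query; hypothetical field/value
// or the same restriction as full query DSL:
// conf.set("es.query", "{\"query\":{\"match\":{\"category\":\"shoes\"}}}");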