Reading and Writing Elasticsearch with Hadoop MapReduce

1. Background

I recently needed to investigate how Hadoop MapReduce can interact with Elasticsearch, which naturally led me to the official Elasticsearch-Hadoop plugin. The official material, however, is rather thin on the implementation side. After stepping into a few pits, I wrote up this article; it is largely a concrete code implementation of that official guide.
The Elasticsearch-Hadoop project is hosted on GitHub: https://github.com/elastic/elasticsearch-hadoop

2. Elasticsearch-Hadoop Overview

Elasticsearch-Hadoop is a project that deeply integrates Hadoop with Elasticsearch, maintained by Elastic as an official subproject. By implementing the input and output paths between Hadoop and ES, it lets Hadoop jobs read data from and write data to an ES cluster, combining the parallelism of MapReduce with near-real-time search over Hadoop data.
The ES-Hadoop plugin supports MapReduce, Cascading, Hive, Pig, Spark, Storm, YARN, and other components.

3. Elasticsearch-Hadoop MapReduce Implementation

Environment setup

  1. Install Hadoop

  2. Install Elasticsearch

  3. Install Maven and Eclipse

  4. Download the test data set:
    https://pan.baidu.com/s/1dE6mOhj

  5. Bulk-import the test data set into Elasticsearch (an example of the expected file format follows):

    curl -XPOST 'localhost:9200/company/info/_bulk?pretty' --data-binary "@companys.json"
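
For the bulk import to work, companys.json must follow the Elasticsearch bulk API format: an action line followed by the document source on the next line, and the file must end with a trailing newline. Since the index and type are already given in the URL, a minimal sketch of the file (the fields here are made up) looks like:

    {"index":{}}
    {"name":"北京某科技有限公司","address":"海淀区"}
    {"index":{}}
    {"name":"上海某贸易有限公司","address":"浦东新区"}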


The Maven dependency for Elasticsearch-Hadoop is as follows:

<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>5.0.0</version>
</dependency>
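
Note that the elasticsearch-hadoop jar must also be visible to the map and reduce tasks at runtime, not only at compile time. In my setup (an assumption, not something the plugin mandates) the simplest route is to bundle it into a fat jar with the maven-shade-plugin; shipping it with -libjars also works when the driver parses options through ToolRunner.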

Implementation

The following is the code I ran in practice; it covers the basic operations.

MR: reading an entire index from ES and writing it to HDFS
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.elasticsearch.hadoop.mr.EsInputFormat;

public class ESWriteHdfsTest {

    public static class ESMap extends Mapper<Writable, Writable, NullWritable, Text> {
        @Override
        public void map(Writable key, Writable value, Mapper<Writable, Writable, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // We only want to dump the document body to HDFS, so the key is dropped
            Text docVal = new Text();
            docVal.set(value.toString());
            context.write(NullWritable.get(), docVal);
        }
    }

    public static void main(String[] args) throws Exception {

        long start_time = System.currentTimeMillis();
        Configuration conf = new Configuration();
        conf.set("es.nodes", "localhost:9200"); // Elasticsearch node(s) to connect to
        conf.set("es.resource", "company/info"); // index/type to read from

        Job job = Job.getInstance(conf,"hadoop elasticsearch");

        // Set the custom Mapper for the map phase
        job.setMapperClass(ESMap.class);
        job.setNumReduceTasks(0); // map-only job
        // Map output key/value types
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Read the input from Elasticsearch
        job.setInputFormatClass(EsInputFormat.class);
        // HDFS output path
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/es_output_company_info"));
        // Run the job
        job.waitForCompletion(true);
        System.out.println(System.currentTimeMillis()-start_time);
    }
}
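
A quick way to run the export and inspect the result, assuming the classes above are packaged into a jar named es-mr-test.jar (a made-up name):

hadoop jar es-mr-test.jar ESWriteHdfsTest
hdfs dfs -cat /es_output_company_info/part-m-00000 | head

By default EsInputFormat hands the mapper the document id as a Text key and the document body as a LinkedMapWritable value, so each line written here is a map-style rendering of the document rather than raw JSON; set es.output.json to true, as the next example does, when JSON output is needed.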
MR: querying ES and writing the results to HDFS
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.elasticsearch.hadoop.mr.EsInputFormat;

public class ESQueryTest {

    public static class ESMap extends Mapper<Writable, Writable, Text, Text> {
        @Override
        public void map(Writable key, Writable value, Mapper<Writable, Writable, Text, Text>.Context context)
                throws IOException, InterruptedException {
            context.write(new Text(key.toString()), new Text(value.toString()));
        }
    }

    public static void main(String[] args) throws Exception {
        long start_time = System.currentTimeMillis();
        Configuration conf = new Configuration();
        conf.set("es.nodes", "localhost:9200");
        conf.set("es.resource", "company/info");
        conf.set("es.output.json", "true");   // 生成json格式数据?
        conf.set("es.query", "?q=name:北京");

        Job job = Job.getInstance(conf, "hadoop elasticsearch");

        // Set the custom Mapper for the map phase
        job.setMapperClass(ESMap.class);
        job.setNumReduceTasks(0); // map-only job
        // Map output key/value types: document id and JSON source
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Read the input from Elasticsearch
        job.setInputFormatClass(EsInputFormat.class);
        // HDFS output path
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/es_output"));
        // Run the job
        job.waitForCompletion(true);
        System.out.println(System.currentTimeMillis()-start_time);
    }
}
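
Besides the ?q= URI form, es.query also accepts a full query DSL JSON string (or a path to an external resource holding one), so the same filter can be written as conf.set("es.query", "{\"query\":{\"match\":{\"name\":\"北京\"}}}").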
MR: reading from one ES index and writing to another
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.elasticsearch.hadoop.mr.EsInputFormat;
import org.elasticsearch.hadoop.mr.EsOutputFormat;
import org.elasticsearch.hadoop.mr.LinkedMapWritable;

public class WriteES2ES {

    public static class ESMap extends Mapper<Text, LinkedMapWritable, Text, LinkedMapWritable> {
        @Override
        public void map(Text key, LinkedMapWritable value,
                Mapper<Text, LinkedMapWritable, Text, LinkedMapWritable>.Context context)
                throws IOException, InterruptedException {
            context.write(key, value);
        }
    }

    public static class ESReduce extends Reducer<Text, LinkedMapWritable, Text, LinkedMapWritable> {
        @Override
        public void reduce(Text key, Iterable<LinkedMapWritable> values,
                Reducer<Text, LinkedMapWritable, Text, LinkedMapWritable>.Context context)
                throws IOException, InterruptedException {
            for (LinkedMapWritable value : values) {
                context.write(key, value);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        long start_time = System.currentTimeMillis();

        Configuration conf = new Configuration();

        // Writing to ES: disable speculative execution so documents are not indexed twice
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        conf.set("es.nodes", "localhost:9200");
        conf.set("es.resource.read", "company/info"); // source index/type
        conf.set("es.resource.write", "company_new/info"); // target index/type
        conf.set("es.query", "?q=name:北京");

        Job job = Job.getInstance(conf,"hadoop elasticsearch");

        // Set the custom Mapper and Reducer
        job.setMapperClass(ESMap.class);
        job.setReducerClass(ESReduce.class);

        // Output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LinkedMapWritable.class);

        // Read from one ES index and write to another
        job.setInputFormatClass(EsInputFormat.class);
        job.setOutputFormatClass(EsOutputFormat.class);

        // Run the job
        job.waitForCompletion(true);
        System.out.println(System.currentTimeMillis()-start_time);
    }
}
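
One thing to be aware of: EsOutputFormat ignores the output key, so the documents copied into company_new receive fresh auto-generated ids. If the original ids must be preserved, put the id into a field of the document and point es.mapping.id at it, e.g. conf.set("es.mapping.id", "id") (assuming the documents carry such a field).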
Importing JSON data on HDFS into ES
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.elasticsearch.hadoop.mr.EsOutputFormat;

public class WriteJsonToES {

    public static class SomeMapper extends Mapper<Object, Text, NullWritable, BytesWritable> {

        @Override
        public void map(Object key, Text value, Mapper<Object, Text, NullWritable, BytesWritable>.Context context) throws IOException, InterruptedException {
            // Each input line is expected to hold one self-contained JSON document
            byte[] source = value.toString().trim().getBytes("UTF-8");
            BytesWritable jsonDoc = new BytesWritable(source);
            context.write(NullWritable.get(), jsonDoc);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Disable speculative execution so documents are not indexed twice
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        conf.set("es.nodes", "localhost:9200");
        conf.set("es.resource", "index_1/test"); // target index/type
        conf.set("es.input.json", "yes"); // the mapper emits raw JSON, not Writable maps

        Job job = Job.getInstance(conf,"hadoop es write test");
        job.setMapperClass(SomeMapper.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(EsOutputFormat.class);

        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(BytesWritable.class);

        // HDFS input path holding the JSON data
        FileInputFormat.setInputPaths(job, new Path("hdfs://localhost:9000/input_json"));

        job.waitForCompletion(true);
    }
}
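
For reference, the files under /input_json are assumed to contain one self-contained JSON object per line (the fields here are made up):

{"name":"北京某科技有限公司","address":"海淀区"}
{"name":"上海某贸易有限公司","address":"浦东新区"}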

