MapReduce Integration
This document assumes the Elasticsearch (ES) and Hadoop clusters are already installed, and follows:
https://www.elastic.co/guide/en/elasticsearch/hadoop/5.6/mapreduce.html
Version information:
HDP version : HDP-2.6.1.0
ES version : 5.6.0
MR version : 2.7.3
Maven version : 3.5
1. The elasticsearch-hadoop.jar dependency
1.1. Add elasticsearch-hadoop to the Hadoop cluster environment
This document adds the dependency to the Maven project's pom file:
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>5.6.0</version>
</dependency>
*Pitfall: keep the dependency <version> in the pom consistent with the ES version.
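If the full elasticsearch-hadoop artifact (which also bundles the Hive, Pig, and Spark integrations) is more than needed, Maven Central also publishes a MapReduce-only artifact. This alternative is untested in this guide but should be interchangeable here:
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop-mr</artifactId>
    <version>5.6.0</version>
</dependency>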
2. Prepare the data
2.1. Save the following content to blog.json
{"id":"1","title":"git简介","posttime":"2016-06-11","content":"svn与git的最主要区别..."}
{"id":"2","title":"ava中泛型的介绍与简单使用","posttime":"2016-06-12","content":"基本操作:CRUD"}
{"id":"3","title":"SQL基本操作","posttime":"2016-06-13","content":"svn与git的最主要区别..."}
{"id":"4","title":"Hibernate框架基础","posttime":"2016-06-14","content":"Hibernate框架基础..."}
{"id":"5","title":"Shell基本知识","posttime":"2016-06-15","content":"Shell是什么..."}
2.2. Start Hadoop and upload blog.json to HDFS
hadoop fs -put blog.json /work
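Optionally, confirm the upload before running the job:
hadoop fs -ls /work
hadoop fs -cat /work/blog.json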
3. Index documents from HDFS into ES
3.1. The code is as follows
package com.hand.es.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.elasticsearch.hadoop.cfg.ConfigurationOptions;
import org.elasticsearch.hadoop.mr.EsOutputFormat;

import java.io.IOException;

/**
 * Created by bee on 4/1/17.
 */
public class HdfsToES {

    public static class MyMapper extends Mapper<Writable, Writable, NullWritable, Text> {

        @Override
        public void map(Writable key, Writable value,
                        Mapper<Writable, Writable, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // Each input line is already a JSON document; pass it through unchanged.
            Text docVal = new Text();
            docVal.set(value.toString());
            context.write(NullWritable.get(), docVal);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Disable speculative execution so documents are not written twice.
        conf.setBoolean("mapred.map.tasks.speculative.execution", false);
        conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
        conf.set(ConfigurationOptions.ES_NODES, "10.211.55.241");
        conf.set(ConfigurationOptions.ES_PORT, "9200");
        conf.set(ConfigurationOptions.ES_INPUT_JSON, "yes");
        // conf.set(ConfigurationOptions.ES_MAPPING_ID, "id");
        conf.set(ConfigurationOptions.ES_RESOURCE, "company/input01");

        Job job = Job.getInstance(conf);
        job.setJobName("hdfs es write test");
        job.setMapperClass(HdfsToES.MyMapper.class);
        job.setJarByClass(HdfsToES.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(EsOutputFormat.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Set the input path.
        FileInputFormat.setInputPaths(job, new Path("hdfs://hdfs01.edcs.org:8020/work"));

        job.waitForCompletion(true);
    }
}
3.2. Code walkthrough
The map phase reads the input line by line; the input key arrives as the line offset and the input value as the line content (Text). The output key is of type NullWritable. NullWritable is a special Writable with an empty implementation: it reads nothing from the data stream and writes nothing to it, serving purely as a placeholder. When MapReduce has no use for a key or a value, it can be declared as NullWritable, which is why the output key is set to NullWritable here. The output value is of type Text and carries the serialized JSON string.
Because the job only writes to ES, there is no reduce phase. In the main method, a Configuration object conf is created first, and the job is configured through it:
conf.setBoolean("mapred.map.tasks.speculative.execution", false);//关闭mapper阶段的执行推测
conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);//关闭reducer阶段的执行推测
conf.set("es.nodes", "192.168.1.111:9200");//配置Elasticsearch的IP和端口
conf.set("es.resource", "blog/csdn");//设置索引到Elasticsearch的索引名和类型名。
conf.set("es.mapping.id", "id");//设置文档id,这个参数”id”是文档中的id字段
conf.set("es.input.json", "yes");//指定输入的文件类型为json。
job.setInputFormatClass(TextInputFormat.class);/设置输入流为文本类型
job.setOutputFormatClass(EsOutputFormat.class);//设置输出为EsOutputFormat类型。
job.setMapOutputKeyClass(NullWritable.class);//设置Map的输出key类型为NullWritable类型
job.setMapOutputValueClass(Test.class);//设置Map的输出value类型为BytesWritable类型
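With the configuration in place, package the project and submit the job. The jar name below is hypothetical, and the elasticsearch-hadoop jar must be on the job classpath (for example via a shaded jar or the -libjars generic option):
hadoop jar es-hadoop-demo.jar com.hand.es.hadoop.HdfsToES
Once the job completes, the indexed documents can be checked with, for example:
curl 'http://10.211.55.241:9200/company/input01/_search?pretty'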
*Notes
1. The es-hadoop write path only accepts the map output as ready-made JSON when es.input.json is enabled. If es.input.json is left at its default or set to false, es-hadoop instead stores each map output value as a field of the generated document (see the sketch after these notes). Many online examples set job.setMapOutputValueClass(BytesWritable.class); but against version 5.6.0 this produces a conversion exception and an HTTP 400 error from ES.
2. If the document ID in ES should come from the JSON, set conf.set("es.mapping.id", "id"); the "id" value must name a field of the document itself and must not be "_id", which is an ES metadata field.
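For note 1, the non-JSON write path looks roughly like the following. This is a minimal sketch following the MapWritable pattern from the es-hadoop documentation; the field name "title" is illustrative:

public static class StructuredMapper extends Mapper<Object, Object, NullWritable, MapWritable> {
    @Override
    protected void map(Object key, Object value, Context context)
            throws IOException, InterruptedException {
        // With es.input.json left at false, each MapWritable entry
        // becomes one field of the indexed document.
        MapWritable doc = new MapWritable();
        doc.put(new Text("title"), new Text(value.toString()));
        context.write(NullWritable.get(), doc);
    }
}

The job must then declare job.setMapOutputValueClass(MapWritable.class); and import org.apache.hadoop.io.MapWritable.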
4. Export from ES to HDFS
4.1. The code is as follows:
package com.hand.es.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.elasticsearch.hadoop.cfg.ConfigurationOptions;
import org.elasticsearch.hadoop.mr.EsInputFormat;

import java.io.IOException;

public class EsToHdfs {

    public static class ESMap extends Mapper<Writable, Writable, NullWritable, Text> {

        @Override
        public void map(Writable key, Writable value,
                        Mapper<Writable, Writable, NullWritable, Text>.Context context)
                throws IOException, InterruptedException {
            // With es.output.json enabled, the value is the document as a JSON string.
            Text docVal = new Text();
            docVal.set(value.toString());
            context.write(NullWritable.get(), docVal);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set(ConfigurationOptions.ES_NODES, "10.211.55.241");
        conf.set(ConfigurationOptions.ES_PORT, "9200");
        conf.set(ConfigurationOptions.ES_OUTPUT_JSON, "true");
        conf.set(ConfigurationOptions.ES_RESOURCE, "company/info");

        Job job = Job.getInstance(conf, "es to hdfs test");
        // Set the custom mapper for the map phase.
        job.setMapperClass(ESMap.class);
        job.setJarByClass(EsToHdfs.class);
        job.setNumReduceTasks(0);
        // Set the map output types.
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Read the input from Elasticsearch.
        job.setInputFormatClass(EsInputFormat.class);
        // Set the output path.
        FileOutputFormat.setOutputPath(job, new Path("hdfs://hdfs01.edcs.org:8020/data"));
        // Run the MR job.
        job.waitForCompletion(true);
    }
}
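To export only part of the index, es-hadoop supports the es.query setting, which accepts a URI query, a query DSL string, or an external resource. A minimal sketch added to the main method above; the query itself is illustrative:

// Only documents matching the query are read from ES;
// "?q=..." is the URI-query form, and full query DSL JSON also works.
conf.set("es.query", "?q=title:Hibernate");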
Pitfall notes:
1. The HDFS output directory the job writes to must not already exist, or the job will fail; delete it first if necessary (see the sketch below).
2. When writing the files, make sure job.setJarByClass(EsToHdfs.class); is set, so that Hadoop can locate the job jar.
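For pitfall 1, a common workaround is to delete the output directory before submitting the job. A minimal sketch using the standard HDFS API (it additionally requires import org.apache.hadoop.fs.FileSystem;):

Path out = new Path("hdfs://hdfs01.edcs.org:8020/data");
FileSystem fs = out.getFileSystem(conf); // resolve the FileSystem for this path
if (fs.exists(out)) {
    fs.delete(out, true); // true = delete recursively
}
FileOutputFormat.setOutputPath(job, out);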