DataX Secondary Development (9): Adding the s3reader and s3writer Modules

1 Background

DataX 3.0 supports reading and writing Alibaba Cloud OSS, but it does not support plain S3. Although OSS itself is built on the S3 protocol, some parameters differ, so S3Reader and S3Writer were developed following the design of Alibaba's OSSReader and OSSWriter.

2 Code Development

2.1 s3reader

2.1.1 Project Structure
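
The original structure screenshot is unavailable; the sketch below reconstructs the layout from package.xml and the files listed in this section, following the usual DataX plugin convention (pom.xml and the assembly path are assumptions):

s3reader
├── pom.xml
├── src/main/assembly
│   └── package.xml
├── src/main/java/com/alibaba/datax/plugin/reader/s3reader
│   ├── Constant.java
│   ├── Key.java
│   ├── S3Reader.java
│   ├── S3ReaderErrorCode.java
│   └── S3Util.java
└── src/main/resources
    ├── plugin.json
    └── plugin_job_template.json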

2.1.2 Code

package.xml

<assembly
        xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <formats>
        <format>dir</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/resources</directory>
            <includes>
                <include>plugin.json</include>
                <include>plugin_job_template.json</include>
            </includes>
            <outputDirectory>plugin/reader/s3reader</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>target/</directory>
            <includes>
                <include>s3reader-0.0.1-SNAPSHOT.jar</include>
            </includes>
            <outputDirectory>plugin/reader/s3reader</outputDirectory>
        </fileSet>
    </fileSets>

    <dependencySets>
        <dependencySet>
            <useProjectArtifact>false</useProjectArtifact>
            <outputDirectory>plugin/reader/s3reader/libs</outputDirectory>
            <scope>runtime</scope>
        </dependencySet>
    </dependencySets>
</assembly>
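
The assembly above copies the plugin's runtime dependencies into plugin/reader/s3reader/libs. The S3 client code in this module uses the AWS SDK for Java v1, so the module's pom.xml (not shown) needs it alongside the usual datax-common and plugin-unstructured-storage-util modules; a minimal sketch, with the version number as an assumption:

<dependency>
    <groupId>com.amazonaws</groupId>
    <artifactId>aws-java-sdk-s3</artifactId>
    <version>1.12.261</version>
</dependency>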

Constant.java

package com.alibaba.datax.plugin.reader.s3reader;

public class Constant {
    public static final String OBJECT = "object";
    public static final int CONNECT_TIMEOUT = 50000;
    public static final String SIGNER_OVERRIDE = "S3SignerType";
}

Key.java

package com.alibaba.datax.plugin.reader.s3reader;

public class Key {
    public static final String ENDPOINT = "endpoint";

    public static final String ACCESSKEY = "accessKey";

    public static final String SECRETKEY = "secretKey";

    public static final String BUCKET = "bucket";

    public static final String OBJECT = "object";
}

S3Reader.java

package com.alibaba.datax.plugin.reader.s3reader;

import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordSender;
import com.alibaba.datax.common.spi.Reader;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderUtil;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.ListObjectsRequest;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.S3Object;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.google.common.collect.Sets;
import org.apache.commons.io.Charsets;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.nio.charset.UnsupportedCharsetException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;


public class S3Reader extends Reader {
    public static class Job extends Reader.Job {
        private static final Logger LOG = LoggerFactory.getLogger(Job.class);

        private Configuration readerOriginConfig = null;

        @Override
        public void init() {
            LOG.debug("init() begin...");
            this.readerOriginConfig = this.getPluginJobConf();
            this.validate();
            LOG.debug("init() ok and end...");
        }

        private void validate() {
            String endpoint = this.readerOriginConfig.getString(Key.ENDPOINT);
            if (StringUtils.isBlank(endpoint)) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                        "您需要指定 endpoint");
            }

            String secretKey = this.readerOriginConfig.getString(Key.SECRETKEY);
            if (StringUtils.isBlank(secretKey)) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                        "您需要指定 secretKey");
            }

            String accessKey = this.readerOriginConfig.getString(Key.ACCESSKEY);
            if (StringUtils.isBlank(accessKey)) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                        "您需要指定 accessKey");
            }

            String bucket = this.readerOriginConfig.getString(Key.BUCKET);
            if (StringUtils.isBlank(bucket)) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                        "您需要指定 bucket");
            }

            String object = this.readerOriginConfig.getString(Key.OBJECT);
            if (StringUtils.isBlank(object)) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                        "您需要指定 object");
            }

            String fieldDelimiter = this.readerOriginConfig
                    .getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.FIELD_DELIMITER);
            // warn: the delimiter is consumed as a single character downstream
            if (null == fieldDelimiter || fieldDelimiter.length() == 0) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                        "您需要指定 fieldDelimiter");
            }

            String encoding = this.readerOriginConfig
                    .getString(
                            com.alibaba.datax.plugin.unstructuredstorage.reader.Key.ENCODING,
                            com.alibaba.datax.plugin.unstructuredstorage.reader.Constant.DEFAULT_ENCODING);
            try {
                Charsets.toCharset(encoding);
            } catch (UnsupportedCharsetException uce) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.ILLEGAL_VALUE,
                        String.format("不支持的编码格式 : [%s]", encoding), uce);
            } catch (Exception e) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.ILLEGAL_VALUE,
                        String.format("运行配置异常 : %s", e.getMessage()), e);
            }

            // check whether column is configured as ["*"]; if so, reset it to an empty list
            List<Configuration> column = this.readerOriginConfig
                    .getListConfiguration(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
            if (null != column
                    && 1 == column.size()
                    && ("\"*\"".equals(column.get(0).toString()) || "'*'"
                    .equals(column.get(0).toString()))) {
                readerOriginConfig
                        .set(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN,
                                new ArrayList<String>());
            } else {
                // column: 1. index type 2. value type 3. when type is Date,
                // there may also be a format
                List<Configuration> columns = this.readerOriginConfig
                        .getListConfiguration(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);

                if (null == columns || columns.size() == 0) {
                    throw DataXException.asDataXException(
                            S3ReaderErrorCode.CONFIG_INVALID_EXCEPTION,
                            "您需要指定 columns");
                }

                if (null != columns && columns.size() != 0) {
                    for (Configuration eachColumnConf : columns) {
                        eachColumnConf
                                .getNecessaryValue(
                                        com.alibaba.datax.plugin.unstructuredstorage.reader.Key.TYPE,
                                        S3ReaderErrorCode.REQUIRED_VALUE);
                        Integer columnIndex = eachColumnConf
                                .getInt(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.INDEX);
                        String columnValue = eachColumnConf
                                .getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.VALUE);

                        if (null == columnIndex && null == columnValue) {
                            throw DataXException.asDataXException(
                                    S3ReaderErrorCode.NO_INDEX_VALUE,
                                    "由于您配置了type, 则至少需要配置 index 或 value");
                        }

                        if (null != columnIndex && null != columnValue) {
                            throw DataXException.asDataXException(
                                    S3ReaderErrorCode.MIXED_INDEX_VALUE,
                                    "您混合配置了index, value, 每一列同时仅能选择其中一种");
                        }

                    }
                }
            }

            // only support compress: gzip,bzip2,zip
            String compress = this.readerOriginConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COMPRESS);
            if (StringUtils.isBlank(compress)) {
                this.readerOriginConfig.set(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COMPRESS, null);
            } else {
                Set<String> supportedCompress = Sets.newHashSet("gzip", "bzip2", "zip");
                compress = compress.toLowerCase().trim();
                if (!supportedCompress.contains(compress)) {
                    throw DataXException.asDataXException(
                            S3ReaderErrorCode.ILLEGAL_VALUE,
                            String.format(
                                    "仅支持 gzip, bzip2, zip 文件压缩格式 , 不支持您配置的文件压缩格式: [%s]",
                                    compress));
                }
                this.readerOriginConfig
                        .set(com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COMPRESS,
                                compress);
            }
        }

        @Override
        public void prepare() {
            LOG.debug("prepare()");
        }

        @Override
        public void post() {
            LOG.debug("post()");
        }

        @Override
        public void destroy() {
            LOG.debug("destroy()");
        }

        @Override
        public List<Configuration> split(int adviceNumber) {
            LOG.debug("split() begin...");
            List<Configuration> readerSplitConfigs = new ArrayList<Configuration>();

            // treat each individual object as one slice
            List<String> objects = parseOriginObjects(readerOriginConfig
                    .getList(Constant.OBJECT, String.class));
            if (0 == objects.size()) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.EMPTY_BUCKET_EXCEPTION,
                        String.format(
                                "未能找到待读取的Object,请确认您的配置项bucket: %s object: %s",
                                this.readerOriginConfig.get(Key.BUCKET),
                                this.readerOriginConfig.get(Key.OBJECT)));
            }

            for (String object : objects) {
                Configuration splitedConfig = this.readerOriginConfig.clone();
                splitedConfig.set(Constant.OBJECT, object);
                readerSplitConfigs.add(splitedConfig);
                LOG.info(String.format("S3 object to be read:%s", object));
            }
            LOG.debug("split() ok and end...");
            return readerSplitConfigs;
        }

        private List<String> parseOriginObjects(List<String> originObjects) {
            List<String> parsedObjects = new ArrayList<String>();

            for (String object : originObjects) {
                // locate the first wildcard ('*' or '?') in the object key; the
                // original OSS code took the larger index, which picks the wrong
                // position when both wildcards appear, so use the smaller
                // non-negative one
                int idxStar = object.indexOf('*');
                int idxQuestion = object.indexOf('?');
                int firstMetaChar;
                if (idxStar == -1 || idxQuestion == -1) {
                    firstMetaChar = Math.max(idxStar, idxQuestion);
                } else {
                    firstMetaChar = Math.min(idxStar, idxQuestion);
                }

                if (firstMetaChar != -1) {
                    int lastDirSeparator = object.lastIndexOf(
                            IOUtils.DIR_SEPARATOR, firstMetaChar);
                    String parentDir = object
                            .substring(0, lastDirSeparator + 1);
                    List<String> remoteObjects = getRemoteObjects(parentDir);
                    // glob to regex: '*' matches any run of characters, '?' matches
                    // exactly one (the original ".?" made it optional)
                    Pattern pattern = Pattern.compile(object.replace("*", ".*")
                            .replace("?", "."));

                    for (String remoteObject : remoteObjects) {
                        if (pattern.matcher(remoteObject).matches()) {
                            parsedObjects.add(remoteObject);
                        }
                    }
                } else {
                    parsedObjects.add(object);
                }
            }
            return parsedObjects;
        }

        private List<String> getRemoteObjects(String parentDir) {

            LOG.debug(String.format("父文件夹 : %s", parentDir));
            List<String> remoteObjects = new ArrayList<String>();
            AmazonS3 client = S3Util.initS3Client(readerOriginConfig);
            try {
                ListObjectsRequest listObjectsRequest = new ListObjectsRequest(
                        readerOriginConfig.getString(Key.BUCKET), parentDir, null, null, null
                );
                ObjectListing objectList;
                do {
                    objectList = client.listObjects(listObjectsRequest);
                    for (S3ObjectSummary objectSummary : objectList
                            .getObjectSummaries()) {
                        LOG.debug(String.format("找到文件 : %s",
                                objectSummary.getKey()));
                        remoteObjects.add(objectSummary.getKey());
                    }
                    listObjectsRequest.setMarker(objectList.getNextMarker());
                    LOG.debug(listObjectsRequest.getMarker());
                    LOG.debug(String.valueOf(objectList.isTruncated()));

                } while (objectList.isTruncated());
            } catch (IllegalArgumentException e) {
                throw DataXException.asDataXException(
                        S3ReaderErrorCode.S3_EXCEPTION, e.getMessage());
            }

            return remoteObjects;
        }
    }

    public static class Task extends Reader.Task {
        private static final Logger LOG = LoggerFactory.getLogger(Task.class);

        private Configuration readerSliceConfig;

        @Override
        public void startRead(RecordSender recordSender) {
            LOG.debug("read start");
            String bucket = readerSliceConfig.getString(Key.BUCKET);
            String object = readerSliceConfig.getString(Key.OBJECT);
            AmazonS3 client = S3Util.initS3Client(readerSliceConfig);

            S3Object s3Object = client.getObject(bucket, object);
            InputStream objectStream = s3Object.getObjectContent();
            UnstructuredStorageReaderUtil.readFromStream(objectStream, object,
                    this.readerSliceConfig, recordSender,
                    this.getTaskPluginCollector());
            recordSender.flush();
        }

        @Override
        public void init() {
            this.readerSliceConfig = this.getPluginJobConf();
        }

        @Override
        public void destroy() {

        }
    }
}
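
As a side note on the wildcard handling in parseOriginObjects above, the following self-contained sketch (with hypothetical object keys) shows how a glob such as logs/2022-11-*/part-?.csv is converted to a regex and matched against listed keys:

import java.util.regex.Pattern;

public class GlobDemo {
    public static void main(String[] args) {
        // hypothetical object keys, as listObjects might return them
        String[] remoteObjects = {
                "logs/2022-11-18/part-0.csv",
                "logs/2022-12-01/part-0.csv"
        };
        // same conversion as parseOriginObjects: '*' -> ".*", '?' -> "."
        String glob = "logs/2022-11-*/part-?.csv";
        Pattern pattern = Pattern.compile(glob.replace("*", ".*").replace("?", "."));
        for (String key : remoteObjects) {
            // prints true for the 2022-11-18 key and false for the other
            System.out.println(key + " -> " + pattern.matcher(key).matches());
        }
    }
}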

S3ReaderErrorCode.java

package com.alibaba.datax.plugin.reader.s3reader;

import com.alibaba.datax.common.spi.ErrorCode;

public enum S3ReaderErrorCode implements ErrorCode {
    RUNTIME_EXCEPTION("S3Reader-00", "运行时异常"),
    S3_EXCEPTION("S3Reader-01", "S3配置异常"),
    CONFIG_INVALID_EXCEPTION("S3Reader-02", "参数配置错误"),
    NOT_SUPPORT_TYPE("S3Reader-03", "不支持的类型"),
    CAST_VALUE_TYPE_ERROR("S3Reader-04", "无法完成指定类型的转换"),
    SECURITY_EXCEPTION("S3Reader-05", "缺少权限"),
    ILLEGAL_VALUE("S3Reader-06", "值错误"),
    REQUIRED_VALUE("S3Reader-07", "必选项"),
    NO_INDEX_VALUE("S3Reader-08", "没有 Index"),
    MIXED_INDEX_VALUE("S3Reader-09", "index 和 value 混合"),
    EMPTY_BUCKET_EXCEPTION("S3Reader-10", "您尝试读取的Bucket为空");

    private final String code;
    private final String description;

    private S3ReaderErrorCode(String code, String description) {
        this.code = code;
        this.description = description;
    }

    @Override
    public String getCode() {
        return this.code;
    }

    @Override
    public String getDescription() {
        return this.description;
    }

    @Override
    public String toString() {
        return String.format("Code:[%s], Description:[%s].", this.code,
                this.description);
    }
}

S3Util.java

package com.alibaba.datax.plugin.reader.s3reader;

import com.alibaba.datax.common.util.Configuration;
import com.amazonaws.ClientConfiguration;
import com.amazonaws.auth.AWSCredentials;
import com.amazonaws.auth.AWSCredentialsProvider;
import com.amazonaws.auth.AWSStaticCredentialsProvider;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.client.builder.AwsClientBuilder;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;

import java.util.HashMap;
import java.util.Map;

/**
 * @Description: S3 utility class
 * @Author: chenweifeng
 * @Date: 2022-11-18 13:56
 **/
public class S3Util {
    // quick local smoke test; replace the placeholder endpoint and credentials
    public static void main(String[] args) {
        Map<String, Object> map = new HashMap<>();
        map.put(Key.ENDPOINT, "http://x.x.x.x:9383");
        map.put(Key.ACCESSKEY, "admin");
        map.put(Key.SECRETKEY, "123456");
        Configuration conf = Configuration.from(map);
        initS3Client(conf);
    }

    /**
     * Initialize an AmazonS3 client from the job configuration.
     *
     * @param conf plugin configuration holding endpoint/accessKey/secretKey
     * @return a configured AmazonS3 client
     */
    public static AmazonS3 initS3Client(Configuration conf) {
        String endpoint = conf.getString(Key.ENDPOINT);
        String accessKey = conf.getString(Key.ACCESSKEY);
        String secretKey = conf.getString(Key.SECRETKEY);

        AmazonS3ClientBuilder amazonS3ClientBuilder = AmazonS3ClientBuilder.standard();

        ClientConfiguration clientConfiguration = new ClientConfiguration();
        // note: this protocol setting is ignored when the endpoint URL carries
        // an explicit scheme such as http://
        clientConfiguration.setProtocol(com.amazonaws.Protocol.HTTPS);
        clientConfiguration.setConnectionTimeout(Constant.CONNECT_TIMEOUT);
        // "S3SignerType" forces the legacy v2 S3 signer, which some
        // S3-compatible object stores require
        clientConfiguration.setSignerOverride(Constant.SIGNER_OVERRIDE);
        amazonS3ClientBuilder.setClientConfiguration(clientConfiguration);

        AWSCredentials awsCredentials = new BasicAWSCredentials(accessKey, secretKey);
        AWSCredentialsProvider awsCredentialsProvider = new AWSStaticCredentialsProvider(awsCredentials);
        amazonS3ClientBuilder.setCredentials(awsCredentialsProvider);

        AwsClientBuilder.EndpointConfiguration endpointConfiguration = new AwsClientBuilder.EndpointConfiguration(endpoint, null);
        amazonS3ClientBuilder.setEndpointConfiguration(endpointConfiguration);

        AmazonS3 amazonS3 = amazonS3ClientBuilder.build();
        return amazonS3;
    }
}

plugin.json

{
    "name": "s3reader",
    "class": "com.alibaba.datax.plugin.reader.s3reader.S3Reader",
    "description": "",
    "developer": "alibaba"
}
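
The assembly also packages plugin_job_template.json from src/main/resources. The original file is not shown; a minimal sketch assembled from Key.java and the unstructured-storage keys checked in validate() (all values are placeholders):

{
    "name": "s3reader",
    "parameter": {
        "endpoint": "http://x.x.x.x:9383",
        "accessKey": "",
        "secretKey": "",
        "bucket": "",
        "object": [""],
        "fieldDelimiter": ",",
        "encoding": "UTF-8",
        "column": [
            {
                "index": 0,
                "type": "string"
            }
        ]
    }
}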

2.2 s3writer

2.2.1 Project Structure
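
As with the reader, the layout screenshot is unavailable; the sketch below mirrors the reader module under the writer package (pom.xml and the assembly path again assumed):

s3writer
├── pom.xml
├── src/main/assembly
│   └── package.xml
├── src/main/java/com/alibaba/datax/plugin/writer/s3writer
│   ├── Constant.java
│   ├── Key.java
│   ├── S3Util.java
│   ├── S3Writer.java
│   └── S3WriterErrorCode.java
└── src/main/resources
    ├── plugin.json
    └── plugin_job_template.json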

2.2.2 Code

package.xml

<assembly
        xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <formats>
        <format>dir</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>src/main/resources</directory>
            <includes>
                <include>plugin.json</include>
                <include>plugin_job_template.json</include>
            </includes>
            <outputDirectory>plugin/writer/s3writer</outputDirectory>
        </fileSet>
        <fileSet>
            <directory>target/</directory>
            <includes>
                <include>s3writer-0.0.1-SNAPSHOT.jar</include>
            </includes>
            <outputDirectory>plugin/writer/s3writer</outputDirectory>
        </fileSet>
    </fileSets>

    <dependencySets>
        <dependencySet>
            <useProjectArtifact>false</useProjectArtifact>
            <outputDirectory>plugin/writer/s3writer/libs</outputDirectory>
            <scope>runtime</scope>
        </dependencySet>
    </dependencySets>
</assembly>

Constant.java

package com.alibaba.datax.plugin.writer.s3writer;


public class Constant {
    public static final String OBJECT = "object";
    public static final int CONNECT_TIMEOUT = 50000;
    public static final String SIGNER_OVERRIDE = "S3SignerType";
}

Key.java

package com.alibaba.datax.plugin.writer.s3writer;

public class Key {
    public static final String ENDPOINT = "endpoint";

    public static final String ACCESSKEY = "accessKey";

    public static final String SECRETKEY = "secretKey";

    public static final String BUCKET = "bucket";

    public static final String OBJECT = "object";

}

S3Util.java is the same as the reader's, except that the package changes to com.alibaba.datax.plugin.writer.s3writer.

S3Writer.java

package com.alibaba.datax.plugin.writer.s3writer;

import com.alibaba.datax.common.element.Record;
import com.alibaba.datax.common.exception.DataXException;
import com.alibaba.datax.common.plugin.RecordReceiver;
import com.alibaba.datax.common.spi.Writer;
import com.alibaba.datax.common.util.Configuration;
import com.alibaba.datax.common.util.RetryUtil;
import com.alibaba.datax.plugin.unstructuredstorage.writer.TextCsvWriterManager;
import com.alibaba.datax.plugin.unstructuredstorage.writer.UnstructuredStorageWriterUtil;
import com.alibaba.datax.plugin.unstructuredstorage.writer.UnstructuredWriter;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.CompleteMultipartUploadRequest;
import com.amazonaws.services.s3.model.CompleteMultipartUploadResult;
import com.amazonaws.services.s3.model.InitiateMultipartUploadRequest;
import com.amazonaws.services.s3.model.InitiateMultipartUploadResult;
import com.amazonaws.services.s3.model.ObjectListing;
import com.amazonaws.services.s3.model.PartETag;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.amazonaws.services.s3.model.UploadPartRequest;
import com.amazonaws.services.s3.model.UploadPartResult;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;


public class S3Writer extends Writer {
    public static class Job extends Writer.Job {
        private static final Logger LOG = LoggerFactory.getLogger(Job.class);

        private Configuration writerSliceConfig = null;
        private AmazonS3 s3client = null;

        @Override
        public void init() {
            this.writerSliceConfig = this.getPluginJobConf();
            this.validateParameter();
            this.s3client = S3Util.initS3Client(this.writerSliceConfig);
        }

        private void validateParameter() {
            this.writerSliceConfig.getNecessaryValue(Key.ENDPOINT,
                    S3WriterErrorCode.REQUIRED_VALUE);
            this.writerSliceConfig.getNecessaryValue(Key.SECRETKEY,
                    S3WriterErrorCode.REQUIRED_VALUE);
            this.writerSliceConfig.getNecessaryValue(Key.ACCESSKEY,
                    S3WriterErrorCode.REQUIRED_VALUE);
            this.writerSliceConfig.getNecessaryValue(Key.BUCKET,
                    S3WriterErrorCode.REQUIRED_VALUE);
            this.writerSliceConfig.getNecessaryValue(Key.OBJECT,
                    S3WriterErrorCode.REQUIRED_VALUE);

            // warn: do not support compress!!
            String compress = this.writerSliceConfig.getString(com.alibaba.datax.plugin.unstructuredstorage.writer.Key.COMPRESS);
            if (StringUtils.isNotBlank(compress)) {
                String errorMessage = String.format("S3写暂时不支持压缩, 该压缩配置项[%s]不起效用", compress);
                LOG.error(errorMessage);
                throw DataXException.asDataXException(S3WriterErrorCode.ILLEGAL_VALUE, errorMessage);
            }

            UnstructuredStorageWriterUtil.validateParameter(this.writerSliceConfig);
        }

        @Override
        public void prepare() {
            LOG.info("begin do prepare...");
            String bucket = this.writerSliceConfig.getString(Key.BUCKET);
            String object = this.writerSliceConfig.getString(Key.OBJECT);
            String writeMode = this.writerSliceConfig
                    .getString(com.alibaba.datax.plugin.unstructuredstorage.writer.Key.WRITE_MODE);
            // warn: bucket is not exists, create it
            try {
                // warn: do not create bucket for user
                if (!this.s3client.doesBucketExist(bucket)) {
                    // this.s3client.createBucket(bucket);
                    String errorMessage = String.format(
                            "您配置的bucket [%s] 不存在, 请您确认您的配置项.", bucket);
                    LOG.error(errorMessage);
                    throw DataXException.asDataXException(
                            S3WriterErrorCode.ILLEGAL_VALUE, errorMessage);
                }
                LOG.info(String.format("access control details [%s].",
                        this.s3client.getBucketAcl(bucket).toString()));

                // truncate option handler
                if ("truncate".equals(writeMode)) {
                    LOG.info(String
                            .format("由于您配置了writeMode truncate, 开始清理 [%s] 下面以 [%s] 开头的Object",
                                    bucket, object));
                    // warn: a single listObjects call returns at most one page of
                    // results, so keep listing and deleting until the prefix is empty
                    while (true) {
                        ObjectListing listing = null;
                        LOG.info("list objects with listObject(bucket, object)");
                        listing = this.s3client.listObjects(bucket, object);
                        List<S3ObjectSummary> objectSummarys = listing.getObjectSummaries();
                        for (S3ObjectSummary objectSummary : objectSummarys) {
                            LOG.info(String.format("delete oss object [%s].", objectSummary.getKey()));
                            this.s3client.deleteObject(bucket, objectSummary.getKey());
                        }
                        if (objectSummarys.isEmpty()) {
                            break;
                        }
                    }
                } else if ("append".equals(writeMode)) {
                    LOG.info(String
                            .format("由于您配置了writeMode append, 写入前不做清理工作, 数据写入Bucket [%s] 下, 写入相应Object的前缀为  [%s]",
                                    bucket, object));
                } else if ("nonConflict".equals(writeMode)) {
                    LOG.info(String
                            .format("由于您配置了writeMode nonConflict, 开始检查Bucket [%s] 下面以 [%s] 命名开头的Object",
                                    bucket, object));
                    ObjectListing listing = this.s3client.listObjects(bucket,
                            object);
                    if (0 < listing.getObjectSummaries().size()) {
                        StringBuilder objectKeys = new StringBuilder();
                        objectKeys.append("[ ");
                        for (S3ObjectSummary ossObjectSummary : listing.getObjectSummaries()) {
                            objectKeys.append(ossObjectSummary.getKey() + " ,");
                        }
                        objectKeys.append(" ]");
                        LOG.info(String.format("object with prefix [%s] details: %s", object, objectKeys.toString()));
                        throw DataXException.asDataXException(
                                S3WriterErrorCode.ILLEGAL_VALUE,
                                String.format("您配置的Bucket: [%s] 下面存在其Object有前缀 [%s].", bucket, object));
                    }
                }
            } catch (Exception e) {
                throw DataXException.asDataXException(
                        S3WriterErrorCode.S3_COMM_ERROR, e.getMessage());
            }
        }

        @Override
        public void post() {

        }

        @Override
        public void destroy() {

        }

        @Override
        public List<Configuration> split(int mandatoryNumber) {
            LOG.info("begin do split...");
            List<Configuration> writerSplitConfigs = new ArrayList<Configuration>();
            String object = this.writerSliceConfig.getString(Key.OBJECT);
            String bucket = this.writerSliceConfig.getString(Key.BUCKET);

            Set<String> allObjects = new HashSet<String>();
            try {
                List<S3ObjectSummary> ossObjectlisting = this.s3client.listObjects(bucket).getObjectSummaries();
                for (S3ObjectSummary objectSummary : ossObjectlisting) {
                    allObjects.add(objectSummary.getKey());
                }
            } catch (Exception e) {
                throw DataXException.asDataXException(
                        S3WriterErrorCode.S3_COMM_ERROR, e.getMessage());
            }

            String objectSuffix;
            for (int i = 0; i < mandatoryNumber; i++) {
                // handle same object name
                Configuration splitedTaskConfig = this.writerSliceConfig
                        .clone();

                String fullObjectName = null;
                objectSuffix = StringUtils.replace(
                        UUID.randomUUID().toString(), "-", "");
                fullObjectName = String.format("%s__%s", object, objectSuffix);
                while (allObjects.contains(fullObjectName)) {
                    objectSuffix = StringUtils.replace(UUID.randomUUID()
                            .toString(), "-", "");
                    fullObjectName = String.format("%s__%s", object,
                            objectSuffix);
                }
                allObjects.add(fullObjectName);

                splitedTaskConfig.set(Key.OBJECT, fullObjectName);

                LOG.info(String.format("splited write object name:[%s]",
                        fullObjectName));

                writerSplitConfigs.add(splitedTaskConfig);
            }
            LOG.info("end do split.");
            return writerSplitConfigs;
        }
    }

    public static class Task extends Writer.Task {
        private static final Logger LOG = LoggerFactory.getLogger(Task.class);

        private AmazonS3 s3client;
        private Configuration writerSliceConfig;
        private String bucket;
        private String object;
        private String nullFormat;
        private String encoding;
        private char fieldDelimiter;
        private String dateFormat;
        private DateFormat dateParse;
        private String fileFormat;
        private List<String> header;
        private Long maxFileSize;// MB
        private String suffix;

        @Override
        public void init() {
            this.writerSliceConfig = this.getPluginJobConf();
            this.s3client = S3Util.initS3Client(this.writerSliceConfig);
            this.bucket = this.writerSliceConfig.getString(Key.BUCKET);
            this.object = this.writerSliceConfig.getString(Key.OBJECT);
            this.nullFormat = this.writerSliceConfig
                    .getString(com.alibaba.datax.plugin.unstructuredstorage.writer.Key.NULL_FORMAT);
            this.dateFormat = this.writerSliceConfig
                    .getString(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.DATE_FORMAT,
                            null);
            if (StringUtils.isNotBlank(this.dateFormat)) {
                this.dateParse = new SimpleDateFormat(dateFormat);
            }
            this.encoding = this.writerSliceConfig
                    .getString(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.ENCODING,
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Constant.DEFAULT_ENCODING);
            this.fieldDelimiter = this.writerSliceConfig
                    .getChar(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.FIELD_DELIMITER,
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Constant.DEFAULT_FIELD_DELIMITER);
            this.fileFormat = this.writerSliceConfig
                    .getString(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.FILE_FORMAT,
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Constant.FILE_FORMAT_TEXT);
            this.header = this.writerSliceConfig
                    .getList(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.HEADER,
                            null, String.class);
            this.maxFileSize = this.writerSliceConfig
                    .getLong(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.MAX_FILE_SIZE,
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Constant.MAX_FILE_SIZE);
            this.suffix = this.writerSliceConfig
                    .getString(
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Key.SUFFIX,
                            com.alibaba.datax.plugin.unstructuredstorage.writer.Constant.DEFAULT_SUFFIX);
            this.suffix = this.suffix.trim();// warn: need trim
        }

        @Override
        public void startWrite(RecordReceiver lineReceiver) {
            // each part buffers roughly 10MB of text; with maxFileSize in MB the
            // object rolls over after maxFileSize/10 parts (e.g. 100MB -> 10 parts)
            final long partSize = 1024 * 1024 * 10L;
            long numberCalc = (this.maxFileSize * 1024 * 1024L) / partSize;
            final long maxPartNumber = numberCalc >= 1 ? numberCalc : 1;
            int objectRollingNumber = 0;
            // warn: StringWriter is backed by a synchronized StringBuffer; a
            // StringBuilder-based writer could avoid the locking overhead
            StringWriter sw = new StringWriter();
            StringBuffer sb = sw.getBuffer();
            UnstructuredWriter unstructuredWriter = TextCsvWriterManager
                    .produceUnstructuredWriter(this.fileFormat,
                            this.fieldDelimiter, sw);
            Record record = null;

            LOG.info(String.format(
                    "begin do write, each object maxFileSize: [%s]MB...",
                    maxPartNumber * 10));
            String currentObject = this.object;
            InitiateMultipartUploadRequest currentInitiateMultipartUploadRequest = null;
            InitiateMultipartUploadResult currentInitiateMultipartUploadResult = null;
            boolean gotData = false;
            List<PartETag> currentPartETags = null;
            // TODO: part-level retry could key off currentPartNumber; uploading
            // a part with an already-used number overwrites the existing part
            int currentPartNumber = 1;
            try {
                // warn
                boolean needInitMultipartTransform = true;
                while ((record = lineReceiver.getFromReader()) != null) {
                    gotData = true;
                    // init:begin new multipart upload
                    if (needInitMultipartTransform) {
                        if (objectRollingNumber == 0) {
                            if (StringUtils.isBlank(this.suffix)) {
                                currentObject = this.object;
                            } else {
                                currentObject = String.format("%s%s",
                                        this.object, this.suffix);
                            }
                        } else {
                            // currentObject is like(no suffix)
                            // myfile__9b886b70fbef11e59a3600163e00068c_1
                            if (StringUtils.isBlank(this.suffix)) {
                                currentObject = String.format("%s_%s",
                                        this.object, objectRollingNumber);
                            } else {
                                // or with suffix
                                // myfile__9b886b70fbef11e59a3600163e00068c_1.csv
                                currentObject = String.format("%s_%s%s",
                                        this.object, objectRollingNumber,
                                        this.suffix);
                            }
                        }
                        objectRollingNumber++;
                        currentInitiateMultipartUploadRequest = new InitiateMultipartUploadRequest(
                                this.bucket, currentObject);
                        currentInitiateMultipartUploadResult = this.s3client
                                .initiateMultipartUpload(currentInitiateMultipartUploadRequest);
                        currentPartETags = new ArrayList<PartETag>();
                        LOG.info(String
                                .format("write to bucket: [%s] object: [%s] with oss uploadId: [%s]",
                                        this.bucket, currentObject,
                                        currentInitiateMultipartUploadResult
                                                .getUploadId()));

                        // each object's header
                        if (null != this.header && !this.header.isEmpty()) {
                            unstructuredWriter.writeOneRecord(this.header);
                        }
                        // warn
                        needInitMultipartTransform = false;
                        currentPartNumber = 1;
                    }

                    // write: upload data to current object
                    UnstructuredStorageWriterUtil.transportOneRecord(record,
                            this.nullFormat, this.dateParse,
                            this.getTaskPluginCollector(), unstructuredWriter);

                    if (sb.length() >= partSize) {
                        this.uploadOnePart(sw, currentPartNumber,
                                currentInitiateMultipartUploadResult,
                                currentPartETags, currentObject);
                        currentPartNumber++;
                        sb.setLength(0);
                    }

                    // save: end current multipart upload
                    if (currentPartNumber > maxPartNumber) {
                        LOG.info(String
                                .format("current object [%s] size > %s, complete current multipart upload and begin new one",
                                        currentObject, currentPartNumber
                                                * partSize));
                        CompleteMultipartUploadRequest currentCompleteMultipartUploadRequest = new CompleteMultipartUploadRequest(
                                this.bucket, currentObject,
                                currentInitiateMultipartUploadResult
                                        .getUploadId(), currentPartETags);
                        CompleteMultipartUploadResult currentCompleteMultipartUploadResult = this.s3client
                                .completeMultipartUpload(currentCompleteMultipartUploadRequest);
                        LOG.info(String.format(
                                "final object [%s] etag is:[%s]",
                                currentObject,
                                currentCompleteMultipartUploadResult.getETag()));
                        // warn
                        needInitMultipartTransform = true;
                    }
                }

                if (!gotData) {
                    LOG.info("Receive no data from the source.");
                    currentInitiateMultipartUploadRequest = new InitiateMultipartUploadRequest(
                            this.bucket, currentObject);
                    currentInitiateMultipartUploadResult = this.s3client
                            .initiateMultipartUpload(currentInitiateMultipartUploadRequest);
                    currentPartETags = new ArrayList<PartETag>();
                    // each object's header
                    if (null != this.header && !this.header.isEmpty()) {
                        unstructuredWriter.writeOneRecord(this.header);
                    }
                }
                // warn: some data may still remain buffered in sb
                if (0 < sb.length()) {
                    this.uploadOnePart(sw, currentPartNumber,
                            currentInitiateMultipartUploadResult,
                            currentPartETags, currentObject);
                }
                CompleteMultipartUploadRequest completeMultipartUploadRequest = new CompleteMultipartUploadRequest(
                        this.bucket, currentObject,
                        currentInitiateMultipartUploadResult.getUploadId(),
                        currentPartETags);
                CompleteMultipartUploadResult completeMultipartUploadResult = this.s3client
                        .completeMultipartUpload(completeMultipartUploadRequest);
                LOG.info(String.format("final object etag is:[%s]",
                        completeMultipartUploadResult.getETag()));
            } catch (IOException e) {
                // dirty records were already collected by
                // UnstructuredStorageWriterUtil.transportOneRecord; header rows are
                // plain strings and are not treated as dirty data
                throw DataXException.asDataXException(
                        S3WriterErrorCode.WRITE_OBJECT_ERROR, e.getMessage());
            } catch (Exception e) {
                throw DataXException.asDataXException(
                        S3WriterErrorCode.WRITE_OBJECT_ERROR, e.getMessage());
            }
            LOG.info("end do write");
        }

        /**
         * Within one uploadId, the part number both uniquely identifies a chunk
         * and fixes its relative position inside the final object. Uploading new
         * data with an already-used part number overwrites that part on the server.
         *
         * @throws Exception
         */
        private void uploadOnePart(
                final StringWriter sw,
                final int partNumber,
                final InitiateMultipartUploadResult initiateMultipartUploadResult,
                final List<PartETag> partETags, final String currentObject)
                throws Exception {
            final String encoding = this.encoding;
            final String bucket = this.bucket;
            final AmazonS3 s3client = this.s3client;
            RetryUtil.executeWithRetry(new Callable<Boolean>() {
                @Override
                public Boolean call() throws Exception {
                    byte[] byteArray = sw.toString().getBytes(encoding);
                    InputStream inputStream = new ByteArrayInputStream(
                            byteArray);
                    // build an UploadPartRequest and upload this part
                    UploadPartRequest uploadPartRequest = new UploadPartRequest();
                    uploadPartRequest.setBucketName(bucket);
                    uploadPartRequest.setKey(currentObject);
                    uploadPartRequest.setUploadId(initiateMultipartUploadResult
                            .getUploadId());
                    uploadPartRequest.setInputStream(inputStream);
                    uploadPartRequest.setPartSize(byteArray.length);
                    uploadPartRequest.setPartNumber(partNumber);
                    UploadPartResult uploadPartResult = s3client
                            .uploadPart(uploadPartRequest);
                    partETags.add(uploadPartResult.getPartETag());
                    LOG.info(String
                            .format("upload part [%s] size [%s] Byte has been completed.",
                                    partNumber, byteArray.length));
                    IOUtils.closeQuietly(inputStream);
                    return true;
                }
            }, 3, 1000L, false);
        }

        @Override
        public void prepare() {

        }

        @Override
        public void post() {

        }

        @Override
        public void destroy() {

        }
    }
}

S3WriterErrorCode.java

package com.alibaba.datax.plugin.writer.s3writer;

import com.alibaba.datax.common.spi.ErrorCode;

public enum S3WriterErrorCode implements ErrorCode {

    CONFIG_INVALID_EXCEPTION("S3Writer-00", "您的参数配置错误."),
    REQUIRED_VALUE("S3Writer-01", "您缺失了必须填写的参数值."),
    ILLEGAL_VALUE("S3Writer-02", "您填写的参数值不合法."),
    WRITE_OBJECT_ERROR("S3Writer-03", "您配置的目标Object在写入时异常."),
    S3_COMM_ERROR("S3Writer-05", "执行相应的S3操作异常."),
    ;

    private final String code;
    private final String description;

    private S3WriterErrorCode(String code, String description) {
        this.code = code;
        this.description = description;
    }

    @Override
    public String getCode() {
        return this.code;
    }

    @Override
    public String getDescription() {
        return this.description;
    }

    @Override
    public String toString() {
        return String.format("Code:[%s], Description:[%s].", this.code,
                this.description);
    }

}

plugin.json

{
    "name": "s3writer",
    "class": "com.alibaba.datax.plugin.writer.s3writer.S3Writer",
    "description": "",
    "developer": "chenweifeng"
}

3 Debugging and Performance Testing

s3reader and s3writer job templates
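
The template screenshots are unavailable; below is a minimal end-to-end job sketch assembled from the reader and writer parameters shown above (endpoint, credentials, buckets, and object names are placeholders):

{
    "job": {
        "setting": {
            "speed": {
                "channel": 1
            }
        },
        "content": [
            {
                "reader": {
                    "name": "s3reader",
                    "parameter": {
                        "endpoint": "http://x.x.x.x:9383",
                        "accessKey": "admin",
                        "secretKey": "123456",
                        "bucket": "src-bucket",
                        "object": ["logs/2022-11-*.csv"],
                        "fieldDelimiter": ",",
                        "encoding": "UTF-8",
                        "column": [
                            {
                                "index": 0,
                                "type": "string"
                            }
                        ]
                    }
                },
                "writer": {
                    "name": "s3writer",
                    "parameter": {
                        "endpoint": "http://x.x.x.x:9383",
                        "accessKey": "admin",
                        "secretKey": "123456",
                        "bucket": "dst-bucket",
                        "object": "logs/output",
                        "fieldDelimiter": ",",
                        "encoding": "UTF-8",
                        "fileFormat": "csv",
                        "writeMode": "truncate"
                    }
                }
            }
        ]
    }
}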

s3reader and s3writer run results
