【datax采集ftp、txt、hdfs不支持多字符分隔解决方案】

该方式仅支持TxtFileReader, OSSReader,FtpReader, HdfsReader,其中ftpReader已验证。下文所有描述都只针对这几种数据源。
注意:修改为多字符分隔后,采集速度大概只有之前的1/3。所以如果源数据同时存在单字符分隔和多字符分隔两种情况,建议复制一份datax安装包,单独修改插件。

为什么Datax不支持多字符分隔

查看类com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderUtil可以看到,上面几种数据源,datax采用CsvReader作为解析类去解析文件流中的字段、行数据。CsvReader的readRecord()方法通过逐个字符读取,并与char类型的分隔符变量Delimiter比较。由此可以看到,要想支持多字符分隔,只有自己实现一个解析器类。为了省事,我直接复用CsvReader的大部分代码,仅做部分修改,删除无用的判断代码,达到支持多字符分隔的目的。

解决方案

  1. 先从github下载datax源码
  2. idea打开编码,调试各种环境(具体过程略过)
  3. 找到plugin-unstructured-storage-util模块
  4. 新增两个自定义类 com.csvreader.MyCsvReader、com.csvreader.MyCircularFifoQueue
  5. MyCircularFifoQueue定义了一个定长的环形队列,并实现了toString方法。该队列用于存放每次从文件流中读取的字符,当已读字符数达到自定义分隔符长度后,后读进的字符会覆盖最早读到的字符,用于判断当前是否读到了分隔符。MyCsvReader为参考CsvReader实现的解析器。
com.csvreader.MyCircularFifoQueue代码如下:
package com.csvreader;

import org.apache.commons.collections4.queue.CircularFifoQueue;
import java.util.Iterator;

/**
 * @author lqlqg
 * 定义定长队列,该队列满足先进先出,当队列满了之后,插入数据会覆盖最早插入的数据,并提供元素转字符串功能
 * 2022/1/18 15:06
 */
/**
 * Fixed-capacity first-in-first-out window over the most recently offered characters.
 *
 * <p>Once the window is full, every further {@link #offer(Character)} overwrites the oldest
 * character. {@link #toString()} returns the stored characters from oldest to newest, which lets
 * a parser compare "the last N characters read" against an N-character delimiter.
 *
 * <p>The characters are kept in a primitive {@code char[]} ring so that the per-character
 * {@code offer}/{@code toString} calls made by the parser do not box characters or allocate an
 * iterator. Instances are not thread-safe.
 */
public class MyCircularFifoQueue {

    /** Ring storage; its length is the fixed capacity of the window. */
    private final char[] ring;

    /** Index in {@link #ring} of the oldest stored character. */
    private int head;

    /** Number of characters currently stored, between 0 and {@code ring.length}. */
    private int size;

    /**
     * Creates an empty window.
     *
     * @param i capacity of the window (the parser passes the delimiter length); must be positive
     * @throws IllegalArgumentException if {@code i} is zero or negative
     */
    public MyCircularFifoQueue(int i) {
        if (i <= 0) {
            throw new IllegalArgumentException("The size must be greater than 0");
        }
        this.ring = new char[i];
    }

    /**
     * Appends a character, evicting the oldest one when the window is already full.
     *
     * @param value character to store; must not be {@code null}
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public void offer(Character value) {
        if (value == null) {
            throw new NullPointerException("Attempted to add null object to queue");
        }
        if (size == ring.length) {
            // Full: the slot holding the oldest character receives the newest one,
            // and the element after it becomes the new oldest.
            ring[head] = value;
            head = (head + 1) % ring.length;
        } else {
            ring[(head + size) % ring.length] = value;
            size++;
        }
    }

    /**
     * Returns the stored characters concatenated from oldest to newest.
     *
     * @return the window content; the empty string when nothing has been offered yet
     */
    @Override
    public String toString() {
        if (head == 0) {
            // Not wrapped (or wrapped exactly back to the start): content is contiguous.
            return new String(ring, 0, size);
        }
        // head only moves once the ring is full, so the content is ring[head..] + ring[..head).
        StringBuilder ordered = new StringBuilder(ring.length);
        ordered.append(ring, head, ring.length - head);
        ordered.append(ring, 0, head);
        return ordered.toString();
    }
}
com.csvreader.MyCsvReader代码如下:
package com.csvreader;

import java.io.*;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.HashMap;

public class MyCsvReader {
   

    private Reader inputStream;// input stream
    private String fileName;// file name
    private MyCsvReader.UserSettings userSettings;// settings object produced from the parsed configuration
    private Charset charset;// character set
    private boolean useCustomRecordDelimiter;// whether a custom record delimiter is used
    private MyCsvReader.DataBuffer dataBuffer;
    private MyCsvReader.ColumnBuffer columnBuffer;
    private MyCsvReader.RawRecordBuffer rawBuffer;
    private boolean[] isQualified;
    private String rawRecord;
    private MyCsvReader.HeadersHolder headersHolder;
    private boolean startedColumn; // whether reading of the current column has started
    private boolean startedWithQualifier;// whether the column started with a text qualifier
    private boolean hasMoreData;// whether the input stream still has data
    private String lastLetter;
    private boolean hasReadNextLine;// whether to go on reading the next line
    private int columnsCount;
    private long currentRecord;
    private String[] values;// array holding the parsed column values of the record
    private boolean initialized;
    private boolean closed;// whether the reader has been closed

    /**
     * Character buffer for the raw text of a record, plus the write position inside it.
     * Starts with room for 500 characters and position 0.
     */
    private class RawRecordBuffer {

        public char[] Buffer;
        public int Position;

        public RawRecordBuffer() {
            this.Buffer = new char[500];
            this.Position = 0;
        }
    }

    /**
     * Character buffer for column text, plus the write position inside it.
     * Starts with room for 50 characters and position 0.
     */
    private class ColumnBuffer {

        public char[] Buffer;
        public int Position;

        public ColumnBuffer() {
            this.Buffer = new char[50];
            this.Position = 0;
        }
    }

    /**
     * Buffer for data taken from the input stream; it holds up to 1024 chars at a time.
     */
    private class DataBuffer {

        public char[] Buffer;
        /** Position the parser has read up to. */
        public int Position;
        /** Length of the data read from the input stream. */
        public int Count;
        /** Start position of the column currently being parsed. */
        public int ColumnStart;
        /** Start position of the current line. */
        public int LineStart;

        public DataBuffer() {
            this.Buffer = new char[1024];
            this.Position = 0;
            this.Count = 0;
            this.ColumnStart = 0;
            this.LineStart = 0;
        }
    }

    /**
     * Parser options together with their default values.
     */
    private class UserSettings {

        /** Text qualifier character. */
        public char TextQualifier;
        /** Whether whitespace is trimmed. */
        public boolean TrimWhitespace;
        /** Whether the text qualifier is used. */
        public boolean UseTextQualifier;
        /** Column delimiter; may be longer than one character. */
        public String Delimiter;
        /** Record delimiter. */
        public char RecordDelimiter;
        /** Comment character. */
        public char Comment;
        /** Whether comment lines are parsed. */
        public boolean UseComments;
        /** Safety-check switch. */
        public boolean SafetySwitch;
        /** Whether empty lines are skipped. */
        public boolean SkipEmptyRecords;
        /** Whether the raw record text is captured. */
        public boolean CaptureRawRecord;

        public UserSettings() {
            this.TextQualifier = '"';
            this.TrimWhitespace = true;
            this.UseTextQualifier = true;
            this.Delimiter = ",";
            this.RecordDelimiter = 0;
            this.Comment = '#';
            this.UseComments = false;
            this.SafetySwitch = true;
            this.SkipEmptyRecords = true;
            this.CaptureRawRecord = true;
        }
    }

    /**
     * Reads the next record from the data buffer, splitting columns on the (possibly
     * multi-character) {@code userSettings.Delimiter}.
     *
     * <p>Delimiter detection keeps a sliding window of the last {@code Delimiter.length()}
     * characters and compares it with the delimiter after every character. The window is
     * restarted after each match so that characters already consumed as a delimiter cannot take
     * part in a second, overlapping match (with delimiter {@code "||"} the input
     * {@code a||||b} yields {@code a}, empty, {@code b}).
     *
     * <p>When {@code userSettings.CaptureRawRecord} is set, {@code rawRecord} is rebuilt from the
     * raw/data buffers before returning; otherwise it is set to the empty string.
     *
     * @return the value of {@code hasReadNextLine}, which is reset to {@code false} on entry
     *         (presumably set by {@code endRecord()} once a record is complete)
     * @throws IOException if a column grows beyond 100,000 characters while the safety switch is
     *         on, or if one of the helper calls fails
     */
    public boolean readRecord() throws IOException {
        final String delimiter = this.userSettings.Delimiter;
        final int delimiterLength = delimiter.length();
        // Sliding window over the most recent characters, used to recognise the delimiter.
        MyCircularFifoQueue myQueue = new MyCircularFifoQueue(delimiterLength);
        this.checkClosed();
        this.columnsCount = 0; // number of columns read so far in this record
        this.rawBuffer.Position = 0; // raw-record buffer position
        this.dataBuffer.LineStart = this.dataBuffer.Position; // where this line starts in the data buffer
        this.hasReadNextLine = false; // only flipped once the current record has been completed
        if (this.hasMoreData) {
            while (true) {
                if (this.dataBuffer.Position == this.dataBuffer.Count) {
                    // Everything buffered has been consumed (or nothing was loaded yet).
                    this.checkDataLength();
                } else {
                    this.startedWithQualifier = false;
                    // First character of a new column or a new line.
                    char currentLetter = this.dataBuffer.Buffer[this.dataBuffer.Position];
                    myQueue.offer(currentLetter);
                    if (delimiter.equals(myQueue.toString())) {
                        // The window now ends with a complete delimiter.
                        this.lastLetter = delimiter;
                        this.endColumn();
                        // Consumed delimiter characters must not start another match.
                        myQueue = new MyCircularFifoQueue(delimiterLength);
                    } else if (this.useCustomRecordDelimiter || currentLetter != '\r' && currentLetter != '\n') {
                        if (this.userSettings.UseComments && this.columnsCount == 0 && currentLetter == this.userSettings.Comment) {
                            // Comment line.
                            this.lastLetter = String.valueOf(currentLetter);
                            this.skipLine();
                        } else if (this.userSettings.TrimWhitespace && (currentLetter == ' ' || currentLetter == '\t')) {
                            // Leading blank: the column starts after it.
                            this.startedColumn = true;
                            this.dataBuffer.ColumnStart = this.dataBuffer.Position + 1;
                            this.lastLetter = String.valueOf(currentLetter);
                        } else {
                            this.startedColumn = true;
                            this.dataBuffer.ColumnStart = this.dataBuffer.Position;
                            // On the first pass currentLetter is already loaded and already in the window.
                            boolean firstPass = true;

                            // Columns have no fixed length: keep reading until the column is closed
                            // by a delimiter or a record terminator, or the data runs out.
                            do {
                                if (!firstPass && this.dataBuffer.Position == this.dataBuffer.Count) {
                                    // Buffer exhausted in the middle of a column.
                                    this.checkDataLength();
                                } else {
                                    if (!firstPass) {
                                        currentLetter = this.dataBuffer.Buffer[this.dataBuffer.Position];
                                        myQueue.offer(currentLetter);
                                    }

                                    if (delimiter.equals(myQueue.toString())) {
                                        this.lastLetter = delimiter;
                                        this.endColumn();
                                        myQueue = new MyCircularFifoQueue(delimiterLength);
                                    } else if (!this.useCustomRecordDelimiter && (currentLetter == '\r' || currentLetter == '\n') || this.useCustomRecordDelimiter && currentLetter == this.userSettings.RecordDelimiter) {
                                        this.lastLetter = String.valueOf(currentLetter);
                                        this.endColumn();
                                        this.endRecord();
                                    }

                                    firstPass = false;
                                    if (this.startedColumn) {
                                        ++this.dataBuffer.Position;
                                        if (this.userSettings.SafetySwitch && this.dataBuffer.Position - this.dataBuffer.ColumnStart + this.columnBuffer.Position > 100000) {
                                            this.close();
                                            throw new IOException("Maximum column length of 100,000 exceeded in column " + NumberFormat.getIntegerInstance().format((long) this.columnsCount) + " in record " + NumberFormat.getIntegerInstance().format(this.currentRecord) + ". Set the SafetySwitch property to false" + " if you're expecting column lengths greater than 100,000 characters to" + " avoid this error.");
                                        }
                                    }
                                }
                            } while (this.hasMoreData && this.startedColumn);
                        }
                    } else {
                        // CR or LF at the start of a column/line. Remember what came before it:
                        // lastLetter has to be updated first because endColumn() reads its length.
                        String previousLetter = this.lastLetter;
                        this.lastLetter = String.valueOf(currentLetter);
                        if (!this.startedColumn && this.columnsCount <= 0 && (this.userSettings.SkipEmptyRecords || currentLetter != '\r' && "\r".equals(previousLetter))) {
                            // Blank line, or the LF half of a CR LF pair: just move the line start.
                            this.dataBuffer.LineStart = this.dataBuffer.Position + 1;
                        } else {
                            this.endColumn();
                            this.endRecord();
                        }
                    }

                    if (this.hasMoreData) {
                        this.dataBuffer.Position++;
                    }
                }

                if (!this.hasMoreData || this.hasReadNextLine) {
                    // delimiter.equals(...) rather than lastLetter.equals(...): lastLetter may
                    // still be null when no character has been processed.
                    if (this.startedColumn || delimiter.equals(this.lastLetter)) {
                        this.endColumn();
                        this.endRecord();
                    }
                    break;
                }
            }
        }

        // Optionally capture the raw record text.
        if (this.userSettings.CaptureRawRecord) {
            if (this.hasMoreData) {
                // The record is complete and the data buffer still holds data.
                if (this.rawBuffer.Position == 0) {
                    this.rawRecord = new String(this.dataBuffer.Buffer, this.dataBuffer.LineStart,
                            this.dataBuffer.Position - this.dataBuffer.LineStart - 1);
                } else {
                    this.rawRecord = new String(this.rawBuffer.Buffer, 0, this.rawBuffer.Position) + new String(this.dataBuffer.Buffer,
                            this.dataBuffer.LineStart, this.dataBuffer.Position - this.dataBuffer.LineStart - 1);
                }
            } else {
                this.rawRecord = new String(this.rawBuffer.Buffer, 0, this.rawBuffer.Position);
            }
        } else {
            this.rawRecord = "";
        }

        return this.hasReadNextLine;
    }

    private void endColumn() throws IOException {
   
        String var1 = "";
        int var2;
        if (this.startedColumn) {
   
            // columnBuffer用于存放上次读取到一半的数据,如果this.columnBuffer.Position,则直接从this.dataBuffer.ColumnStart读到分隔符即为字段值
            if (this.columnBuffer.Position == 0) {
   
                if (this.dataBuffer.ColumnStart < this.dataBuffer.Position) {
   
                    var2 = this.dataBuffer.Position - this.lastLetter.length();// 当前位置 减掉分隔符长度
                    if (this.userSettings.TrimWhitespace && !this.startedWithQualifier) {
   
                        // 删去字段结尾的空格或制表符
                        while 
  • 5
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值