该方式仅支持 TxtFileReader、OSSReader、FtpReader、HdfsReader 这几种数据源,其中 FtpReader 已验证。下文所有描述都只针对这几种数据源。
注意:修改为多字符分隔后,采集数据的速度大概只有之前的1/3。所以,如果源数据同时存在单字符分隔和多字符分隔两种情况,建议复制一份 DataX 安装包,单独修改插件,只在需要多字符分隔时使用修改后的版本。
为什么Datax不支持多字符分隔
查看类 com.alibaba.datax.plugin.unstructuredstorage.reader.UnstructuredStorageReaderUtil 可以看到,对于上面几种数据源,DataX 采用 CsvReader 作为解析类,去解析文件流中的字段和行数据。CsvReader 的 readRecord() 方法逐个字符读取,并与 char 类型的分隔符变量 Delimiter 比较。由此可以看到,要想支持多字符分隔,只能自己实现一个解析器类。为了省事,我直接复用了 CsvReader 的大部分代码,仅做部分修改,并删除无用的判断代码,以达到支持多字符分隔的目的。
解决方案
- 先从github下载datax源码
- 用 IDEA 打开项目,调试好各种环境(具体过程略过)
- 找到plugin-unstructured-storage-util模块
- 新增两个自定义类 com.csvreader.MyCsvReader、com.csvreader.MyCircularFifoQueue
- MyCircularFifoQueue 定义了一个定长的环形队列,并实现了 toString 方法。该队列用于存放每次从文件流中读取的字符,队列长度等于自定义分隔符的长度;队列存满之后,后读进的字符会覆盖最早读到的字符,这样就可以判断当前是否读到了分隔符。MyCsvReader 为参考 CsvReader 实现的解析器。
com.csvreader.MyCircularFifoQueue代码如下:
package com.csvreader;
import org.apache.commons.collections4.queue.CircularFifoQueue;
import java.util.Iterator;
/**
 * Fixed-capacity first-in-first-out queue of characters.
 *
 * <p>Once the queue holds {@code capacity} characters, each further
 * {@link #offer(Character)} discards the oldest character and stores the new
 * one. {@link #toString()} returns the retained characters from oldest to
 * newest, which lets a caller check whether the most recently read characters
 * spell out a multi-character delimiter.
 *
 * <p>The characters are kept in a primitive {@code char[]} ring so that the
 * per-character {@code offer}/{@code toString} cycle performs no boxing and
 * allocates no iterator. Instances are not thread-safe.
 *
 * @author lqlqg
 * 2022/1/18 15:06
 */
public class MyCircularFifoQueue {
    // Ring storage; its length is the fixed capacity of the queue.
    private final char[] ring;
    // Index of the oldest retained character. Stays 0 until the ring is full.
    private int head;
    // Number of characters currently retained (never exceeds ring.length).
    private int size;

    /**
     * Creates an empty queue.
     *
     * @param i capacity of the queue (callers pass the delimiter length)
     * @throws IllegalArgumentException if {@code i} is not positive
     */
    public MyCircularFifoQueue(int i) {
        if (i <= 0) {
            throw new IllegalArgumentException("The size must be greater than 0");
        }
        this.ring = new char[i];
    }

    /**
     * Appends a character, evicting the oldest one when the queue is full.
     *
     * @param value character to store
     * @throws NullPointerException if {@code value} is {@code null}
     */
    public void offer(Character value) {
        if (value == null) {
            throw new NullPointerException("element");
        }
        if (size == ring.length) {
            // Full: the slot of the oldest character becomes the newest one.
            ring[head] = value;
            head = (head + 1) % ring.length;
        } else {
            // Not yet full: head is still 0, so the next free slot is `size`.
            ring[size++] = value;
        }
    }

    /**
     * Returns the retained characters, oldest first.
     *
     * @return the queue content as a string; empty when nothing was offered
     */
    @Override
    public String toString() {
        int firstRun = Math.min(size, ring.length - head);
        if (firstRun == size) {
            // Content does not wrap around the end of the ring.
            return new String(ring, head, size);
        }
        char[] ordered = new char[size];
        System.arraycopy(ring, head, ordered, 0, firstRun);
        System.arraycopy(ring, 0, ordered, firstRun, size - firstRun);
        return new String(ordered);
    }
}
com.csvreader.MyCsvReader代码如下:
package com.csvreader;
import java.io.*;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.HashMap;
public class MyCsvReader {
// ---- Parser state of MyCsvReader ----
private Reader inputStream;// input stream
private String fileName;// file name
private MyCsvReader.UserSettings userSettings;// settings object (original note: configuration class generated from the parsed configuration)
private Charset charset;// character set
private boolean useCustomRecordDelimiter;// whether a custom record delimiter is used instead of CR/LF
private MyCsvReader.DataBuffer dataBuffer;// buffer of input characters that readRecord scans
private MyCsvReader.ColumnBuffer columnBuffer;// per the note in endColumn: holds the part of a column read before the current buffer content
private MyCsvReader.RawRecordBuffer rawBuffer;// raw-record characters; readRecord combines it with dataBuffer to build rawRecord
// NOTE(review): presumably flags which parsed values were text-qualified; not used in the visible code, confirm.
private boolean[] isQualified;
private String rawRecord;// raw text of the last record; set at the end of readRecord ("" when CaptureRawRecord is off)
private MyCsvReader.HeadersHolder headersHolder;
private boolean startedColumn; // whether reading of the current column has started
private boolean startedWithQualifier;// whether the column started with the text qualifier
private boolean hasMoreData;// whether the input stream still has data
private String lastLetter;// last delimiter text or single character recorded by readRecord; endColumn subtracts its length
private boolean hasReadNextLine;// whether to continue with the next line (readRecord resets it to false and returns it)
private int columnsCount;// number of columns read so far in the current record (reset by readRecord)
private long currentRecord;// record counter; reported in the column-length error message
private String[] values;// array holding the parsed column values of the current record
private boolean initialized;
private boolean closed;// whether reading has been closed
/**
 * Character storage for the raw text of the record being read.
 * Starts with room for 500 characters and a write position of 0.
 */
private class RawRecordBuffer {
    public char[] Buffer;
    public int Position;

    public RawRecordBuffer() {
        this.Buffer = new char[500];
        this.Position = 0;
    }
}
/**
 * Character storage for a column value that is assembled in pieces.
 * Starts with room for 50 characters and a write position of 0.
 */
private class ColumnBuffer {
    public char[] Buffer;
    public int Position;

    public ColumnBuffer() {
        this.Buffer = new char[50];
        this.Position = 0;
    }
}
/**
 * Buffer for data taken from the input stream; it holds up to 1024 chars
 * and carries the cursors the parser moves while scanning it.
 */
private class DataBuffer {
    public char[] Buffer;
    /** Position the parser has currently read up to. */
    public int Position;
    /** Length of the data read from the input stream. */
    public int Count;
    /** Start position of the column currently being parsed. */
    public int ColumnStart;
    /** Start position of the current line. */
    public int LineStart;

    public DataBuffer() {
        this.Buffer = new char[1024];
        this.Position = 0;
        this.Count = 0;
        this.ColumnStart = 0;
        this.LineStart = 0;
    }
}
/**
 * Parser options together with their default values.
 */
private class UserSettings {
    /** Text qualifier character. */
    public char TextQualifier;
    /** Whether whitespace is trimmed. */
    public boolean TrimWhitespace;
    /** Whether the text qualifier is used. */
    public boolean UseTextQualifier;
    /** Column delimiter; may be longer than one character. */
    public String Delimiter;
    /** Record delimiter. */
    public char RecordDelimiter;
    /** Comment character. */
    public char Comment;
    /** Whether comment lines are recognised. */
    public boolean UseComments;
    /** Safety-check switch (enables the maximum column length check). */
    public boolean SafetySwitch;
    /** Whether empty lines are skipped. */
    public boolean SkipEmptyRecords;
    /** Whether the raw text of each record is captured. */
    public boolean CaptureRawRecord;

    public UserSettings() {
        this.TextQualifier = '"';
        this.TrimWhitespace = true;
        this.UseTextQualifier = true;
        this.Delimiter = ",";
        this.RecordDelimiter = '\0';
        this.Comment = '#';
        this.UseComments = false;
        this.SafetySwitch = true;
        this.SkipEmptyRecords = true;
        this.CaptureRawRecord = true;
    }
}
/**
 * Reads the next record from the data buffer, splitting it into columns on the
 * configured (possibly multi-character) delimiter.
 *
 * <p>Characters are scanned one at a time. The last {@code Delimiter.length()}
 * characters are kept in a sliding window; when the window equals the delimiter
 * the current column is closed via {@code endColumn()}. The window is cleared
 * after every match so that delimiter occurrences never overlap (with delimiter
 * {@code "||"}, the input {@code a|||b} yields {@code a} and {@code |b}).
 * A record ends on CR/LF, or on the custom record delimiter when one is in use.
 *
 * <p>When {@code CaptureRawRecord} is set, the raw text of the record is stored
 * in {@code rawRecord}; otherwise {@code rawRecord} is set to the empty string.
 *
 * @return the value of {@code hasReadNextLine} after scanning; it is reset to
 *         {@code false} on entry (presumably {@code endRecord()} sets it when a
 *         record was completed; that method is not part of this block)
 * @throws IOException if the safety switch is on and a column grows beyond
 *         100,000 characters, or if a helper called from here throws it
 */
public boolean readRecord() throws IOException {
    final String delimiter = this.userSettings.Delimiter;
    // Sliding window over the most recently read characters, one delimiter long.
    MyCircularFifoQueue myQueue = new MyCircularFifoQueue(delimiter.length());
    this.checkClosed();
    this.columnsCount = 0; // columns read so far in this record
    this.rawBuffer.Position = 0; // restart the raw-record buffer
    this.dataBuffer.LineStart = this.dataBuffer.Position; // the record starts at the current read position
    this.hasReadNextLine = false; // reset to false before scanning; the value at the end is returned
    if (this.hasMoreData) {
        while (true) {
            if (this.dataBuffer.Position == this.dataBuffer.Count) {
                // Nothing unread in the buffer (also the initial state): refresh it.
                this.checkDataLength();
            } else {
                // Text-qualifier handling was removed from this reader, so the flag is
                // always cleared; endColumn() still reads it.
                this.startedWithQualifier = false;
                // First character of a new column or a new line.
                char currentLetter = this.dataBuffer.Buffer[this.dataBuffer.Position];
                myQueue.offer(currentLetter);
                if (delimiter.equals(myQueue.toString())) {
                    // The window spells the delimiter: close the column.
                    this.lastLetter = delimiter;
                    // Clear the window so the next match cannot reuse these characters.
                    myQueue = new MyCircularFifoQueue(delimiter.length());
                    this.endColumn();
                } else if (this.useCustomRecordDelimiter || currentLetter != '\r' && currentLetter != '\n') {
                    if (this.userSettings.UseComments && this.columnsCount == 0 && currentLetter == this.userSettings.Comment) {
                        // Comment line: skip it entirely.
                        this.lastLetter = String.valueOf(currentLetter);
                        this.skipLine();
                    } else if (this.userSettings.TrimWhitespace && (currentLetter == ' ' || currentLetter == '\t')) {
                        // Leading whitespace is trimmed: the column starts after it.
                        this.startedColumn = true;
                        this.dataBuffer.ColumnStart = this.dataBuffer.Position + 1;
                        this.lastLetter = String.valueOf(currentLetter);
                    } else {
                        this.startedColumn = true;
                        this.dataBuffer.ColumnStart = this.dataBuffer.Position;
                        // The first pass re-examines the character already read above;
                        // later passes fetch the next character themselves.
                        boolean firstPass = true;
                        // Columns have no fixed length: keep scanning until the column is closed.
                        do {
                            if (!firstPass && this.dataBuffer.Position == this.dataBuffer.Count) {
                                // Buffer consumed in the middle of a column: refresh it.
                                this.checkDataLength();
                            } else {
                                if (!firstPass) {
                                    currentLetter = this.dataBuffer.Buffer[this.dataBuffer.Position];
                                    myQueue.offer(currentLetter);
                                }
                                if (delimiter.equals(myQueue.toString())) {
                                    // The window spells the delimiter: close the column.
                                    this.lastLetter = delimiter;
                                    myQueue = new MyCircularFifoQueue(delimiter.length());
                                    this.endColumn();
                                } else if (!this.useCustomRecordDelimiter && (currentLetter == '\r' || currentLetter == '\n') || this.useCustomRecordDelimiter && currentLetter == this.userSettings.RecordDelimiter) {
                                    // Record terminator: close the column and the record.
                                    this.lastLetter = String.valueOf(currentLetter);
                                    this.endColumn();
                                    this.endRecord();
                                }
                                firstPass = false;
                                if (this.startedColumn) {
                                    ++this.dataBuffer.Position;
                                    if (this.userSettings.SafetySwitch && this.dataBuffer.Position - this.dataBuffer.ColumnStart + this.columnBuffer.Position > 100000) {
                                        this.close();
                                        throw new IOException("Maximum column length of 100,000 exceeded in column " + NumberFormat.getIntegerInstance().format((long) this.columnsCount) + " in record " + NumberFormat.getIntegerInstance().format(this.currentRecord) + ". Set the SafetySwitch property to false" + " if you're expecting column lengths greater than 100,000 characters to" + " avoid this error.");
                                    }
                                }
                            }
                        } while (this.hasMoreData && this.startedColumn);
                    }
                } else {
                    // CR or LF outside of a column. The previous character must be
                    // inspected before lastLetter is overwritten, and compared by
                    // value: an LF directly after a CR belongs to the same line
                    // ending and must not produce an extra empty record.
                    final boolean followsCarriageReturn = "\r".equals(this.lastLetter);
                    this.lastLetter = String.valueOf(currentLetter);
                    if (!this.startedColumn && this.columnsCount <= 0 && (this.userSettings.SkipEmptyRecords || currentLetter != '\r' && followsCarriageReturn)) {
                        // Empty line (or second half of CRLF): the record starts after it.
                        this.dataBuffer.LineStart = this.dataBuffer.Position + 1;
                    } else {
                        this.endColumn();
                        this.endRecord();
                    }
                }
                if (this.hasMoreData) {
                    this.dataBuffer.Position++;
                }
            }
            if (!this.hasMoreData || this.hasReadNextLine) {
                // End of input or end of record. Flush a column that is still open, or
                // the empty column after a trailing delimiter. The comparison is
                // null-safe because lastLetter is still unset when the input was empty.
                if (this.startedColumn || delimiter.equals(this.lastLetter)) {
                    this.endColumn();
                    this.endRecord();
                }
                break;
            }
        }
    }
    // Optionally capture the raw text of the record (used for printing).
    if (this.userSettings.CaptureRawRecord) {
        if (this.hasMoreData) {
            // The record is complete and the buffer still holds data.
            if (this.rawBuffer.Position == 0) {
                this.rawRecord = new String(this.dataBuffer.Buffer, this.dataBuffer.LineStart,
                        this.dataBuffer.Position - this.dataBuffer.LineStart - 1);
            } else {
                this.rawRecord = new String(this.rawBuffer.Buffer, 0, this.rawBuffer.Position) + new String(this.dataBuffer.Buffer,
                        this.dataBuffer.LineStart, this.dataBuffer.Position - this.dataBuffer.LineStart - 1);
            }
        } else {
            this.rawRecord = new String(this.rawBuffer.Buffer, 0, this.rawBuffer.Position);
        }
    } else {
        this.rawRecord = "";
    }
    return this.hasReadNextLine;
}
private void endColumn() throws IOException {
String var1 = "";
int var2;
if (this.startedColumn) {
// columnBuffer用于存放上次读取到一半的数据,如果this.columnBuffer.Position,则直接从this.dataBuffer.ColumnStart读到分隔符即为字段值
if (this.columnBuffer.Position == 0) {
if (this.dataBuffer.ColumnStart < this.dataBuffer.Position) {
var2 = this.dataBuffer.Position - this.lastLetter.length();// 当前位置 减掉分隔符长度
if (this.userSettings.TrimWhitespace && !this.startedWithQualifier) {
// 删去字段结尾的空格或制表符
while