项目场景:
Addax读取文件插入到数据库,读取文件过程中遇到的 双引号“”不规范问题
问题描述
正常文件内容数据格式,如:01,"张三","北京",19 等四个字段
(1)文件中存在单边双引号 ",如:01,"张三,"北京",19
读取过程中会将 "张三," 切分为一个字段,从而导致后面的字段无法再根据分隔符(逗号 ,)继续完成切分
(2)文件中将多个字段用" "包一起,如:02,"李四,上海",19
读取过程中会将 "李四,上海" 切分为一个字段,从而导致文件截取错位、字段变少;配置文件在读取索引为 3 的字段 19 时,报错"索引越界":
The column index 3 you try to read is out of range(2)
原因分析:
Addax 底层是由 Java 语言编写的,其 CSV 解析器默认把双引号 " 当作文本限定符:一对完整双引号 "" 中的内容(包括其中的逗号)会被当作同一个字段(字符串),因此不成对或位置错误的双引号会导致字段切分错位
解决方案:
这里推荐两种解决方案:
(1)修改 Addax 源码,在读取文件前将双引号 " 替换为空
(2)写一个 Python 脚本,读取文件并将双引号 " 替换为空后写回,再用 Addax 抽取
- 方案(1):
在 lib 目录下 addax-storage 模块的 src/main/java/com/wgzhao/addax/storage/reader/StorageReaderUtil.java 类中,于流文件读取之前添加替换操作:
/**
 * Wraps {@code inputStream} in a {@link BufferedReader} according to the configured
 * compression and encoding, then hands it to {@code doReadFromStream}.
 *
 * <p>For uncompressed, {@code gz} and {@code zip} input, runs of double quotes (regex
 * {@code "\"+"}) are removed first via {@code processInputStream}, {@code gzInputStream}
 * and {@code zipInputStream}. The {@code lzo} branch and the generic commons-compress
 * branch are passed through unchanged.
 *
 * <p>NOTE(review): the sanitising helpers decode and re-encode the data as UTF-8, while
 * the reader built here decodes with the configured {@code encoding}; confirm the job
 * only processes UTF-8 files, otherwise non-UTF-8 content will be corrupted.
 *
 * @param inputStream         raw stream of the file to read
 * @param fileName            name used in error messages and passed to {@code doReadFromStream}
 * @param readerSliceConfig   slice configuration (compress, encoding, column, buffer size)
 * @param recordSender        passed through to {@code doReadFromStream}
 * @param taskPluginCollector passed through to {@code doReadFromStream}
 * @throws AddaxException on unsupported encoding, unsupported compression, I/O failure,
 *                        or a {@link NullPointerException} raised while reading
 */
public static void readFromStream(InputStream inputStream, String fileName,
        Configuration readerSliceConfig, RecordSender recordSender,
        TaskPluginCollector taskPluginCollector)
{
    String compress = readerSliceConfig.getString(Key.COMPRESS, "");
    String encoding = readerSliceConfig.getString(Key.ENCODING, Constant.DEFAULT_ENCODING);
    // handle blank encoding
    if (StringUtils.isBlank(encoding)) {
        // Log BEFORE overwriting, so the warning shows the rejected value rather than the default.
        LOG.warn("The encoding: '{}' is illegal, uses '{}' by default", encoding, Constant.DEFAULT_ENCODING);
        encoding = Constant.DEFAULT_ENCODING;
    }

    List<Configuration> column = readerSliceConfig.getListConfiguration(Key.COLUMN);
    // handle ["*"] -> [], null
    if (null != column && 1 == column.size() && "\"*\"".equals(column.get(0).toString())) {
        readerSliceConfig.set(Key.COLUMN, null);
    }

    BufferedReader reader = null;
    int bufferSize = readerSliceConfig.getInt(Key.BUFFER_SIZE, Constant.DEFAULT_BUFFER_SIZE);

    // compress logic
    try {
        // --- original code, kept commented out for reference ---
        //if (compress == null || "".equals(compress) || "none".equalsIgnoreCase(compress)) {
        //    reader = new BufferedReader(new InputStreamReader(inputStream, encoding), bufferSize);
        //}
        //else {
        //    if ("zip".equalsIgnoreCase(compress)) {
        //        ZipCycleInputStream zipCycleInputStream = new ZipCycleInputStream(inputStream);
        //        reader = new BufferedReader(new InputStreamReader(zipCycleInputStream, encoding), bufferSize);
        //    }
        //    else if ("lzo".equalsIgnoreCase(compress)) {
        //        ExpandLzopInputStream expandLzopInputStream = new ExpandLzopInputStream(inputStream);
        //        reader = new BufferedReader(new InputStreamReader(expandLzopInputStream, encoding), bufferSize);
        //    }
        //    else {
        //        // common-compress supports almost compress alg
        //        CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(compress.toUpperCase(), inputStream, true);
        //        reader = new BufferedReader(new InputStreamReader(input, encoding), bufferSize);
        //    }
        //}

        // --- updated code: strip double quotes before the stream reaches the parser ---
        if (compress == null || "".equals(compress) || "none".equalsIgnoreCase(compress)) {
            inputStream = processInputStream(inputStream, "\"+", "");
            reader = new BufferedReader(new InputStreamReader(inputStream, encoding), bufferSize);
        }
        else if ("gz".equalsIgnoreCase(compress)) {
            // The helper returns a re-compressed gzip stream, so it is decompressed again here.
            inputStream = gzInputStream(inputStream, "\"+", "");
            GZIPInputStream gzipInputStream = new GZIPInputStream(inputStream);
            reader = new BufferedReader(new InputStreamReader(gzipInputStream, encoding), bufferSize);
        }
        else if ("zip".equalsIgnoreCase(compress)) {
            // The helper returns a rebuilt zip archive, which ZipCycleInputStream then consumes.
            inputStream = zipInputStream(inputStream, "\"+", "");
            ZipCycleInputStream zipCycleInputStream = new ZipCycleInputStream(inputStream);
            reader = new BufferedReader(new InputStreamReader(zipCycleInputStream, encoding), bufferSize);
        }
        else if ("lzo".equalsIgnoreCase(compress)) {
            // NOTE(review): quotes are NOT stripped for lzo input; confirm whether that is intended.
            ExpandLzopInputStream expandLzopInputStream = new ExpandLzopInputStream(inputStream);
            reader = new BufferedReader(new InputStreamReader(expandLzopInputStream, encoding), bufferSize);
        }
        else {
            // common-compress supports almost compress alg (quotes are not stripped on this path)
            CompressorInputStream input = new CompressorStreamFactory().createCompressorInputStream(compress.toUpperCase(), inputStream, true);
            reader = new BufferedReader(new InputStreamReader(input, encoding), bufferSize);
        }
        StorageReaderUtil.doReadFromStream(reader, fileName, readerSliceConfig, recordSender, taskPluginCollector);
    }
    catch (UnsupportedEncodingException uee) {
        throw AddaxException.asAddaxException(
                StorageReaderErrorCode.OPEN_FILE_WITH_CHARSET_ERROR,
                String.format("%s is unsupported", encoding), uee);
    }
    catch (NullPointerException e) {
        throw AddaxException.asAddaxException(
                StorageReaderErrorCode.RUNTIME_EXCEPTION, e);
    }
    catch (IOException e) {
        throw AddaxException.asAddaxException(
                StorageReaderErrorCode.READ_FILE_IO_ERROR, String.format("Read stream %s failure ", fileName), e);
    }
    catch (CompressorException e) {
        // Keep the original exception as the cause so the root failure is not lost.
        throw AddaxException.asAddaxException(
                StorageReaderErrorCode.ILLEGAL_VALUE,
                "The compress algorithm'" + compress + "' is unsupported yet",
                e);
    }
    finally {
        IOUtils.closeQuietly(reader, null);
    }
}
上述代码中,注释部分为原代码,注释下方为更新后的代码,分别调用相应的方法将流文件中连续的双引号(正则 "\"+")替换为空字符串。其中,相应的调用方法如下:
/**
 * Reads an uncompressed text stream line by line, applies a regex replacement to every
 * line, and returns the result as an in-memory stream.
 *
 * <p>The input is decoded as UTF-8 and the output is encoded as UTF-8. Every output line,
 * including the last one, is terminated with {@link System#lineSeparator()}. The whole
 * result is buffered in memory, and {@code inputStream} is closed before returning,
 * also when reading fails.
 *
 * @param inputStream   uncompressed source stream; closed by this method
 * @param searchString  regular expression to search for in each line
 * @param replaceString replacement, with the semantics of {@link String#replaceAll}
 * @return a stream over the rewritten content
 * @throws IOException if reading the source fails
 */
public static InputStream processInputStream(InputStream inputStream, String searchString, String replaceString) throws IOException {
    // Compile once instead of once per line (String.replaceAll recompiles on every call).
    java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchString);
    byte[] lineSeparator = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

    // try-with-resources closes the reader (and the wrapped input) even if a read throws.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            String replaced = pattern.matcher(line).replaceAll(replaceString);
            outputStream.write(replaced.getBytes(StandardCharsets.UTF_8));
            outputStream.write(lineSeparator);
        }
    }
    return new ByteArrayInputStream(outputStream.toByteArray());
}
/**
 * Decompresses a gzip stream, applies a regex replacement to every line, and returns the
 * result re-compressed as gzip in an in-memory stream.
 *
 * <p>The decompressed content is decoded as UTF-8 and written back as UTF-8. Every output
 * line is terminated with {@link System#lineSeparator()}. {@code inputStream} is closed
 * before returning, also when the gzip header is invalid or reading fails.
 *
 * @param inputStream   gzip-compressed source stream; closed by this method
 * @param searchString  regular expression to search for in each line
 * @param replaceString replacement, with the semantics of {@link String#replaceAll}
 * @return a stream over the rewritten, gzip-compressed content
 * @throws IOException if the source is not valid gzip or reading fails
 */
public static InputStream gzInputStream(InputStream inputStream, String searchString, String replaceString) throws IOException {
    // Compile once instead of once per line.
    java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchString);
    byte[] lineSeparator = System.lineSeparator().getBytes(StandardCharsets.UTF_8);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

    // The raw input is listed first so it is closed even when the GZIPInputStream
    // constructor rejects the header; closing the GZIPOutputStream finishes the gzip trailer.
    try (InputStream source = inputStream;
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(new GZIPInputStream(source), StandardCharsets.UTF_8));
            GZIPOutputStream gzipOutputStream = new GZIPOutputStream(outputStream)) {
        String line;
        while ((line = reader.readLine()) != null) {
            String replaced = pattern.matcher(line).replaceAll(replaceString);
            gzipOutputStream.write(replaced.getBytes(StandardCharsets.UTF_8));
            gzipOutputStream.write(lineSeparator);
        }
    }
    return new ByteArrayInputStream(outputStream.toByteArray());
}
/**
 * Rewrites every entry of a zip archive by applying a regex replacement to its content,
 * and returns the rebuilt archive as an in-memory stream.
 *
 * <p>Each entry is read fully into memory, decoded as UTF-8, and the regex is applied to
 * the entry content as a whole. The result is written as UTF-8 under the same entry name.
 * {@code inputStream} is closed before returning, also when reading fails.
 *
 * @param inputStream   zip-compressed source stream; closed by this method
 * @param searchString  regular expression to search for in each entry's content
 * @param replaceString replacement, with the semantics of {@link String#replaceAll}
 * @return a stream over the rebuilt zip archive
 * @throws IOException if reading or writing the archive fails
 */
public static InputStream zipInputStream(InputStream inputStream, String searchString, String replaceString) throws IOException {
    // Compile once instead of once per entry.
    java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchString);
    ByteArrayOutputStream resultOutputStream = new ByteArrayOutputStream();

    // try-with-resources closes both archives (and the wrapped input) on every path;
    // closing the ZipOutputStream also writes the central directory.
    try (ZipInputStream zipInputStream = new ZipInputStream(inputStream);
            ZipOutputStream zipOutputStream = new ZipOutputStream(resultOutputStream)) {
        byte[] buffer = new byte[1024];
        ZipEntry entry;
        while ((entry = zipInputStream.getNextEntry()) != null) {
            // Collect the complete content of the current entry.
            ByteArrayOutputStream entryOutputStream = new ByteArrayOutputStream();
            int bytesRead;
            while ((bytesRead = zipInputStream.read(buffer)) != -1) {
                entryOutputStream.write(buffer, 0, bytesRead);
            }

            String content = new String(entryOutputStream.toByteArray(), StandardCharsets.UTF_8);
            String processedContent = pattern.matcher(content).replaceAll(replaceString);

            // Write the rewritten content under the original entry name.
            zipOutputStream.putNextEntry(new ZipEntry(entry.getName()));
            zipOutputStream.write(processedContent.getBytes(StandardCharsets.UTF_8));
            zipOutputStream.closeEntry();
        }
    }
    return new ByteArrayInputStream(resultOutputStream.toByteArray());
}
/**
 * Decompresses an lzop stream, applies a regex replacement to every line, and returns the
 * result re-compressed through {@code LzoOutputStream} in an in-memory stream.
 *
 * <p>The decompressed content is decoded as UTF-8 and written back as UTF-8. Every output
 * line is terminated with {@code "\n"}. {@code inputStream} is closed before returning,
 * also when reading fails.
 *
 * <p>NOTE(review): the input is read with {@code LzopInputStream} but written with
 * {@code LzoOutputStream}; verify that the produced bytes are readable by whichever
 * stream consumes them before wiring this helper into {@code readFromStream}.
 *
 * @param inputStream   lzop-compressed source stream; closed by this method
 * @param searchString  regular expression to search for in each line
 * @param replaceString replacement, with the semantics of {@link String#replaceAll}
 * @return a stream over the rewritten, re-compressed content
 * @throws IOException if reading or writing fails
 */
public static InputStream lzoInputStream(InputStream inputStream, String searchString, String replaceString) throws IOException {
    // Compile once instead of once per line.
    java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(searchString);
    // In-memory buffer that receives the re-compressed data.
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

    // Resources are closed in reverse order: output first (flushing the compressed data),
    // then the reader and the lzop input.
    try (LzopInputStream lzopInputStream = new LzopInputStream(inputStream);
            BufferedReader reader = new BufferedReader(new InputStreamReader(lzopInputStream, StandardCharsets.UTF_8));
            LzoOutputStream lzopOutputStream = new LzoOutputStream(outputStream)) {
        String line;
        while ((line = reader.readLine()) != null) {
            // Use the caller-supplied pattern/replacement, like the sibling helpers do,
            // instead of the placeholder "oldValue"/"newValue" literals.
            String processedLine = pattern.matcher(line).replaceAll(replaceString) + "\n";
            lzopOutputStream.write(processedLine.getBytes(StandardCharsets.UTF_8));
        }
    }
    // Expose the processed data as an InputStream.
    return new ByteArrayInputStream(outputStream.toByteArray());
}
- 方案(2):
import datetime
import argparse
import paramiko
import gzip
import xml.etree.cElementTree as cet
import time
from impala import dbapi
import os
import chardet
def sftp_get(sftp, source, target):
    """Download ``source`` from the SFTP server to the local path ``target``.

    The connection is owned by the caller and is left open, so the same client
    can be reused for further transfers and closed once by the caller.

    Args:
        sftp: SFTP client exposing ``get(remotepath=..., localpath=...)``.
        source: Remote file path.
        target: Local destination path.

    Returns:
        True if the download succeeded, False if the file was not found.
    """
    try:
        sftp.get(remotepath=source, localpath=target)
    except FileNotFoundError as e:
        print("sftp_get ERROR: ", e)
        return False
    return True
def sftp_put(sftp, source, target):
    """Upload the local file ``source`` to ``target`` on the SFTP server.

    The connection is owned by the caller and is left open, so one missing
    local file does not break the remaining uploads on the same client.

    Args:
        sftp: SFTP client exposing ``put(localpath=..., remotepath=...)``.
        source: Local file path.
        target: Remote destination path.

    Returns:
        True if the upload succeeded, False if the file was not found.
    """
    try:
        sftp.put(localpath=source, remotepath=target)
    except FileNotFoundError as e:
        print("sftp_put ERROR: ", e)
        return False
    return True
def create_sftp_connect(host='host', port=22, username="...", password="..."):
    """Open an SFTP client for the source server.

    The defaults for ``host``, ``username`` and ``password`` are the placeholder
    literals from the original snippet and must be replaced with real values.
    ``port`` defaults to 22, the standard SSH/SFTP port, because the original
    referenced an undefined name.

    Args:
        host: Server host name or address.
        port: Server port.
        username: Login user.
        password: Login password.

    Returns:
        A ``paramiko.SFTPClient`` bound to the authenticated transport.

    Raises:
        SystemExit: With exit code 255 if the connection cannot be established.
    """
    transport = None
    try:
        # Obtain a Transport instance and authenticate.
        transport = paramiko.Transport((host, port))
        transport.connect(username=username, password=password)
        sftp_ob = paramiko.SFTPClient.from_transport(transport)
    except Exception as e:
        print("SFTP 连接失败 :", e)
        # Do not leak a half-open transport when authentication fails.
        if transport is not None:
            transport.close()
        raise SystemExit(255)
    return sftp_ob
def create_target_sftp_connect(host='host', port=22, username="...", password="..."):
    """Open an SFTP client for the target server.

    The defaults for ``host``, ``username`` and ``password`` are the placeholder
    literals from the original snippet and must be replaced with real values.
    ``port`` defaults to 22, the standard SSH/SFTP port, because the original
    referenced an undefined name.

    Args:
        host: Server host name or address.
        port: Server port.
        username: Login user.
        password: Login password.

    Returns:
        A ``paramiko.SFTPClient`` bound to the authenticated transport.

    Raises:
        SystemExit: With exit code 255 if the connection cannot be established.
    """
    transport = None
    try:
        # Obtain a Transport instance and authenticate.
        transport = paramiko.Transport((host, port))
        transport.connect(username=username, password=password)
        sftp_ob = paramiko.SFTPClient.from_transport(transport)
    except Exception as e:
        print("SFTP 连接失败 :", e)
        # Do not leak a half-open transport when authentication fails.
        if transport is not None:
            transport.close()
        raise SystemExit(255)
    return sftp_ob
# Create the SFTP connections, download the matching source files to the local
# disk, strip the double quotes from them, then upload them to the target SFTP.
if __name__ == '__main__':
    START = datetime.datetime.now()
    print("-------------开始时间-------------:" + str(START))
    print("\n----------创建sftp连接----------\n")
    source_sftp = create_sftp_connect()
    print("source_sftp连接: ", source_sftp)
    source_sftp_path = "/download/"
    local_path = "/tmp/"
    curr_file_name = "20231026_2800_xxx_B100_"
    target_sftp = create_target_sftp_connect()
    print("target_sftp连接: ", target_sftp)
    target_sftp_path = "/data/sftpSyncFile/"
    # Defined before the try block so the cleanup in `finally` can always
    # iterate it, even when listdir() itself fails.
    day_file_name = []
    try:
        file_list = source_sftp.listdir(source_sftp_path)
        for file_name in file_list:
            if not file_name.startswith(curr_file_name):
                continue
            day_file_name.append(file_name)
            source_sftp_file_name = source_sftp_path + file_name
            local_file_name = local_path + file_name
            # Download the file.
            sftp_get_boolean = sftp_get(source_sftp, source_sftp_file_name, local_file_name)
            if sftp_get_boolean is True:
                # Strip the double quotes at byte level. This keeps the file's
                # original encoding untouched and is safe for ASCII-compatible
                # encodings such as GBK or UTF-8, where byte 0x22 never occurs
                # inside a multi-byte character.
                with open(local_file_name, 'rb') as f:
                    raw = f.read()
                # chardet.detect() needs bytes and returns a dict; it is used
                # for logging only.
                code = chardet.detect(raw)
                print("编码格式: ", code)
                with open(local_file_name, 'wb') as f:
                    f.write(raw.replace(b'"', b''))
            else:
                print("未找到文件:", sftp_get_boolean)
                exit(0)
            # Upload the cleaned file.
            target_sftp_file_name = target_sftp_path + file_name
            sftp_put(target_sftp, local_file_name, target_sftp_file_name)
        print("文件列表day_file_name: ", day_file_name)
    except Exception as e:
        print("sync sftp file ERROR: ", e)
        exit(255)
    finally:
        print("----------遍历文件列表,移除local文件----------\n")
        for name in day_file_name:
            local_file_name = local_path + name
            print(local_file_name)
            # A listed file may never have reached the local disk (failed
            # download), so only remove what actually exists.
            if os.path.exists(local_file_name):
                os.remove(local_file_name)
        print("----------关闭sftp连接----------\n")
        source_sftp.close()
        target_sftp.close()
    print("总文件数: ", len(day_file_name))
    END = datetime.datetime.now()
    print("-------------结束时间-------------:" + str(END))
    print("-------------共耗时-------------...." + str(END - START))
如有不足,请多多指正!