Adding Parquet write support to DataX's HDFS writer
Introduction to DataX
DataX is Alibaba's open-source data synchronization tool. It implements efficient data synchronization between heterogeneous data sources, including MySQL, Oracle, OceanBase, SqlServer, PostgreSQL, HDFS, Hive, ADS, HBase, TableStore (OTS), MaxCompute (ODPS), Hologres, DRDS, and more.
The problem
When writing to HDFS, DataX currently supports only the text and orc file formats; it does not support the columnar format parquet.
Solution
- DataX is open source, so we can take its source code and modify it. DataX is launched through Python, but most of the functionality is implemented in Java, so we modify the Java code, rebuild, and replace the original jar.
DataX git repository: https://github.com/alibaba/DataX
- Create a parquet-backed table in Hive (the table name here is illustrative):
```sql
create table parquet_test(id bigint) stored as parquet;
```
- Locate the HDFS writing code. File location: DataX/tree/master/hdfswriter/src/main/java/com/alibaba/datax/plugin/writer/hdfswriter/HdfsWriter.java
The modified code is as follows:
```java
private void validateParameter() {
    this.defaultFS = this.writerSliceConfig.getNecessaryValue(Key.DEFAULT_FS, HdfsWriterErrorCode.REQUIRED_VALUE);
    //fileType check
    this.fileType = this.writerSliceConfig.getNecessaryValue(Key.FILE_TYPE, HdfsWriterErrorCode.REQUIRED_VALUE);
    if (!fileType.equalsIgnoreCase("ORC") && !fileType.equalsIgnoreCase("TEXT")
            && !fileType.equalsIgnoreCase("PARQUET")) {
        String message = "HdfsWriter currently supports only the ORC, TEXT and PARQUET file types; " +
                "please set the fileType option to ORC, TEXT or PARQUET";
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, message);
    }
    //path
    this.path = this.writerSliceConfig.getNecessaryValue(Key.PATH, HdfsWriterErrorCode.REQUIRED_VALUE);
    if (!path.startsWith("/")) {
        String message = String.format("Please check the path parameter [%s]: it must be an absolute path", path);
        LOG.error(message);
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, message);
    } else if (path.contains("*") || path.contains("?")) {
        String message = String.format("Please check the path parameter [%s]: it must not contain special characters such as * or ?", path);
        LOG.error(message);
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE, message);
    }
    //fileName
    this.fileName = this.writerSliceConfig.getNecessaryValue(Key.FILE_NAME, HdfsWriterErrorCode.REQUIRED_VALUE);
    //columns check
    this.columns = this.writerSliceConfig.getListConfiguration(Key.COLUMN);
    if (null == columns || columns.size() == 0) {
        throw DataXException.asDataXException(HdfsWriterErrorCode.REQUIRED_VALUE, "You must specify columns");
    } else {
        for (Configuration eachColumnConf : columns) {
            eachColumnConf.getNecessaryValue(Key.NAME, HdfsWriterErrorCode.COLUMN_REQUIRED_VALUE);
            eachColumnConf.getNecessaryValue(Key.TYPE, HdfsWriterErrorCode.COLUMN_REQUIRED_VALUE);
        }
    }
    //writeMode check
    this.writeMode = this.writerSliceConfig.getNecessaryValue(Key.WRITE_MODE, HdfsWriterErrorCode.REQUIRED_VALUE);
    writeMode = writeMode.toLowerCase().trim();
    Set<String> supportedWriteModes = Sets.newHashSet("append", "nonconflict", "truncate");
    if (!supportedWriteModes.contains(writeMode)) {
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                String.format("Only the append, nonConflict and truncate modes are supported; the configured writeMode [%s] is not",
                        writeMode));
    }
    this.writerSliceConfig.set(Key.WRITE_MODE, writeMode);
    //fieldDelimiter check
    this.fieldDelimiter = this.writerSliceConfig.getString(Key.FIELD_DELIMITER, null);
    if (null == fieldDelimiter) {
        throw DataXException.asDataXException(HdfsWriterErrorCode.REQUIRED_VALUE,
                String.format("Invalid configuration: [%s] is a required parameter.", Key.FIELD_DELIMITER));
    } else if (1 != fieldDelimiter.length()) {
        // warn: if present, the delimiter must be a single character
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                String.format("Only single-character delimiters are supported; the configured delimiter is [%s]", fieldDelimiter));
    }
    //compress check
    this.compress = this.writerSliceConfig.getString(Key.COMPRESS, null);
    if (fileType.equalsIgnoreCase("TEXT")) {
        Set<String> textSupportedCompress = Sets.newHashSet("GZIP", "BZIP2");
        // the user may have configured compress:"" (an empty string); treat it as null
        if (StringUtils.isBlank(compress)) {
            this.writerSliceConfig.set(Key.COMPRESS, null);
        } else {
            compress = compress.toUpperCase().trim();
            if (!textSupportedCompress.contains(compress)) {
                throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                        String.format("TEXT files currently support only GZIP and BZIP2 compression; the configured compress mode [%s] is not supported",
                                compress));
            }
        }
    } else if (fileType.equalsIgnoreCase("ORC")) {
        Set<String> orcSupportedCompress = Sets.newHashSet("NONE", "SNAPPY");
        if (null == compress) {
            this.writerSliceConfig.set(Key.COMPRESS, "NONE");
        } else {
            compress = compress.toUpperCase().trim();
            if (!orcSupportedCompress.contains(compress)) {
                throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                        String.format("ORC files currently support only NONE and SNAPPY compression; the configured compress mode [%s] is not supported",
                                compress));
            }
        }
    } else if (fileType.equalsIgnoreCase("PARQUET")) {
        Set<String> parquetSupportedCompress = Sets.newHashSet("NONE", "SNAPPY");
        if (null == compress) {
            this.writerSliceConfig.set(Key.COMPRESS, "NONE");
        } else {
            compress = compress.toUpperCase().trim();
            if (!parquetSupportedCompress.contains(compress)) {
                throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                        String.format("PARQUET files currently support only NONE and SNAPPY compression; the configured compress mode [%s] is not supported",
                                compress));
            }
        }
    }
    //Kerberos check
    Boolean haveKerberos = this.writerSliceConfig.getBool(Key.HAVE_KERBEROS, false);
    if (haveKerberos) {
        this.writerSliceConfig.getNecessaryValue(Key.KERBEROS_KEYTAB_FILE_PATH, HdfsWriterErrorCode.REQUIRED_VALUE);
        this.writerSliceConfig.getNecessaryValue(Key.KERBEROS_PRINCIPAL, HdfsWriterErrorCode.REQUIRED_VALUE);
    }
    // encoding check
    this.encoding = this.writerSliceConfig.getString(Key.ENCODING, Constant.DEFAULT_ENCODING);
    try {
        encoding = encoding.trim();
        this.writerSliceConfig.set(Key.ENCODING, encoding);
        Charsets.toCharset(encoding);
    } catch (Exception e) {
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                String.format("Unsupported encoding: [%s]", encoding), e);
    }
}
@Override
public void startWrite(RecordReceiver lineReceiver) {
    LOG.info("begin do write...");
    LOG.info(String.format("write to file : [%s]", this.fileName));
    if (fileType.equalsIgnoreCase("TEXT")) {
        // write a TEXT file
        hdfsHelper.textFileStartWrite(lineReceiver, this.writerSliceConfig, this.fileName,
                this.getTaskPluginCollector());
    } else if (fileType.equalsIgnoreCase("ORC")) {
        // write an ORC file
        hdfsHelper.orcFileStartWrite(lineReceiver, this.writerSliceConfig, this.fileName,
                this.getTaskPluginCollector());
    } else if (fileType.equalsIgnoreCase("PARQUET")) {
        // write a PARQUET file
        hdfsHelper.parquetFileStartWrite(lineReceiver, this.writerSliceConfig, this.fileName,
                this.getTaskPluginCollector());
    }
    LOG.info("end do write");
}
```
- Then modify DataX/tree/master/hdfswriter/src/main/java/com/alibaba/datax/plugin/writer/hdfswriter/HdfsHelper.java
The modified portion of the code:
```java
// In addition to HdfsHelper's existing imports, this method will typically
// need at least the following (class locations follow Hive 1.x / Hadoop 2.x
// packaging; verify against the versions in your build):
//   org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
//   org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat
//   org.apache.hadoop.io.ObjectWritable
//   org.apache.hadoop.mapred.RecordWriter
//   org.apache.hadoop.mapred.Reporter
//   java.util.Properties
public void parquetFileStartWrite(RecordReceiver lineReceiver, Configuration config, String fileName,
                                  TaskPluginCollector taskPluginCollector) {
    List<Configuration> columns = config.getListConfiguration(Key.COLUMN);
    String compress = config.getString(Key.COMPRESS, null);
    List<String> columnNames = getColumnNames(columns);
    List<ObjectInspector> columnTypeInspectors = getColumnTypeInspectors(columns);
    StructObjectInspector inspector = (StructObjectInspector) ObjectInspectorFactory
            .getStandardStructObjectInspector(columnNames, columnTypeInspectors);

    ParquetHiveSerDe parquetHiveSerDe = new ParquetHiveSerDe();
    MapredParquetOutputFormat outFormat = new MapredParquetOutputFormat();
    if (!"NONE".equalsIgnoreCase(compress) && null != compress) {
        Class<? extends CompressionCodec> codecClass = getCompressCodec(compress);
        if (null != codecClass) {
            outFormat.setOutputCompressorClass(conf, codecClass);
        }
    }
    try {
        // Hive's Parquet writer takes the column names and types as table properties
        Properties colProperties = new Properties();
        colProperties.setProperty("columns", String.join(",", columnNames));
        List<String> colType = Lists.newArrayList();
        columns.forEach(c -> colType.add(c.getString(Key.TYPE)));
        colProperties.setProperty("columns.types", String.join(",", colType));
        // the returned writer also implements org.apache.hadoop.mapred.RecordWriter, hence the cast
        RecordWriter writer = (RecordWriter) outFormat.getHiveRecordWriter(conf, new Path(fileName),
                ObjectWritable.class, true, colProperties, Reporter.NULL);
        Record record = null;
        while ((record = lineReceiver.getFromReader()) != null) {
            MutablePair<List<Object>, Boolean> transportResult =
                    transportOneRecord(record, columns, taskPluginCollector);
            if (!transportResult.getRight()) {
                writer.write(null, parquetHiveSerDe.serialize(transportResult.getLeft(), inspector));
            }
        }
        writer.close(Reporter.NULL);
    } catch (Exception e) {
        String message = String.format("An IO exception occurred while writing file [%s]; please check that your network is working properly!", fileName);
        LOG.error(message);
        Path path = new Path(fileName);
        deleteDir(path.getParent());
        throw DataXException.asDataXException(HdfsWriterErrorCode.Write_FILE_IO_ERROR, e);
    }
}
```
- After the changes, build the project. You need JDK 1.8 and a Maven environment. From the DataX/ root, run the packaging command:
```bash
mvn -U clean package assembly:assembly -Dmaven.test.skip=true
```
When the build finishes, the packaged datax directory and archives are under DataX/target/.
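With the rebuilt jar in place, the new file type can be referenced from a job configuration. A minimal sketch of the writer half of a DataX job JSON (defaultFS, path, fileName, and the column list are illustrative; `"compress": "SNAPPY"` additionally requires the codec fix described in the note below):
```json
"writer": {
    "name": "hdfswriter",
    "parameter": {
        "defaultFS": "hdfs://namenode:8020",
        "fileType": "parquet",
        "path": "/warehouse/parquet_test",
        "fileName": "parquet_test",
        "column": [
            {"name": "id", "type": "bigint"}
        ],
        "writeMode": "append",
        "fieldDelimiter": "\t",
        "compress": "SNAPPY"
    }
}
```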
- Note: at this point DataX can write parquet files to HDFS, but SNAPPY compression does not work yet. One more small change is needed: in HdfsHelper.java, change the line
`codecClass = org.apache.hadoop.io.compress.SnappyCodec.class;`
to
`codecClass = parquet.hadoop.codec.SnappyCodec.class;`
(Parquet ships its own snappy-java based SnappyCodec, so it does not depend on Hadoop's native snappy libraries.) Then rebuild; after that, a snappy compression setting can be added to the job's JSON configuration. A sketch of the resulting method follows.
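For orientation, getCompressCodec in HdfsHelper.java then looks roughly like the following. This is a sketch reconstructed from the upstream DataX source rather than a verbatim copy, so check it against your checkout:
```java
public Class<? extends CompressionCodec> getCompressCodec(String compress) {
    Class<? extends CompressionCodec> codecClass = null;
    if (null == compress) {
        codecClass = null;
    } else if ("GZIP".equalsIgnoreCase(compress)) {
        codecClass = org.apache.hadoop.io.compress.GzipCodec.class;
    } else if ("BZIP2".equalsIgnoreCase(compress)) {
        codecClass = org.apache.hadoop.io.compress.BZip2Codec.class;
    } else if ("SNAPPY".equalsIgnoreCase(compress)) {
        // the one-line change: use Parquet's snappy-java based codec instead of
        // org.apache.hadoop.io.compress.SnappyCodec
        codecClass = parquet.hadoop.codec.SnappyCodec.class;
    } else {
        throw DataXException.asDataXException(HdfsWriterErrorCode.ILLEGAL_VALUE,
                String.format("Unsupported compress mode: [%s]", compress));
    }
    return codecClass;
}
```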