Hive data warehouses commonly store tables in Parquet format, but the open-source version of Alibaba DataX does not support reading Parquet. Most of the write-ups I found online were incomplete, so this post records a complete, working set of changes for reference.
Steps
1. Pull the DataX source code from Gitee and modify the hdfsreader module; the main classes to change are listed below.
Add the Parquet dependencies to the hdfsreader module's pom.xml:
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-avro</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-common</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-protobuf</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.7.1</version>
</dependency>
Add the PARQUET constant to Constant:
public class Constant {
    public static final String SOURCE_FILES = "sourceFiles";
    public static final String TEXT = "TEXT";
    public static final String ORC = "ORC";
    public static final String CSV = "CSV";
    public static final String SEQ = "SEQ";
    public static final String RC = "RC";
    public static final String PARQUET = "PARQUET";
}
Add PARQUET to the HdfsFileType enum:
public enum HdfsFileType {
    ORC, SEQ, RC, CSV, TEXT, PARQUET,
}
In DFSUtil, add a method for reading Parquet files, adapted from the existing ORC read method:
public void parquetFileStartRead(String sourceParquetFilePath, Configuration readerSliceConfig,
                                 RecordSender recordSender, TaskPluginCollector taskPluginCollector) {
    LOG.info(String.format("Start Read parquetfile [%s].", sourceParquetFilePath));
    List<ColumnEntry> column = UnstructuredStorageReaderUtil
            .getListColumnEntry(readerSliceConfig, com.alibaba.datax.plugin.unstructuredstorage.reader.Key.COLUMN);
    String nullFormat = readerSliceConfig.getString(NULL_FORMAT);
    boolean isReadAllColumns = false;
    Path parquetFilePath = new Path(sourceParquetFilePath);
    try {
        GroupReadSupport readSupport = new GroupReadSupport();
        ParquetReader.Builder<Group> reader = ParquetReader.builder(readSupport, parquetFilePath);
        ParquetReader<Group> build = reader.build();
        // Read the first record up front so the schema can be taken from it.
        Group line = build.read();
        if (line == null) {
            // Empty parquet file: nothing to transport.
            build.close();
            return;
        }
        List<org.apache.parquet.schema.Type> typeList = line.getType().getFields();
        int size = typeList.size();
        List<Object> recordFields = null;
        // The first record has already been consumed by build.read() above;
        // the counter k prevents the while loop from skipping it.
        int k = 0;
        while (k == 0 || ((line = build.read()) != null)) {
            k++;
            recordFields = new ArrayList<Object>();
            for (int i = 0; i < size; i++) {
                // Special handling for INT96 (Hive TIMESTAMP) and INT32 (treated here as Hive DATE).
                String schemaType = typeList.get(i).asPrimitiveType().getPrimitiveTypeName().name();
                if (schemaType.equalsIgnoreCase("int96")) {
                    Binary bin = line.getInt96(typeList.get(i).getName(), 0);
                    if (bin != null) {
                        Long longTime = ParquetTimestampUtils.getTimestampMillis(bin);
                        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                        recordFields.add(sdf.format(longTime));
                    }
                } else if (schemaType.equalsIgnoreCase("int32")) {
                    // INT32 columns are assumed to hold DATE values stored as days since 1970-01-01.
                    Integer timeDay = line.getInteger(typeList.get(i).getName(), 0);
                    Long time = timeDay * 24 * 60 * 60 * 1000L;
                    SimpleDateFormat sdf1 = new SimpleDateFormat("yyyy-MM-dd");
                    recordFields.add(sdf1.format(time));
                } else {
                    try {
                        recordFields.add(line.getValueToString(i, 0));
                    } catch (Exception e) {
                        // Missing or null field: fall back to an empty string.
                        recordFields.add("");
                    }
                }
            }
            transportOneRecord(column, recordFields, recordSender,
                    taskPluginCollector, isReadAllColumns, nullFormat);
        }
        build.close();
    } catch (Exception e) {
        String message = String.format("Exception while reading data from parquet file [%s]: [%s]. Please contact the system administrator.",
                sourceParquetFilePath, e);
        LOG.error(message);
        throw DataXException.asDataXException(HdfsReaderErrorCode.READ_FILE_ERROR, message);
    }
}
// Check whether a file is a parquet file (used when detecting the file type).
private static boolean isParquetFile(Path file) {
    try {
        GroupReadSupport readSupport = new GroupReadSupport();
        ParquetReader<Group> build = ParquetReader.builder(readSupport, file).build();
        try {
            if (build.read() != null) {
                return true;
            }
        } finally {
            build.close();
        }
    } catch (IOException e) {
        // Not readable as parquet; fall through and return false.
    }
    return false;
}
The parquetFileStartRead method above also relies on a new helper class, ParquetTimestampUtils, to convert Parquet INT96 timestamps to epoch milliseconds:
package com.alibaba.datax.plugin.reader.hdfsreader;

import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import org.apache.parquet.io.api.Binary;

import java.util.concurrent.TimeUnit;

public class ParquetTimestampUtils {
    private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
    private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
    private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);

    private ParquetTimestampUtils() {}

    /**
     * Returns GMT timestamp from binary encoded parquet timestamp (12 bytes - julian date + time of day nanos).
     *
     * @param timestampBinary INT96 parquet timestamp
     * @return timestamp in millis, GMT timezone
     */
    public static long getTimestampMillis(Binary timestampBinary) {
        if (timestampBinary.length() != 12) {
            return 0;
            // throw new PrestoException(HIVE_BAD_DATA, "Parquet timestamp must be 12 bytes, actual " + timestampBinary.length());
        }
        byte[] bytes = timestampBinary.getBytes();

        // little endian encoding - need to invert byte order
        long timeOfDayNanos = Longs.fromBytes(bytes[7], bytes[6], bytes[5], bytes[4], bytes[3], bytes[2], bytes[1], bytes[0]);
        int julianDay = Ints.fromBytes(bytes[11], bytes[10], bytes[9], bytes[8]);

        return julianDayToMillis(julianDay) + (timeOfDayNanos / NANOS_PER_MILLISECOND);
    }

    private static long julianDayToMillis(int julianDay) {
        return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
    }
}
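As a quick sanity check of the byte layout (a hypothetical test snippet, not part of the plugin): an INT96 value whose 8-byte nanos-of-day part is zero and whose 4-byte Julian-day part is 2440588 (1970-01-01) should decode to epoch millisecond 0.
import org.apache.parquet.io.api.Binary;

public class ParquetTimestampUtilsCheck {
    public static void main(String[] args) {
        // 12 bytes: 8-byte little-endian nanos-of-day, then 4-byte little-endian Julian day.
        // Julian day 2440588 = 0x253D8C, stored little endian as 8C 3D 25 00.
        byte[] int96 = {0, 0, 0, 0, 0, 0, 0, 0, (byte) 0x8C, 0x3D, 0x25, 0x00};
        long millis = ParquetTimestampUtils.getTimestampMillis(Binary.fromConstantByteArray(int96));
        System.out.println(millis); // expected: 0, i.e. 1970-01-01 00:00:00 GMT
    }
}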
Finally, in HdfsReader itself, add handling for the Parquet file type: both where the configured fileType is validated and where each source file is dispatched to a read method.
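A minimal sketch of the Task-side change, assuming the stock HdfsReader.Task.startRead dispatches on specifiedFileType with one branch per format as in the open-source code; only the PARQUET branch below is new:
// In HdfsReader.Task.startRead(RecordSender recordSender), next to the existing
// ORC/SEQ/RC branches (sketch; the surrounding structure follows the open-source code):
} else if (specifiedFileType.equalsIgnoreCase(Constant.PARQUET)) {
    dfsUtil.parquetFileStartRead(sourceFile, this.taskConfig, recordSender,
            this.getTaskPluginCollector());
}
The Job-side check that rejects unsupported fileType values needs Constant.PARQUET added to its allowed set as well, otherwise the job fails validation before any task runs.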
Rebuild and repackage the hdfsreader module, then replace the corresponding plugin package in your DataX deployment.