/**
 * Reads every record of a Parquet file, converts each record to a JSON string and hands it
 * to {@code sendKafka}.
 *
 * <p>The reader is opened in a try-with-resources block so that it is closed even when
 * reading, converting or sending fails part-way through the file.
 *
 * @param kafkaTemplate template passed through to {@code sendKafka}
 * @param topicName     topic name passed through to {@code sendKafka}
 * @param inPath        path of the Parquet file to read
 * @return the number of records that were read and passed to {@code sendKafka}
 * @throws Exception if the reader cannot be built, or if reading or sending a record fails
 */
public int parquetReader(KafkaTemplate<String, String> kafkaTemplate, String topicName, String inPath) throws Exception {
    int recordCount = 0;
    GroupReadSupport readSupport = new GroupReadSupport();
    ParquetReader.Builder<Group> builder = ParquetReader.builder(readSupport, new Path(inPath));
    try (ParquetReader<Group> reader = builder.build()) {
        Group line;
        // read() returns null once the file is exhausted
        while ((line = reader.read()) != null) {
            // Convert the current record to JSON, one field at a time
            JSONObject jsonObject = new JSONObject();
            for (Type type : line.getType().getFields()) {
                converterType2Java(line, type, jsonObject);
            }
            // Send to Kafka
            this.sendKafka(kafkaTemplate, topicName, jsonObject.toJSONString());
            recordCount++;
        }
    }
    logger.info("***sendKafka, filePath:{}, fileSize:{}***", inPath, recordCount);
    return recordCount;
}
/**
 * Reads one field of a Parquet record and stores it in the JSON object as a string, keyed by
 * the field name.
 *
 * <p>Only the first value (index 0) of the field is read. When the field has no value in this
 * record (repetition count 0) nothing is written to {@code jsonObject}.
 *
 * <p>Conversion by primitive type name:
 * <ul>
 *   <li>BOOLEAN, INT32, INT64, FLOAT, DOUBLE: {@code String.valueOf} of the value</li>
 *   <li>INT96: the result of {@link #getTimestampMillis} as a string</li>
 *   <li>FIXED_LEN_BYTE_ARRAY annotated as DECIMAL: plain decimal text with exactly
 *       {@code scale} fraction digits; any other FIXED_LEN_BYTE_ARRAY yields an empty string</li>
 *   <li>anything else: {@code line.getString}</li>
 * </ul>
 *
 * <p>NOTE(review): {@code asPrimitiveType()} is called unconditionally, so nested group fields
 * are presumably not supported here; confirm against the schemas this is used with.
 *
 * @param line       record to read the field from
 * @param type       schema type of the field to convert
 * @param jsonObject target object that receives the field name and its string value
 */
public static void converterType2Java(Group line, Type type, JSONObject jsonObject) {
    String fieldType = type.asPrimitiveType().getPrimitiveTypeName().name();
    String fieldName = type.getName();
    if (line.getFieldRepetitionCount(fieldName) == 0) {
        // Field is absent in this record: leave the JSON object untouched
        return;
    }
    String value = "";
    switch (fieldType) {
        case "BOOLEAN":
            value = String.valueOf(line.getBoolean(fieldName, 0));
            break;
        case "INT32":
            value = String.valueOf(line.getInteger(fieldName, 0));
            break;
        case "INT64":
            value = String.valueOf(line.getLong(fieldName, 0));
            break;
        case "INT96":
            value = String.valueOf(getTimestampMillis(line.getInt96(fieldName, 0)));
            break;
        case "FLOAT":
            value = String.valueOf(line.getFloat(fieldName, 0));
            break;
        case "DOUBLE":
            value = String.valueOf(line.getDouble(fieldName, 0));
            break;
        case "FIXED_LEN_BYTE_ARRAY":
            if (type.getOriginalType() != null && "DECIMAL".equals(type.getOriginalType().name())) {
                int precision = type.asPrimitiveType().getDecimalMetadata().getPrecision();
                int scale = type.asPrimitiveType().getDecimalMetadata().getScale();
                BigDecimal decimalValue = binaryToDecimal(precision, scale, line.getBinary(fieldName, 0).getBytes());
                // Pattern: optional integer digits, one mandatory integer digit, then exactly `scale` fraction digits
                String precisionFormat = String.join("", Collections.nCopies(precision - 1, "#"));
                String scaleFormat = String.join("", Collections.nCopies(scale, "0"));
                String format = precisionFormat + "0." + scaleFormat;
                // Pin the symbols to the root locale so the decimal separator is always '.',
                // independent of the JVM default locale; the value ends up in a JSON payload.
                DecimalFormat decimalFormat = new DecimalFormat(format,
                        java.text.DecimalFormatSymbols.getInstance(java.util.Locale.ROOT));
                value = decimalFormat.format(decimalValue);
            }
            break;
        default:
            value = line.getString(fieldName, 0);
    }
    jsonObject.put(fieldName, value);
}
/** Julian day number of 1970-01-01; subtracted to rebase a Julian day onto the Unix epoch. */
private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
/** Nanoseconds in one millisecond (1,000,000); must be derived from MILLISECONDS, not MICROSECONDS. */
private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);
/** Milliseconds in one day. */
private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
/**
 * Decodes a 12-byte INT96 timestamp value into a millisecond count.
 *
 * <p>Bytes 0-7 are read as a little-endian long holding the nanoseconds within the day and
 * bytes 8-11 as a little-endian int holding the Julian day number.
 *
 * @param timestampBinary raw INT96 value
 * @return {@code julianDayToMillis(julianDay)} plus the time-of-day nanoseconds divided by
 *         {@code NANOS_PER_MILLISECOND}, or 0 when the value is not exactly 12 bytes long
 */
public static long getTimestampMillis(Binary timestampBinary) {
    final int expectedLength = 12;
    if (timestampBinary.length() == expectedLength) {
        byte[] raw = timestampBinary.getBytes();
        // Arguments are passed most-significant byte first, i.e. the stored order reversed
        long nanosOfDay = Longs.fromBytes(raw[7], raw[6], raw[5], raw[4], raw[3], raw[2], raw[1], raw[0]);
        int julianDay = Ints.fromBytes(raw[11], raw[10], raw[9], raw[8]);
        long dayPartMillis = julianDayToMillis(julianDay);
        long timePartMillis = nanosOfDay / NANOS_PER_MILLISECOND;
        return dayPartMillis + timePartMillis;
    }
    // Unexpected length: fall back to 0
    return 0;
}
/**
 * Converts a Julian day number to milliseconds relative to the day
 * {@code JULIAN_EPOCH_OFFSET_DAYS}.
 *
 * @param julianDay Julian day number
 * @return {@code (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY}, computed in long arithmetic
 */
private static long julianDayToMillis(int julianDay) {
    // Widen before subtracting so an out-of-range day number cannot wrap around in int arithmetic
    return ((long) julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
}
/**
 * Converts a big-endian two's-complement unscaled value into a {@link BigDecimal} with the
 * given scale.
 *
 * <p>The conversion is exact: the unscaled integer is never routed through {@code double},
 * and the returned value always carries {@code scale} as its scale.
 *
 * @param precision declared decimal precision; values up to 18 digits fit an unscaled long
 * @param scale     number of digits to the right of the decimal point
 * @param bytes     big-endian two's-complement bytes of the unscaled value
 * @return the decimal value {@code unscaled * 10^-scale}
 * @throws NumberFormatException if {@code bytes} is empty and {@code precision} is greater than 18
 */
static BigDecimal binaryToDecimal(int precision, int scale, byte[] bytes) {
    // Fast path: at most 18 digits stored in at most 8 bytes always fits a signed long
    if (precision <= 18 && bytes.length <= Long.BYTES) {
        // Starting from the first byte as a signed value sign-extends the whole number
        long unscaled = bytes.length == 0 ? 0L : bytes[0];
        for (int i = 1; i < bytes.length; i++) {
            unscaled = (unscaled << 8) | (bytes[i] & 0xff);
        }
        return BigDecimal.valueOf(unscaled, scale);
    }
    // Wider values: BigInteger interprets the same big-endian two's-complement layout
    return new BigDecimal(new BigInteger(bytes), scale);
}
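// Article title: "Converting a Parquet file to JSONObject" (parquet文件转JsonObject)
// Page note: latest recommended article published on 2024-05-16 10:23:10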