文件:
https://pan.baidu.com/s/1G3cMVqVMgVm1f2p7aVKhXw
提取码: 3wj5
依赖:
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.7.0</version>
</dependency>
代码:
import com.nature.third.utils.StringUtils;
import freemarker.template.SimpleDate;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;
public class ParquetUtil {

    /** Millis in one day as a long constant — avoids the int overflow of {@code day*24*60*60*1000}. */
    private static final long MILLIS_PER_DAY = 24L * 60 * 60 * 1000;

    /** Demo entry point: dumps the schema and data of a sample parquet file. */
    public static void main(String[] args) throws IOException, ParseException {
        String path = "hdfs://192.168.123.123:9000/types2_1617865619036.parquet";
        path = "C:\\Users\\1112\\Desktop\\types2_1617865619036.parquet";
        String res = getColumn(path);
    }

    /**
     * Reads the parquet file at {@code inPath}, prints its column names/types and
     * all row values, and returns the column-description string.
     *
     * <p>NOTE(review): assumes the file contains an INT32/DATE column named "date"
     * and an INT96 timestamp column named "time" — confirm against the producer.
     *
     * @param inPath local or HDFS path of the parquet file
     * @return one line per column, {@code 字段名:x\t字段类型:y}; empty string when
     *         the path is empty or the file has no rows
     * @throws IOException when the file cannot be opened or read
     * @throws ParseException kept for interface compatibility (not thrown here)
     */
    public static String getColumn(String inPath) throws IOException, ParseException {
        String res = "";
        if (StringUtils.isNotEmpty(inPath)) {
            GroupReadSupport readSupport = new GroupReadSupport();
            ParquetReader<Group> build = ParquetReader.builder(readSupport, new Path(inPath)).build();
            try {
                // 读取内容, 想要内容遍历 line 即可, 方式类似 io
                Group line = build.read();
                if (line == null) {
                    // empty file: no first row to take the schema from
                    return res;
                }
                List<Type> typeList = line.getType().getFields();
                StringBuilder sbu = new StringBuilder();
                StringBuilder sbuV = new StringBuilder();
                StringBuilder sbuNew = new StringBuilder();
                if (typeList != null && !typeList.isEmpty()) {
                    for (Type type : typeList) {
                        String colName = type.getName();
                        String colType = getJavaType(type);
                        sbuV.append(colName).append("\t");
                        sbu.append("字段名:" + colName).append("\t").append("字段类型:" + colType).append("\n");
                    }
                    sbuNew.append("new_date").append("\t");
                    sbuNew.append("new_time").append("\n");
                    if (sbu.length() > 0) {
                        // drop the trailing newline
                        res = sbu.substring(0, sbu.length() - 1);
                    }
                }
                System.out.println(res);
                int size = typeList == null ? 0 : typeList.size();
                // Both converted values are epoch-based GMT millis (see
                // ParquetTimestampUtils.getTimestampMillis), so the formatters are pinned
                // to GMT; the default zone could shift dates near midnight. They are also
                // created once instead of once per row/column as before.
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
                SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                timeFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
                while (line != null) {
                    sbuV.append("\n");
                    sbuNew.append("\n");
                    for (int i = 0; i < size; i++) {
                        String value = ((SimpleGroup) line).getValueToString(i, 0);
                        sbuV.append(value).append("\t");
                    }
                    // Per-row conversions, hoisted out of the column loop where they were
                    // needlessly repeated once per column. Both values reset each row, so
                    // a null "time" no longer carries over the previous row's timestamp.
                    String newDate = "";
                    String newTime = "";
                    if (size > 0) {
                        // "date" is stored as days since the Unix epoch
                        int timeDay = ((SimpleGroup) line).getInteger("date", 0);
                        newDate = dateFormat.format(timeDay * MILLIS_PER_DAY);
                        Binary bin = ((SimpleGroup) line).getInt96("time", 0);
                        if (bin != null) {
                            long longTime = ParquetTimestampUtils.getTimestampMillis(bin);
                            newTime = timeFormat.format(longTime);
                        }
                    }
                    sbuNew.append(newDate).append("\t");
                    sbuNew.append(newTime);
                    line = build.read();
                }
                System.out.println(sbuV.toString());
                System.out.println(sbuNew.toString());
            } finally {
                // the reader was previously never closed (resource leak)
                build.close();
            }
        }
        return res;
    }

    /**
     * Maps a parquet column type to its Chinese display-type label.
     * INT32 columns whose {@link OriginalType} is DATE are reported as 日期型.
     *
     * @param type parquet field type (must be a primitive type)
     * @return one of 日期型 / 整型 / 浮点型 / 字符型
     */
    private static String getJavaType(Type type) {
        String stringType = "字符型";
        String dateType = "日期型";
        String intType = "整型";
        String doubleType = "浮点型";
        String schemaType = type.asPrimitiveType().getPrimitiveTypeName().name();
        if ("int32".equalsIgnoreCase(schemaType)) {
            OriginalType originalType = type.getOriginalType();
            if (originalType != null && "date".equalsIgnoreCase(originalType.name())) {
                schemaType = "date";
            }
        }
        switch (schemaType.toLowerCase()) {
            case "date":
            case "int96":        // parquet encodes timestamps as INT96
                return dateType;
            case "int32":
            case "int64":
                return intType;
            case "float":
            case "double":
                return doubleType;
            case "boolean":
            case "binary":
            default:
                return stringType;
        }
    }
}
import java.util.concurrent.TimeUnit;
import org.apache.parquet.io.api.Binary;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
public class ParquetTimestampUtils {

    /** Julian day number of the Unix epoch (1970-01-01). */
    private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
    private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
    private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);

    /** Static utility holder — not instantiable. */
    private ParquetTimestampUtils() {}

    /**
     * Decodes a parquet INT96 timestamp into epoch millis, GMT.
     *
     * <p>The 12-byte layout is: 8 bytes of little-endian nanos-of-day followed by
     * 4 bytes of little-endian Julian day number.
     *
     * @param timestampBinary INT96 parquet timestamp value
     * @return millis since the Unix epoch (GMT), or 0 when the input is not
     *         exactly 12 bytes (malformed input is tolerated, not thrown)
     */
    public static long getTimestampMillis(Binary timestampBinary) {
        if (timestampBinary.length() != 12) {
            return 0;
        }
        byte[] raw = timestampBinary.getBytes();
        // on-disk order is little endian, so feed the bytes in reversed order
        long nanosOfDay = Longs.fromBytes(raw[7], raw[6], raw[5], raw[4], raw[3], raw[2], raw[1], raw[0]);
        int julianDay = Ints.fromBytes(raw[11], raw[10], raw[9], raw[8]);
        return julianDayToMillis(julianDay) + nanosOfDay / NANOS_PER_MILLISECOND;
    }

    /** Converts a Julian day number to millis since the Unix epoch. */
    private static long julianDayToMillis(int julianDay) {
        return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
    }
}
源文件数据:
输出结果
new_date 和 new_time 属于二次转换的结果