获得parquet文件的schema 合并parquet小文件

获得parquet文件的schema

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.schema.MessageType;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

/**
 * Command-line utility that prints the schema stored in a Parquet file's footer.
 *
 * <p>The single argument is either a Parquet file or a directory. For a directory,
 * the footer of the first non-directory entry accepted by {@code HiddenFileFilter}
 * is read; the remaining files are not inspected.
 */
public class ParquetSchemaPrint {

    /**
     * Reads the footer of the given Parquet file (or of one file in the given
     * directory) and prints its schema to standard output.
     *
     * @param args {@code args[0]} is the path of a Parquet file or of a directory
     * @throws IllegalArgumentException if no path argument is supplied
     * @throws RuntimeException if the directory is empty or contains no files
     * @throws Exception if the file system or the Parquet footer cannot be read
     */
    public static void main(String[] args) throws Exception{
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: ParquetSchemaPrint <parquet file or directory>");
        }
        String input = args[0];

        Configuration conf = new Configuration();

        Path path = new Path(input);
        // Resolve the file system from the path itself so fully-qualified URIs work.
        FileSystem fs = path.getFileSystem(conf);
        Path file = null;
        if (fs.isDirectory(path)) {
            FileStatus[] statuses = fs.listStatus(path, HiddenFileFilter.INSTANCE);
            if (statuses.length == 0) {
                throw new RuntimeException("Directory " + path.toString() + " is empty");
            }
            // The first entry may be a sub-directory; skip ahead to the first
            // entry that is an actual file.
            for (FileStatus status : statuses) {
                if (!status.isDirectory()) {
                    file = status.getPath();
                    break;
                }
            }
            if (file == null) {
                throw new RuntimeException("Directory " + path.toString() + " contains no files");
            }
        } else {
            file = path;
        }
        ParquetMetadata metaData = ParquetFileReader.readFooter(conf, file, NO_FILTER);
        MessageType schema = metaData.getFileMetaData().getSchema();

        System.out.println(schema);
    }
}

输出内容如下:

message hive_schema {
  optional binary code (UTF8);
  optional int32 day;
  optional double kp;
  optional double zg;
  optional double zd;
  optional double sp;
  optional double cjl;
  optional double cje;
}

合并parquet文件

合并文件的前提是知道文件的 schema。前面的代码已经获得了 parquet 文件的 schema,具体的合并代码如下:

/**
     * Merges the Parquet files found under {@code src} into larger Parquet files.
     *
     * <p>Every file returned by {@code getPaths} is read record by record and
     * rewritten using a hard-coded schema. Output files are created inside
     * {@code src} with names of the form
     * {@code combine<currentTimeMillis>_<index>_.parquet}; a new output file is
     * started every {@code fileBatch} input files. The input files are not deleted.
     *
     * @param src path of the directory containing the Parquet files
     * @param fileBatch number of input files merged into one output file; must be positive
     * @throws IllegalArgumentException if {@code fileBatch} is not positive
     * @throws Exception if listing, reading or writing a file fails
     */
    public static void combineParquet(String src,int fileBatch)throws Exception{
        // Reject a non-positive batch size up front: 0 would otherwise fail with an
        // ArithmeticException on "i % fileBatch" after an output file was already created.
        if (fileBatch <= 0) {
            throw new IllegalArgumentException("fileBatch must be positive: " + fileBatch);
        }

        String schemastr = "message hive_schema {\n" +
                "  optional binary code (UTF8);\n" +
                "  optional int32 day;\n" +
                "  optional double kp;\n" +
                "  optional double zg;\n" +
                "  optional double zd;\n" +
                "  optional double sp;\n" +
                "  optional double cjl;\n" +
                "  optional double cje;\n" +
                "}\n" +
                "\n";
        Configuration configuration = new Configuration();
        MessageType schema = MessageTypeParser.parseMessageType(schemastr);
        // The input list is taken before the first output file is created, so the
        // outputs of this run are not picked up as inputs.
        List<Path> fileList = getPaths(configuration, src);
        String outPath = src+"/combine"+System.currentTimeMillis()+"_0_"+".parquet";

        GroupWriteSupport writeSupport = new GroupWriteSupport();
        writeSupport.setSchema(schema,configuration);
        ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(outPath),configuration,writeSupport);

        try {
            for(int i=0;i<fileList.size();i++){

                if(i>0&&i%fileBatch==0){
                    // Roll over to a new output file. The reference is cleared before
                    // closing so the finally block never closes the same writer twice.
                    ParquetWriter<Group> finished = writer;
                    writer = null;
                    finished.close();
                    outPath = src+"/combine"+System.currentTimeMillis()+"_"+i+"_"+".parquet";
                    writer = new ParquetWriter<Group>(new Path(outPath),configuration,writeSupport);
                }
                System.out.println("正在合并文件:"+fileList.get(i));
                ParquetReader<Group> reader =
                        ParquetReader.builder(new GroupReadSupport(), fileList.get(i)).build();
                try {
                    Group line;
                    while ((line = reader.read()) != null) {
                        writer.write(line);
                    }
                } finally {
                    // Release the input stream of each source file as soon as it is consumed.
                    reader.close();
                }
            }
        } finally {
            // Close the current output file even when reading or writing failed.
            if (writer != null) {
                writer.close();
            }
        }

        System.out.println("读取结束");

    }

    /**
     * Lists every file under the given directory, recursing into sub-directories.
     *
     * <p>No name filter is applied, so the result contains whatever files the
     * recursive listing returns.
     *
     * @param conf Hadoop configuration used to resolve the file system
     * @param src path of the directory to list
     * @return the paths of all files found under {@code src}
     * @throws Exception if the file system cannot be resolved or listed
     */
    public static List<Path> getPaths(Configuration conf,String src)throws Exception{
        Path root = new Path(src);
        // Use the file system that owns the path rather than the configured default,
        // so a fully-qualified src on another file system is listed correctly.
        FileSystem fs = root.getFileSystem(conf);
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(root, true);
        List<Path> fileList = new ArrayList<Path>();
        while(files.hasNext()){
            fileList.add(files.next().getPath());
        }

        return fileList;
    }
  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值