1. Introduction
A diagram on the official site may help in understanding the Parquet file format and its contents; its key points are summarized below.
Parquet's columnar layout is what gives it good compression ratios and fast filtering.
- A file contains multiple row groups.
- A row group contains multiple columns (column chunks).
- A column chunk contains multiple pages.
- Metadata is kept at three levels: FileMetaData, row group metadata, and column (chunk) metadata.
Units of parallelism:
- MapReduce: file / row group
- IO: column chunk
- Encoding/compression: page
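To see this hierarchy in an actual file, you can walk the footer metadata with ParquetFileReader. This is a minimal sketch, not part of the article's original examples; the file path is a placeholder, and the API is the same parquet-hadoop API used throughout this article:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class PrintParquetLayout {
    public static void main(String[] args) throws Exception {
        // placeholder path: point it at any existing parquet file
        Path path = new Path("file:///tmp/parquet/book.parquet");
        HadoopInputFile inputFile = HadoopInputFile.fromPath(path, new Configuration());
        try (ParquetFileReader reader = ParquetFileReader.open(inputFile, ParquetReadOptions.builder().build())) {
            // one BlockMetaData per row group
            for (BlockMetaData rowGroup : reader.getFooter().getBlocks()) {
                System.out.println("row group: " + rowGroup.getRowCount() + " rows");
                // one ColumnChunkMetaData per column chunk; pages live inside the chunk
                for (ColumnChunkMetaData column : rowGroup.getColumns()) {
                    System.out.println("  column " + column.getPath()
                            + " codec=" + column.getCodec()
                            + " size=" + column.getTotalSize());
                }
            }
        }
    }
}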
2. Schema (MessageType)
Every schema has a root called message, and a message contains multiple fields.
Each field has three attributes:
repetition, type, and name.
repetition can be one of the following:
- required: exactly one occurrence
- optional: zero or one occurrence (at most one)
- repeated: zero or more occurrences
type can be: int32, int64, int96, float, double, boolean, binary, or group.
All of these except group are called primitive types; a group composes primitive types. For example:
message Book {
    required binary bookName (UTF8);
    required boolean market;
    required double price;
    repeated group author {
        required binary name (UTF8);
        required int32 age;
    }
}
3. Obtaining a MessageType
3.1 Parsing from a string
public static MessageType getMessageTypeFromString() {
    String schemaString = "message Book {\n" +
            " required binary bookName (UTF8);\n" +
            " required boolean market;\n" +
            " required double price;\n" +
            " repeated group author {\n" +
            " required binary name (UTF8);\n" +
            " required int32 age;\n" +
            " }\n" +
            "}";
    MessageType schema = MessageTypeParser.parseMessageType(schemaString);
    return schema;
}
3.2 Building it in code
Note that author must be built with repeatedGroup() to match the repeated group in the schema string above:
public static MessageType getMessageTypeFromCode() {
    MessageType messageType = Types.buildMessage()
            .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("bookName")
            .required(PrimitiveType.PrimitiveTypeName.BOOLEAN).named("market")
            .required(PrimitiveType.PrimitiveTypeName.DOUBLE).named("price")
            .repeatedGroup()
                .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
                .required(PrimitiveType.PrimitiveTypeName.INT32).named("age")
            .named("author")
            .named("Book");
    System.out.println(messageType.toString());
    return messageType;
}
3.3 Reading it from a Parquet file
public static MessageType getMessageType(Path path, Configuration configuration) throws IOException {
    HadoopInputFile hadoopInputFile = HadoopInputFile.fromPath(path, configuration);
    ParquetFileReader parquetFileReader = ParquetFileReader.open(hadoopInputFile, ParquetReadOptions.builder().build());
    ParquetMetadata metaData = parquetFileReader.getFooter();
    MessageType schema = metaData.getFileMetaData().getSchema();
    // remember to close the reader
    parquetFileReader.close();
    return schema;
}
Besides reading the schema from the footer metadata, you can also get it from any record (Group) you have already read:
org.apache.parquet.example.data.Group
org.apache.parquet.example.data.simple.SimpleGroup
// group can be an instance of either of the two types above
System.out.println("schema:" + group.getType().toString());
Remember to close the reader once you have the schema, otherwise you may eventually hit java.net.SocketException: Too many open files.
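Since ParquetFileReader implements Closeable, a try-with-resources block guarantees the close even when an exception is thrown. A minimal variant of the method from 3.3:
public static MessageType getMessageTypeSafely(Path path, Configuration configuration) throws IOException {
    HadoopInputFile inputFile = HadoopInputFile.fromPath(path, configuration);
    // the reader is closed automatically, even if reading the footer fails
    try (ParquetFileReader reader = ParquetFileReader.open(inputFile, ParquetReadOptions.builder().build())) {
        return reader.getFooter().getFileMetaData().getSchema();
    }
}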
3.4 Complete example
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.*;
import java.io.IOException;

public class GetParquetSchema {

    public static void main(String[] args) throws Exception {
        // local file
        String localPath = "file:///D:\\tmp\\parquet\\book.parquet";
        // HDFS file
        String hdfsPath = "/tmp/parquet/book.parquet";
        Configuration localConfiguration = new Configuration();
        Configuration hdfsConfiguration = new Configuration();
        hdfsConfiguration.set(FileSystem.FS_DEFAULT_NAME_KEY, "hdfs://192.168.8.206:9000");
        MessageType messageType = getMessageType(new Path(localPath), localConfiguration);
        System.out.println(messageType);
        System.out.println("--------------");
        messageType = getMessageType(new Path(hdfsPath), hdfsConfiguration);
        System.out.println(messageType);
        // getMessageTypeFromCode();
        // getMessageTypeFromString();
    }

    public static MessageType getMessageType(Path path, Configuration configuration) throws IOException {
        HadoopInputFile hadoopInputFile = HadoopInputFile.fromPath(path, configuration);
        ParquetFileReader parquetFileReader = ParquetFileReader.open(hadoopInputFile, ParquetReadOptions.builder().build());
        ParquetMetadata metaData = parquetFileReader.getFooter();
        MessageType schema = metaData.getFileMetaData().getSchema();
        // remember to close the reader
        parquetFileReader.close();
        return schema;
    }

    public static MessageType getMessageTypeFromCode() {
        MessageType messageType = Types.buildMessage()
                .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("bookName")
                .required(PrimitiveType.PrimitiveTypeName.BOOLEAN).named("market")
                .required(PrimitiveType.PrimitiveTypeName.DOUBLE).named("price")
                .repeatedGroup()
                    .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("name")
                    .required(PrimitiveType.PrimitiveTypeName.INT32).named("age")
                .named("author")
                .named("Book");
        System.out.println(messageType.toString());
        return messageType;
    }

    public static MessageType getMessageTypeFromString() {
        String schemaString = "message Book {\n" +
                " required binary bookName (UTF8);\n" +
                " required boolean market;\n" +
                " required double price;\n" +
                " repeated group author {\n" +
                " required binary name (UTF8);\n" +
                " required int32 age;\n" +
                " }\n" +
                "}";
        MessageType schema = MessageTypeParser.parseMessageType(schemaString);
        return schema;
    }
}
4. Reading and writing Parquet
4.1 Local files
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import java.io.IOException;
import java.util.Random;

public class LocalParquetWriteRead {

    public static final String DATA_PATH = "file:///D:\\tmp\\parquet\\book.parquet";

    private static String schemaStr = "message Book {\n" +
            " required binary bookName (UTF8);\n" +
            " required boolean market;\n" +
            " required double price;\n" +
            " repeated group author {\n" +
            " required binary name (UTF8);\n" +
            " required int32 age;\n" +
            " }\n" +
            "}";

    private final static MessageType schema = MessageTypeParser.parseMessageType(schemaStr);

    public static void main(String[] args) throws IOException {
        // run write() once before read()
        // write();
        read();
    }

    public static void write() throws IOException {
        Path path = new Path(DATA_PATH);
        Configuration configuration = new Configuration();
        ExampleParquetWriter.Builder builder = ExampleParquetWriter
                .builder(path).withWriteMode(ParquetFileWriter.Mode.CREATE)
                .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                .withConf(configuration)
                .withType(schema);
        ParquetWriter<Group> writer = builder.build();
        SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
        Random random = new Random();
        for (int i = 0; i < 100; i++) {
            Group group = groupFactory.newGroup();
            group.append("bookName", "bookName" + i)
                    .append("market", random.nextBoolean())
                    .append("price", random.nextDouble())
                    .addGroup("author")
                    .append("name", "aname" + i)
                    .append("age", 18 + random.nextInt(72));
            writer.write(group);
        }
        writer.close();
    }

    public static void read() throws IOException {
        Path path = new Path(DATA_PATH);
        ParquetReader.Builder<Group> builder = ParquetReader.builder(new GroupReadSupport(), path);
        ParquetReader<Group> reader = builder.build();
        Group group;
        while ((group = reader.read()) != null) {
            System.out.println("schema:" + group.getType().toString());
            System.out.println(group.getString("bookName", 0));
            System.out.println(group.getBoolean("market", 0));
            System.out.println(group.getDouble("price", 0));
            Group author = group.getGroup("author", 0);
            System.out.println(author.getString("name", 0));
            System.out.println(author.getInteger("age", 0));
        }
        reader.close();
    }
}
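The writer builder also exposes the knobs that correspond to the units of parallelism from the introduction: row group size, page size, and dictionary encoding. Below is a hedged variation on the builder used in write() above; the values shown are illustrative, not recommendations:
ExampleParquetWriter.Builder builder = ExampleParquetWriter
        .builder(path)
        .withWriteMode(ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(CompressionCodecName.SNAPPY)
        .withConf(configuration)
        .withType(schema)
        .withRowGroupSize(128 * 1024 * 1024) // target row group size in bytes
        .withPageSize(1024 * 1024)           // target page size in bytes
        .withDictionaryEncoding(true);       // enable dictionary encoding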
4.2 HDFS files
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import java.io.IOException;
import java.util.Random;

public class HdfsParquetWriteRead {

    public static final String DATA_PATH = "/tmp/parquet/book.parquet";

    private static String schemaStr = "message Book {\n" +
            " required binary bookName (UTF8);\n" +
            " required boolean market;\n" +
            " required double price;\n" +
            " repeated group author {\n" +
            " required binary name (UTF8);\n" +
            " required int32 age;\n" +
            " }\n" +
            "}";

    private final static MessageType schema = MessageTypeParser.parseMessageType(schemaStr);

    public static void main(String[] args) throws IOException {
        // avoid: Permission denied: user=xxx, access=WRITE, inode="/tmp/parquet":root:supergroup:drwxr-xr-x
        System.setProperty("HADOOP_USER_NAME", "root");
        write();
        // read();
    }

    public static void write() throws IOException {
        Configuration configuration = new Configuration();
        configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "hdfs://192.168.8.206:9000");
        Path path = new Path(DATA_PATH);
        ExampleParquetWriter.Builder builder = ExampleParquetWriter
                .builder(path).withWriteMode(ParquetFileWriter.Mode.CREATE)
                .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
                .withCompressionCodec(CompressionCodecName.SNAPPY)
                .withConf(configuration)
                .withType(schema);
        ParquetWriter<Group> writer = builder.build();
        SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
        Random random = new Random();
        for (int i = 0; i < 100; i++) {
            Group group = groupFactory.newGroup();
            group.append("bookName", "bookName" + i)
                    .append("market", random.nextBoolean())
                    .append("price", random.nextDouble())
                    .addGroup("author")
                    .append("name", "aname" + i)
                    .append("age", 18 + random.nextInt(72));
            writer.write(group);
        }
        writer.close();
    }

    public static void read() throws IOException {
        Path path = new Path(DATA_PATH);
        Configuration configuration = new Configuration();
        configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "hdfs://192.168.8.206:9000");
        GroupReadSupport groupReadSupport = new GroupReadSupport();
        ParquetReader.Builder<Group> builder = ParquetReader.builder(groupReadSupport, path).withConf(configuration);
        ParquetReader<Group> reader = builder.build();
        Group group;
        while ((group = reader.read()) != null) {
            System.out.println("schema:" + group.getType().toString());
            System.out.println(group.getString("bookName", 0));
            System.out.println(group.getBoolean("market", 0));
            System.out.println(group.getDouble("price", 0));
            Group author = group.getGroup("author", 0);
            System.out.println(author.getString("name", 0));
            System.out.println(author.getInteger("age", 0));
        }
        reader.close();
    }
}
The second argument of group.getString("bookName", 0) is an index meant for repeated fields: it selects the n-th value, counting from 0. A required field always has exactly one value, so for required fields the index is always 0.
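Accordingly, for a genuinely repeated field you first ask how many values the record holds, then loop over the index. A small sketch against the Book schema above, where author is the repeated field and group is a record returned by reader.read():
// number of author entries in this record (0..n for a repeated field)
int authorCount = group.getFieldRepetitionCount("author");
for (int i = 0; i < authorCount; i++) {
    Group author = group.getGroup("author", i);
    System.out.println(author.getString("name", 0) + " / " + author.getInteger("age", 0));
}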
5. Merging small Parquet files
Merging Parquet files is a very practical operation: when data is collected through a messaging system such as Kafka, many small files tend to accumulate.
Too many small files slow an application down, so we need to merge them.
package org.curitis.parquet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

public class MergeHdfsParquetFile {

    private static FileSystem fileSystem;

    static {
        System.setProperty("HADOOP_USER_NAME", "root");
        try {
            fileSystem = FileSystem.get(getConfiguration());
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    public static void main(String[] args) throws Exception {
        mergeParquet("/tmp/merge");
    }

    private static void mergeParquet(String dir) throws Exception {
        MessageType messageType = checkSchemaSame(dir);
        if (messageType == null) { // schemas differ, or there is nothing to merge
            return;
        }
        List<Path> parquetPaths = getParquetPaths(dir);
        String dest = dir + "/merge-" + System.currentTimeMillis() + ".parquet";
        Path destPath = new Path(dest);
        ParquetWriter<Group> parquetWriter = getParquetWriter(messageType, destPath);
        ParquetReader<Group> parquetReader;
        Group book;
        for (Path path : parquetPaths) {
            parquetReader = getParquetReader(path);
            while ((book = parquetReader.read()) != null) {
                parquetWriter.write(book);
            }
            // close each source reader to avoid leaking file handles
            parquetReader.close();
        }
        parquetWriter.close();
        if (fileSystem.exists(destPath)) {
            FileStatus fileStatus = fileSystem.getFileStatus(destPath);
            if (fileStatus.getLen() <= 1024) {
                System.err.println(dir + ": merged file is suspiciously small, please check it before deleting the source files");
            } else {
                for (Path path : parquetPaths) {
                    fileSystem.delete(path, false);
                }
            }
        }
    }

    public static List<Path> getParquetPaths(String dir) throws Exception {
        Path dirPath = new Path(dir);
        RemoteIterator<LocatedFileStatus> locatedFileStatusRemoteIterator = fileSystem.listFiles(dirPath, false);
        List<Path> fileList = new ArrayList<Path>();
        while (locatedFileStatusRemoteIterator.hasNext()) {
            LocatedFileStatus next = locatedFileStatusRemoteIterator.next();
            Path path = next.getPath();
            FileStatus fileStatus = fileSystem.getFileStatus(path);
            if (fileStatus.isFile() && path.getName().endsWith(".parquet")) { // only collect parquet files
                fileList.add(path);
            }
        }
        return fileList;
    }

    private static MessageType checkSchemaSame(String dir) throws Exception {
        List<MessageType> groupTypes = getMessageType(dir);
        int size = groupTypes.size();
        if (size == 0 || size == 1) { // nothing to merge with zero or one file
            return null;
        }
        MessageType groupType = groupTypes.get(0);
        for (MessageType gt : groupTypes) {
            if (!groupType.equals(gt)) {
                return null;
            }
        }
        return groupType;
    }

    private static List<MessageType> getMessageType(String dir) throws Exception {
        List<Path> parquetPaths = getParquetPaths(dir);
        LinkedList<MessageType> groupTypes = new LinkedList<>();
        for (Path path : parquetPaths) {
            groupTypes.add(getMessageType(path));
        }
        return groupTypes;
    }

    public static MessageType getMessageType(Path path) throws IOException {
        HadoopInputFile hadoopInputFile = HadoopInputFile.fromPath(path, getConfiguration());
        ParquetFileReader parquetFileReader = ParquetFileReader.open(hadoopInputFile, ParquetReadOptions.builder().build());
        ParquetMetadata metaData = parquetFileReader.getFooter();
        MessageType schema = metaData.getFileMetaData().getSchema();
        parquetFileReader.close();
        return schema;
    }

    private static Configuration getConfiguration() {
        Configuration configuration = new Configuration();
        // with fs.defaultFS set here, paths don't need the hdfs://127.0.0.1:9000 prefix
        configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "hdfs://127.0.0.1:9000");
        return configuration;
    }

    public static ParquetReader<Group> getParquetReader(Path path) throws IOException {
        GroupReadSupport readSupport = new GroupReadSupport();
        ParquetReader.Builder<Group> builder = ParquetReader.builder(readSupport, path);
        builder.withConf(getConfiguration());
        ParquetReader<Group> parquetReader = builder.build();
        return parquetReader;
    }

    public static ParquetWriter<Group> getParquetWriter(MessageType schema, Path path) throws IOException {
        ExampleParquetWriter.Builder writebuilder = ExampleParquetWriter.builder(path);
        writebuilder.withWriteMode(ParquetFileWriter.Mode.CREATE);
        writebuilder.withCompressionCodec(CompressionCodecName.SNAPPY);
        writebuilder.withConf(getConfiguration());
        writebuilder.withType(schema);
        ParquetWriter<Group> writer = writebuilder.build();
        return writer;
    }
}
The code above first checks that every Parquet file in the directory has the same schema, and only merges when they all match.
If the schemas differ you cannot simply write the groups straight through; you have to copy field by field. Without nested groups this is fairly easy to handle; with nested groups it gets a bit more involved. A sketch of the flat case follows.
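Below is a hedged sketch of the flat (non-nested) case. copyCommonFields is a hypothetical helper, not part of the Parquet API: it copies the primitive fields two schemas share, by name, and dst is assumed to be a fresh group created from a SimpleGroupFactory over the target schema.
import org.apache.parquet.example.data.Group;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.Type;

public class FieldByFieldCopy {
    // Hypothetical helper: copy every primitive field present in both schemas
    // from src into dst. Nested groups would need a recursive variant.
    public static void copyCommonFields(Group src, Group dst) {
        GroupType srcType = src.getType();
        GroupType dstType = dst.getType();
        for (Type field : dstType.getFields()) {
            String name = field.getName();
            if (!field.isPrimitive() || !srcType.containsField(name)) {
                continue;
            }
            // 0 for absent optional fields, 1 for required, 0..n for repeated
            int count = src.getFieldRepetitionCount(name);
            for (int i = 0; i < count; i++) {
                switch (field.asPrimitiveType().getPrimitiveTypeName()) {
                    case INT32:   dst.add(name, src.getInteger(name, i)); break;
                    case INT64:   dst.add(name, src.getLong(name, i));    break;
                    case FLOAT:   dst.add(name, src.getFloat(name, i));   break;
                    case DOUBLE:  dst.add(name, src.getDouble(name, i));  break;
                    case BOOLEAN: dst.add(name, src.getBoolean(name, i)); break;
                    case BINARY:  dst.add(name, src.getBinary(name, i));  break;
                    default: break; // INT96 and FIXED_LEN_BYTE_ARRAY left out of this sketch
                }
            }
        }
    }
}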
6. The pom file
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
                             http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.curitis</groupId>
    <artifactId>hadoop-learn</artifactId>
    <version>1.0.0</version>

    <properties>
        <spring.version>5.1.3.RELEASE</spring.version>
        <junit.version>4.11</junit.version>
        <hadoop.version>3.0.2</hadoop.version>
        <parquet.version>1.10.1</parquet.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.fasterxml.jackson.core</groupId>
                    <artifactId>*</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- parquet -->
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-hadoop</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-column</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-common</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.parquet</groupId>
            <artifactId>parquet-encoding</artifactId>
            <version>${parquet.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.56</version>
        </dependency>
        <!-- log -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.7</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-core</artifactId>
            <version>1.1.7</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>1.1.7</version>
        </dependency>
        <!-- test -->
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-test</artifactId>
            <version>${spring.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <dependencyManagement>
        <dependencies>
            <dependency>
                <groupId>io.netty</groupId>
                <artifactId>netty-all</artifactId>
                <version>4.1.25.Final</version>
            </dependency>
        </dependencies>
    </dependencyManagement>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>