Scenario: ingesting data from an external API into the data warehouse.
Hive version: 2.1.1
Hadoop version: 3.0.0
Maven dependencies (${hadoop.version} and ${hive.version} are assumed to be defined in the POM's <properties> section as 3.0.0 and 2.1.1 respectively):
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.60</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>${hadoop.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-column</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>${hive.version}</version>
</dependency>
Key code for writing the data to HDFS.
Each batch is written as a new file under the partition directory. Note that the writer is opened with ParquetFileWriter.Mode.CREATE: a Parquet file cannot be appended to once it is closed, so "append" here means adding new files to the partition, not appending records to an existing file.
/**
 * @desc Create a Parquet writer for one target file.
 *       ParquetFileWriter.Mode.CREATE creates a new file and fails if the path already
 *       exists; every batch therefore gets its own file.
 * @param filePath   full HDFS path of the Parquet file to create
 * @param fileSchema Parquet schema (MessageType) of the target table
 * @return a ParquetWriter that accepts Group records
 * @throws IOException if the file cannot be created on HDFS
 */
private ParquetWriter<Group> getWriter(String filePath, MessageType fileSchema) throws IOException {
    Path path = new Path(filePath);
    return ExampleParquetWriter.builder(path)
            .withWriteMode(ParquetFileWriter.Mode.CREATE)
            .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
            .withConf(conf) // conf is a class-level Hadoop Configuration, see the sketch below
            .withType(fileSchema)
            .build();
}
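The snippets reference several class-level members (conf, FS_PATH, uuid, logger, plus ExceptionUtils from org.apache.commons.lang3) that are not shown above. A minimal sketch of how they might be declared follows; the class name, NameNode address, and warehouse root path are placeholders, not part of the original code:

import java.util.UUID;

import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ParquetPartitionWriter {

    private static final Logger logger = LoggerFactory.getLogger(ParquetPartitionWriter.class);

    // Root HDFS directory holding the external tables' data (placeholder, adjust to your cluster).
    private static final String FS_PATH = "hdfs://nameservice1/user/hive/warehouse/ods.db/";

    // Hadoop configuration handed to the Parquet writer. It picks up core-site.xml /
    // hdfs-site.xml from the classpath; fs.defaultFS can also be set explicitly.
    private final Configuration conf = new Configuration();

    // Each batch is written to a freshly named file so that new batches never collide
    // with files already present in the same partition directory.
    private final String uuid = UUID.randomUUID().toString() + ".parquet";

    public ParquetPartitionWriter() {
        conf.set("fs.defaultFS", "hdfs://nameservice1"); // placeholder NameNode / nameservice
    }

    // getWriter(...) and write(...) from this post live in this class.
}

Generating uuid once per instance produces one file per writer object; generating it inside write(...) for every batch works just as well.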
Writing the batch into the partitioned table's HDFS directory:
/**
 * @desc Write one batch of rows into the HDFS directory of a single partition of the target table.
 * @param partition partition directory name, e.g. "dt=20230101"
 * @param tableId   target table name; also used as the Parquet message type name
 * @param rows      records of the current batch
 */
public void write(String partition, String tableId, List<Row> rows) {
    try {
        // Define the table schema: eleven optional string (BINARY / UTF8) columns.
        MessageType messageType = Types.buildMessage()
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column1")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column2")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column3")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column4")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column5")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column6")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column7")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column8")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column9")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column10")
                .optional(PrimitiveType.PrimitiveTypeName.BINARY)
                .as(LogicalTypeAnnotation.stringType())
                .named("column11").named(tableId);
        SimpleGroupFactory simpleGroupFactory = new SimpleGroupFactory(messageType);
        // A single writer accepts any number of records, so the whole batch ends up in one Parquet file.
        // try-with-resources ensures the file is closed (and its footer written) even if a record fails.
        try (ParquetWriter<Group> writer =
                     getWriter(FS_PATH + tableId + "/" + partition + "/" + uuid, messageType)) {
            for (Row row : rows) {
                try {
                    Group group = simpleGroupFactory.newGroup();
                    // Placeholder values; in practice each column is filled from the corresponding
                    // field of `row` (optional columns with no value can simply be skipped).
                    group.append("column1", "value1");
                    group.append("column2", "value2");
                    group.append("column3", "value3");
                    writer.write(group);
                } catch (Exception e) {
                    logger.error("failed to write row: " + row, e);
                }
            }
        }
    } catch (Exception e) {
        logger.error(ExceptionUtils.getStackTrace(e));
    }
}
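For context, a hypothetical invocation could look like the following; the class name ParquetPartitionWriter, the table name ods_api_data, the partition value, and the fetchRowsFromApi() helper are illustrative assumptions, and Row is the application's own record type:

ParquetPartitionWriter parquetWriter = new ParquetPartitionWriter();
List<Row> batch = fetchRowsFromApi();                 // hypothetical: pull one page of API results
parquetWriter.write("dt=20230101", "ods_api_data", batch);
// Writes FS_PATH/ods_api_data/dt=20230101/<uuid>.parquet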
Finally, the Parquet files written to HDFS are loaded into the Hive partitioned external table by registering the new partition (or repairing the table's partition metadata), after which the data is queryable from Hive.
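Since hive-jdbc is already a dependency, this last step can be driven from the same Java job. The sketch below assumes an external table ods_api_data created with eleven string columns, PARTITIONED BY (dt string), STORED AS PARQUET, and a LOCATION matching FS_PATH/ods_api_data; the JDBC URL, credentials, and all names are placeholders:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class HivePartitionLoader {

    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        // Placeholder HiveServer2 connection details; adjust to your environment.
        String url = "jdbc:hive2://hiveserver2-host:10000/ods";
        try (Connection conn = DriverManager.getConnection(url, "hive", "");
             Statement stmt = conn.createStatement()) {
            // Register the directory written by the Parquet job as a partition of the
            // external table; IF NOT EXISTS keeps the statement idempotent on re-runs.
            stmt.execute("ALTER TABLE ods_api_data ADD IF NOT EXISTS PARTITION (dt='20230101') "
                    + "LOCATION '/user/hive/warehouse/ods.db/ods_api_data/dt=20230101'");
        }
    }
}

If the partition directories already follow the dt=... naming convention under the table's LOCATION, running MSCK REPAIR TABLE ods_api_data instead will discover all new partitions in one pass.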