Hudi Integration with MinIO
I. Components used
- Hadoop-aliyun:3.2.1.jar
- minio:7.0.2.jar
II. Writing data to OSS with Hudi
1. Introduction
The low cost of cloud object storage has led many companies to adopt it as their primary storage layer, so object storage support is essential for a data lake solution like Hudi. AWS EMR already ships with Hudi built in, which means Hudi can be used seamlessly on S3. Users in China, however, more often use Alibaba Cloud OSS as their cloud storage, so can Hudi build a data lake on OSS? The PR adding OSS support has been merged into Hudi's master branch, so you can either build from master now or wait for the next release; after some simple configuration, data can be written to OSS.
2. Configuration
2.1 pom dependencies
The main additional pom dependencies that need to be added are as follows:
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-aliyun</artifactId>
<version>3.2.1</version>
</dependency>
<dependency>
<groupId>com.aliyun.oss</groupId>
<artifactId>aliyun-sdk-oss</artifactId>
<version>3.8.1</version>
</dependency>
2.2 core-site.xml configuration
To access OSS, core-site.xml needs to be modified; the key settings are as follows:
<property>
<name>fs.defaultFS</name>
<value>oss://bucketname/</value>
</property>
<property>
<name>fs.oss.endpoint</name>
<value>oss-endpoint-address</value>
<description>Aliyun OSS endpoint to connect to.</description>
</property>
<property>
<name>fs.oss.accessKeyId</name>
<value>oss_key</value>
<description>Aliyun access key ID</description>
</property>
<property>
<name>fs.oss.accessKeySecret</name>
<value>oss-secret</value>
<description>Aliyun access key secret</description>
</property>
<property>
<name>fs.oss.impl</name>
<value>org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem</value>
</property>
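If editing core-site.xml is not convenient (for example in a local test), the same fs.oss.* keys shown above can also be set programmatically on the Hadoop configuration used by Spark. A minimal sketch with the same placeholder values; the class name is made up for illustration:
import org.apache.spark.sql.SparkSession;

public class OssConfSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("oss-conf-sketch")
                .master("local[2]")
                .getOrCreate();

        // Same keys as in core-site.xml above; the values here are placeholders.
        org.apache.hadoop.conf.Configuration hadoopConf =
                spark.sparkContext().hadoopConfiguration();
        hadoopConf.set("fs.oss.endpoint", "oss-endpoint-address");
        hadoopConf.set("fs.oss.accessKeyId", "oss_key");
        hadoopConf.set("fs.oss.accessKeySecret", "oss-secret");
        hadoopConf.set("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem");

        // With these set, paths such as oss://bucketname/... resolve through the OSS connector.
        spark.stop();
    }
}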
3. Example code
The example source code is as follows:
import org.apache.hudi.QuickstartUtils.*;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.io.IOException;
import java.util.List;
import static org.apache.hudi.QuickstartUtils.convertToStringList;
import static org.apache.hudi.QuickstartUtils.getQuickstartWriteConfigs;
import static org.apache.hudi.config.HoodieWriteConfig.TABLE_NAME;
import static org.apache.spark.sql.SaveMode.Overwrite;
public class OssHudiDemo {
public static void main(String[] args) throws IOException {
SparkSession spark = SparkSession.builder().appName("Hoodie Datasource test")
.master("local[2]")
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.config("spark.io.compression.codec", "snappy")
.config("spark.sql.hive.convertMetastoreParquet", "false")
.getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
String tableName = "hudi_trips_cow";
String basePath = "/tmp/hudi_trips_cow"; // resolves under oss://bucketname/ because fs.defaultFS points at OSS
DataGenerator dataGen = new DataGenerator();
List<String> inserts = convertToStringList(dataGen.generateInserts(10));
Dataset<Row> df = spark.read().json(jsc.parallelize(inserts, 2));
df.write().format("org.apache.hudi").
options(getQuickstartWriteConfigs()).
option(TABLE_NAME, tableName).
mode(Overwrite).
save(basePath);
Dataset<Row> roViewDF = spark.read().format("org.apache.hudi").load(basePath + "/*/*/*");
roViewDF.createOrReplaceTempView("hudi_ro_table");
spark.sql("select * from hudi_ro_table").show(false);
spark.stop();
}
}
The data is first written to OSS; listing the OSS bucket shows that the files have been written successfully, and the written data is then read back and queried through Spark.
III. Writing data to MinIO
1. pom.xml dependencies
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
<scope>test</scope>
</dependency>
<!--MINIO-->
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>8.3.4</version>
</dependency>
<dependency>
<groupId>com.squareup.okhttp3</groupId>
<artifactId>okhttp</artifactId>
<version>4.9.3</version>
</dependency>
2. Uploading data to MinIO
// minioClient, endpoint, accessKey, secretKey and bucket are fields of the test class;
// endpoint points at the MinIO service (for example http://127.0.0.1:9000 in a local setup).
public void init(){
minioClient = MinioClient.builder().endpoint(endpoint).credentials(accessKey, secretKey).build();
}
@Test
public void upload(){
File file = new File("/Users/shenyunsese/Desktop/pic3.png");
String objectName="test/pic3.png";
try {
FileInputStream fileInputStream=new FileInputStream(file);
// upload the file as test/pic3.png; the object size comes from available(), part size -1 lets the SDK choose
minioClient.putObject(PutObjectArgs.builder().bucket(bucket)
.object(objectName)
.contentType("image/png")
.stream(fileInputStream, fileInputStream.available(), -1).build());
}catch (Exception e){
e.printStackTrace();
}
System.out.println("finished");
}
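The upload assumes the target bucket already exists. If it might not, a small guard can be added to init(); a sketch using the bucketExists/makeBucket calls of the MinIO Java SDK 8.x (io.minio.BucketExistsArgs and io.minio.MakeBucketArgs), with the same bucket field as above:
try {
    // Create the bucket on first use so the upload test does not fail with NoSuchBucket.
    boolean found = minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucket).build());
    if (!found) {
        minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucket).build());
    }
} catch (Exception e) {
    e.printStackTrace();
}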
3. Downloading data from MinIO
@Test
public void download(){
String objectName="test/pic3.png";
String fileName="/Users/shenyunsese/Desktop/download2.png";
try {
// statObject throws an exception if the object does not exist, so reaching downloadObject means it is present
StatObjectResponse response = minioClient.statObject(
StatObjectArgs.builder().bucket(bucket).object(objectName).build()
);
if (response != null) {
minioClient.downloadObject(DownloadObjectArgs.builder()
.bucket(bucket)
.object(objectName)
.filename(fileName)
.build());
}
}catch (Exception e){
e.printStackTrace();
}
System.out.println("finished");
}
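The MinIO client above only verifies that objects can be read and written. For Hudi itself to write to MinIO, the usual route is Hadoop's S3A connector (hadoop-aws plus the matching AWS SDK bundle on the classpath), pointed at the MinIO endpoint, by analogy with the OSS configuration in section II. A minimal sketch; the endpoint, credentials and class name are placeholders, and the jar versions must match your Hadoop/Spark build:
import org.apache.spark.sql.SparkSession;

public class MinioS3aConfSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("minio-s3a-conf-sketch")
                .master("local[2]")
                .getOrCreate();

        org.apache.hadoop.conf.Configuration hadoopConf =
                spark.sparkContext().hadoopConfiguration();
        // Placeholder values: point S3A at the MinIO endpoint instead of AWS.
        hadoopConf.set("fs.s3a.endpoint", "http://127.0.0.1:9000");
        hadoopConf.set("fs.s3a.access.key", "minio-access-key");
        hadoopConf.set("fs.s3a.secret.key", "minio-secret-key");
        // MinIO buckets are usually addressed by path style rather than virtual-host style.
        hadoopConf.set("fs.s3a.path.style.access", "true");
        hadoopConf.set("fs.s3a.connection.ssl.enabled", "false");
        hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem");

        // A Hudi write would then use a base path such as s3a://<bucket>/hudi_trips_cow,
        // with the same write options as the OSS example in section II.
        spark.stop();
    }
}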
IV. Source code analysis
4.1 Hadoop-aliyun:3.2.1.jar
1. AliyunCredentialsProvider
package org.apache.hadoop.fs.aliyun.oss;
import com.aliyun.oss.common.auth.Credentials;
import com.aliyun.oss.common.auth.CredentialsProvider;
import com.aliyun.oss.common.auth.DefaultCredentials;
import com.aliyun.oss.common.auth.InvalidCredentialsException;
import java.io.IOException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
// Aliyun OSS credential provider: reads the key pair from the configuration and validates it
public class AliyunCredentialsProvider implements CredentialsProvider {
private Credentials credentials = null;
public AliyunCredentialsProvider(Configuration conf) throws IOException {
String accessKeyId;
String accessKeySecret;
try {
// read accessKeyId from the configuration (core-site.xml)
accessKeyId = AliyunOSSUtils.getValueWithKey(conf, "fs.oss.accessKeyId");
// read accessKeySecret from the configuration
accessKeySecret = AliyunOSSUtils.getValueWithKey(conf, "fs.oss.accessKeySecret");
} catch (IOException var7) {
throw new InvalidCredentialsException(var7);
}
String securityToken;
try {
// read the optional securityToken from the configuration
securityToken = AliyunOSSUtils.getValueWithKey(conf, "fs.oss.securityToken");
} catch (IOException var6) {
securityToken = null;
}
if (!StringUtils.isEmpty(accessKeyId) && !StringUtils.isEmpty(accessKeySecret)) {
if (StringUtils.isNotEmpty(securityToken)) {
// build the default credentials object (including the STS security token)
this.credentials = new DefaultCredentials(accessKeyId, accessKeySecret, securityToken);
} else {
this.credentials = new DefaultCredentials(accessKeyId, accessKeySecret);
}
} else {
throw new InvalidCredentialsException("AccessKeyId and AccessKeySecret should not be null or empty.");
}
}
public void setCredentials(Credentials creds) {
if (creds == null) {
throw new InvalidCredentialsException("Credentials should not be null.");
} else {
this.credentials = creds;
}
}
public Credentials getCredentials() {
if (this.credentials == null) {
throw new InvalidCredentialsException("Invalid credentials");
} else {
return this.credentials;
}
}
}
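AliyunCredentialsProvider simply reads the fs.oss.accessKeyId / fs.oss.accessKeySecret pair (plus an optional security token) from the Hadoop Configuration and wraps it in DefaultCredentials. If credentials must come from somewhere other than core-site.xml, a replacement can implement the same CredentialsProvider interface. A minimal hypothetical sketch; the class name and environment-variable names are invented for illustration, and you should check how your hadoop-aliyun version discovers custom providers before relying on this:
import com.aliyun.oss.common.auth.Credentials;
import com.aliyun.oss.common.auth.CredentialsProvider;
import com.aliyun.oss.common.auth.DefaultCredentials;
import com.aliyun.oss.common.auth.InvalidCredentialsException;

// Hypothetical provider that takes the OSS key pair from environment variables
// instead of core-site.xml; it mirrors the structure of AliyunCredentialsProvider above.
public class EnvCredentialsProvider implements CredentialsProvider {
    private Credentials credentials;

    public EnvCredentialsProvider() {
        String accessKeyId = System.getenv("OSS_ACCESS_KEY_ID");         // assumed variable name
        String accessKeySecret = System.getenv("OSS_ACCESS_KEY_SECRET"); // assumed variable name
        if (accessKeyId == null || accessKeySecret == null) {
            throw new InvalidCredentialsException("OSS key pair not found in the environment.");
        }
        this.credentials = new DefaultCredentials(accessKeyId, accessKeySecret);
    }

    @Override
    public void setCredentials(Credentials creds) {
        if (creds == null) {
            throw new InvalidCredentialsException("Credentials should not be null.");
        }
        this.credentials = creds;
    }

    @Override
    public Credentials getCredentials() {
        return this.credentials;
    }
}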
2. AliyunOSSBlockOutputStream
package org.apache.hadoop.fs.aliyun.oss;
import com.aliyun.oss.model.PartETag;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import org.apache.hadoop.conf.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// Aliyun OSS block output stream: buffers writes into local block files and uploads them, using multipart upload when more than one block is produced
public class AliyunOSSBlockOutputStream extends OutputStream {
private static final Logger LOG = LoggerFactory.getLogger(AliyunOSSBlockOutputStream.class);
private AliyunOSSFileSystemStore store;
private Configuration conf;
private boolean closed;
private String key;
// current local block file
private File blockFile;
private Map<Integer, File> blockFiles = new HashMap();
private long blockSize;
private int blockId = 0;
private long blockWritten = 0L;
private String uploadId = null;
private final List<ListenableFuture<PartETag>> partETagsFutures;
private final ListeningExecutorService executorService;
// output stream backing the current block file
private OutputStream blockStream;
private final byte[] singleByte = new byte[1];
public AliyunOSSBlockOutputStream(Configuration conf, AliyunOSSFileSystemStore store, String key, Long blockSize, ExecutorService executorService) throws IOException {
this.store = store;
this.conf = conf;
this.key = key;
this.blockSize = blockSize;
this.blockFile = this.newBlockFile();
this.blockStream = new BufferedOutputStream(new FileOutputStream(this.blockFile));
this.partETagsFutures = new ArrayList(2);
this.executorService = MoreExecutors.listeningDecorator(executorService);
}
private File newBlockFile() throws IOException {
// create a local temporary file for the next block via AliyunOSSUtils
return AliyunOSSUtils.createTmpFileForWrite(String.format("oss-block-%04d-", this.blockId), this.blockSize, this.conf);
}
public synchronized void flush() throws IOException {
this.blockStream.flush();
}
public synchronized void close() throws IOException {
if (!this.closed) {
this.blockStream.flush();
this.blockStream.close();
// if the current block file has not been registered yet, add it to the map under the next block id
if (!this.blockFiles.values().contains(this.blockFile)) {
++this.blockId;
this.blockFiles.put(this.blockId, this.blockFile);
}
try {
if (this.blockFiles.size() == 1) {
// only one block file: upload the object directly, without a multipart upload
this.store.uploadObject(this.key, this.blockFile);
} else {
if (this.blockWritten > 0L) {
// Guava's ListenableFuture makes it easy to detect when the upload future completes and to attach callbacks, which keeps the concurrent code simple
ListenableFuture<PartETag> partETagFuture = this.executorService.submit(() -> {
// upload the last part
PartETag partETag = this.store.uploadPart(this.blockFile, this.key, this.uploadId, this.blockId);
return partETag;
});
// add the future to the list of pending part uploads
this.partETagsFutures.add(partETagFuture);
}
// wait for all parts to finish uploading
List<PartETag> partETags = this.waitForAllPartUploads();
if (null == partETags) {
throw new IOException("Failed to multipart upload to oss, abort it.");
}
// complete the multipart upload
this.store.completeMultipartUpload(this.key, this.uploadId, new ArrayList(partETags));
}
} finally {
// remove the local temporary files
this.removeTemporaryFiles();
this.closed = true;
}
}
}
// write a single byte (reusable)
public synchronized void write(int b) throws IOException {
this.singleByte[0] = (byte)b;
this.write(this.singleByte, 0, 1);
}
// write a byte range (reusable); once blockSize bytes are buffered, the current part is uploaded
public synchronized void write(byte[] b, int off, int len) throws IOException {
if (this.closed) {
throw new IOException("Stream closed.");
} else {
this.blockStream.write(b, off, len);
this.blockWritten += (long)len;
if (this.blockWritten >= this.blockSize) {
this.uploadCurrentPart();
this.blockWritten = 0L;
}
}
}
// remove all local temporary block files (reusable)
private void removeTemporaryFiles() {
Iterator var1 = this.blockFiles.values().iterator();
while(var1.hasNext()) {
File file = (File)var1.next();
if (file != null && file.exists() && !file.delete()) {
LOG.warn("Failed to delete temporary file {}", file);
}
}
}
// remove block files whose part uploads have already completed (reusable)
private void removePartFiles() throws IOException {
Iterator var1 = this.partETagsFutures.iterator();
while(var1.hasNext()) {
ListenableFuture<PartETag> partETagFuture = (ListenableFuture)var1.next();
if (partETagFuture.isDone()) {
try {
File blockFile = (File)this.blockFiles.get(((PartETag)partETagFuture.get()).getPartNumber());
if (blockFile != null && blockFile.exists() && !blockFile.delete()) {
LOG.warn("Failed to delete temporary file {}", blockFile);
}
} catch (ExecutionException | InterruptedException var4) {
throw new IOException(var4);
}
}
}
}
// upload the current block file as one part and start a new block (reusable)
private void uploadCurrentPart() throws IOException {
this.blockStream.flush();
this.blockStream.close();
if (this.blockId == 0) {
this.uploadId = this.store.getUploadId(this.key);
}
++this.blockId;
this.blockFiles.put(this.blockId, this.blockFile);
File currentFile = this.blockFile;
int currentBlockId = this.blockId;
ListenableFuture<PartETag> partETagFuture = this.executorService.submit(() -> {
PartETag partETag = this.store.uploadPart(currentFile, this.key, this.uploadId, currentBlockId);
return partETag;
});
this.partETagsFutures.add(partETagFuture);
this.removePartFiles();
this.blockFile = this.newBlockFile();
this.blockStream = new BufferedOutputStream(new FileOutputStream(this.blockFile));
}
// wait for all part uploads to finish (reusable)
private List<PartETag> waitForAllPartUploads() throws IOException {
LOG.debug("Waiting for {} uploads to complete", this.partETagsFutures.size());
try {
return (List)Futures.allAsList(this.partETagsFutures).get();
} catch (InterruptedException var4) {
LOG.warn("Interrupted partUpload", var4);
Thread.currentThread().interrupt();
return null;
} catch (ExecutionException var5) {
LOG.debug("While waiting for upload completion", var5);
LOG.debug("Cancelling futures");
Iterator var2 = this.partETagsFutures.iterator();
while(var2.hasNext()) {
ListenableFuture<PartETag> future = (ListenableFuture)var2.next();
future.cancel(true);
}
this.store.abortMultipartUpload(this.key, this.uploadId);
throw new IOException("Multi-part upload with id '" + this.uploadId + "' to " + this.key, var5);
}
}
}
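AliyunOSSBlockOutputStream is not normally constructed by hand; it sits behind the Hadoop FileSystem API, which is also the path Hudi takes when it writes to OSS. A rough sketch of that write path, assuming the fs.oss.* keys from section 2.2 are already on the configuration (the class name, path and payload are placeholders):
import java.net.URI;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OssWriteSketch {
    public static void main(String[] args) throws Exception {
        // Picks up core-site.xml; the oss:// scheme resolves to AliyunOSSFileSystem.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("oss://bucketname/"), conf);

        // create() hands back an FSDataOutputStream backed by AliyunOSSBlockOutputStream:
        // bytes are buffered into local block files, a part is uploaded whenever the block
        // size is reached, and close() finishes the (multipart) upload.
        try (FSDataOutputStream out = fs.create(new Path("oss://bucketname/tmp/demo.txt"))) {
            out.write("hello oss".getBytes(StandardCharsets.UTF_8));
        }
    }
}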
3. AliyunOSSCopyFileContext
package org.apache.hadoop.fs.aliyun.oss;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
// Shared context for concurrent OSS copy tasks: a lock and condition used to wait until all copies have finished
public class AliyunOSSCopyFileContext {
private final ReentrantLock lock = new ReentrantLock();
private Condition readyCondition;
private boolean copyFailure;
private int copiesFinish;
public AliyunOSSCopyFileContext() {
this.readyCondition = this.lock.newCondition();
this.copyFailure = false;
this.copiesFinish = 0;
}
public void lock() {
this.lock.lock();
}
public void unlock() {
this.lock.unlock();
}
public void awaitAllFinish(int copiesToFinish) throws InterruptedException {
while(this.copiesFinish != copiesToFinish) {
this.readyCondition.await();
}
}
public void signalAll() {
this.readyCondition.signalAll();
}
public boolean isCopyFailure() {
return this.copyFailure;
}
public void setCopyFailure(boolean copyFailure) {
this.copyFailure = copyFailure;
}
public void incCopiesFinish() {
++this.copiesFinish;
}
}
4. AliyunOSSCopyFileTask
package org.apache.hadoop.fs.aliyun.oss;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// Copy task: copies a single object from srcKey to dstKey and reports completion through the shared context
public class AliyunOSSCopyFileTask implements Runnable {
public static final Logger LOG = LoggerFactory.getLogger(AliyunOSSCopyFileTask.class);
private AliyunOSSFileSystemStore store;
private String srcKey;
private long srcLen;
private String dstKey;
private AliyunOSSCopyFileContext copyFileContext;
public AliyunOSSCopyFileTask(AliyunOSSFileSystemStore store, String srcKey, long srcLen, String dstKey, AliyunOSSCopyFileContext copyFileContext) {
this.store = store;
this.srcKey = srcKey;
this.srcLen = srcLen;
this.dstKey = dstKey;
this.copyFileContext = copyFileContext;
}
public void run() {
boolean fail = false;
try {
fail = !this.store.copyFile(this.srcKey, this.srcLen, this.dstKey);
} catch (Exception var6) {
LOG.warn("Exception thrown when copy from " + this.srcKey + " to " + this.dstKey + ", exception: " + var6);
fail = true;
} finally {
this.copyFileContext.lock();
if (fail) {
this.copyFileContext.setCopyFailure(fail);
}
this.copyFileContext.incCopiesFinish();
this.copyFileContext.signalAll();
this.copyFileContext.unlock();
}
}
}
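AliyunOSSCopyFileContext and AliyunOSSCopyFileTask are meant to be used together: the caller submits one task per object to copy and then blocks on the context until every task has signalled completion. A sketch of that coordination pattern, built only from the methods shown above; the store, the executor and the source/destination naming are assumed to be supplied by the caller:
// Sketch: copy a batch of objects concurrently and wait until every copy has reported in.
static boolean copyAll(AliyunOSSFileSystemStore store,
                       java.util.concurrent.ExecutorService executor,
                       java.util.Map<String, Long> srcKeysWithLength,
                       String dstPrefix) throws InterruptedException {
    AliyunOSSCopyFileContext context = new AliyunOSSCopyFileContext();
    int copies = 0;
    for (java.util.Map.Entry<String, Long> entry : srcKeysWithLength.entrySet()) {
        executor.execute(new AliyunOSSCopyFileTask(
                store, entry.getKey(), entry.getValue(), dstPrefix + entry.getKey(), context));
        copies++;
    }
    context.lock();
    try {
        // Each task increments copiesFinish and calls signalAll() in its finally block,
        // so awaitAllFinish() returns once all submitted copies are done.
        context.awaitAllFinish(copies);
    } finally {
        context.unlock();
    }
    return !context.isCopyFailure();
}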
5. AliyunOSSFileReaderTask
package org.apache.hadoop.fs.aliyun.oss;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.aliyun.oss.ReadBuffer.STATUS;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// Read task: fetches the byte range described by a ReadBuffer from OSS, retrying transient failures
public class AliyunOSSFileReaderTask implements Runnable {
public static final Logger LOG = LoggerFactory.getLogger(AliyunOSSFileReaderTask.class);
private String key;
private AliyunOSSFileSystemStore store;
private ReadBuffer readBuffer;
private static final int MAX_RETRIES = 3;
private RetryPolicy retryPolicy;
public AliyunOSSFileReaderTask(String key, AliyunOSSFileSystemStore store, ReadBuffer readBuffer) {
this.key = key;
this.store = store;
this.readBuffer = readBuffer;
RetryPolicy defaultPolicy = RetryPolicies.retryUpToMaximumCountWithFixedSleep(3, 3L, TimeUnit.SECONDS);
Map<Class<? extends Exception>, RetryPolicy> policies = new HashMap();
policies.put(IOException.class, defaultPolicy);
policies.put(IndexOutOfBoundsException.class, RetryPolicies.TRY_ONCE_THEN_FAIL);
policies.put(NullPointerException.class, RetryPolicies.TRY_ONCE_THEN_FAIL);
this.retryPolicy = RetryPolicies.retryByException(defaultPolicy, policies);
}
public void run() {
int retries = 0;
this.readBuffer.lock();
try {
while(true) {
try {
InputStream in = this.store.retrieve(this.key, this.readBuffer.getByteStart(), this.readBuffer.getByteEnd());
Throwable var29 = null;
try {
IOUtils.readFully(in, this.readBuffer.getBuffer(), 0, this.readBuffer.getBuffer().length);
this.readBuffer.setStatus(STATUS.SUCCESS);
break;
} catch (Throwable var22) {
var29 = var22;
throw var22;
} finally {
if (in != null) {
if (var29 != null) {
try {
in.close();
} catch (Throwable var23) {
var29.addSuppressed(var23);
}
} else {
in.close();
}
}
}
} catch (Exception var26) {
Exception e = var26;
LOG.warn("Exception thrown when retrieve key: " + this.key + ", exception: " + var26);
try {
RetryPolicy.RetryAction rc = this.retryPolicy.shouldRetry(e, retries++, 0, true);
if (rc.action != RetryDecision.RETRY) {
break;
}
Thread.sleep(rc.delayMillis);
} catch (Exception var25) {
LOG.warn("Exception thrown when call shouldRetry, exception " + var25);
break;
}
}
}
if (this.readBuffer.getStatus() != STATUS.SUCCESS) {
this.readBuffer.setStatus(STATUS.ERROR);
}
this.readBuffer.signalAll();
} finally {
this.readBuffer.unlock();
}
}
}
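The retry handling in run() is built from Hadoop's generic org.apache.hadoop.io.retry utilities rather than anything OSS-specific, so the same pattern can be reused elsewhere. A condensed sketch of the policy composition and retry loop used above; the class name and the flaky operation are placeholders:
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.io.retry.RetryPolicies;
import org.apache.hadoop.io.retry.RetryPolicy;
import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision;

public class RetrySketch {
    public static void main(String[] args) throws Exception {
        // Same composition as AliyunOSSFileReaderTask: retry IOExceptions up to 3 times,
        // 3 seconds apart, but fail immediately on programming errors.
        RetryPolicy defaultPolicy =
                RetryPolicies.retryUpToMaximumCountWithFixedSleep(3, 3L, TimeUnit.SECONDS);
        Map<Class<? extends Exception>, RetryPolicy> policies = new HashMap<>();
        policies.put(IOException.class, defaultPolicy);
        policies.put(NullPointerException.class, RetryPolicies.TRY_ONCE_THEN_FAIL);
        RetryPolicy retryPolicy = RetryPolicies.retryByException(defaultPolicy, policies);

        int retries = 0;
        while (true) {
            try {
                flakyOperation();   // placeholder for the real I/O call
                break;
            } catch (Exception e) {
                RetryPolicy.RetryAction rc = retryPolicy.shouldRetry(e, retries++, 0, true);
                if (rc.action != RetryDecision.RETRY) {
                    throw e;
                }
                Thread.sleep(rc.delayMillis);
            }
        }
    }

    private static void flakyOperation() throws IOException {
        // placeholder; a real caller would perform the OSS read here
    }
}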