Hudi Integration with Spark: the Code Approach

Environment Setup

Create a Maven project with the following pom.xml. Note that the Spark, Hadoop, and Hudi dependencies use provided scope for cluster submission; to run the demos below locally from the IDE, make sure provided-scope dependencies are included on the run classpath.

<properties>
    <scala.version>2.12.10</scala.version>
    <scala.binary.version>2.12</scala.binary.version>
    <spark.version>3.2.2</spark.version>
    <hadoop.version>3.1.3</hadoop.version>
    <hudi.version>0.12.0</hudi.version>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
</properties>

<dependencies>
    <!-- Scala language dependency -->
    <dependency>
        <groupId>org.scala-lang</groupId>
        <artifactId>scala-library</artifactId>
        <version>${scala.version}</version>
    </dependency>
    <!-- Spark Core dependency -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>
    <!-- Spark SQL dependency -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>

    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_${scala.binary.version}</artifactId>
        <version>${spark.version}</version>
        <scope>provided</scope>
    </dependency>

    <!-- Hadoop Client dependency -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
        <scope>provided</scope>
    </dependency>

    <!-- hudi-spark3.2 -->
    <dependency>
        <groupId>org.apache.hudi</groupId>
        <artifactId>hudi-spark3.2-bundle_${scala.binary.version}</artifactId>
        <version>${hudi.version}</version>
        <scope>provided</scope>
    </dependency>

    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>2.4.1</version>
    </dependency>

    <!-- fastjson <= 1.2.80 has known security vulnerabilities, so use 1.2.83 -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.83</version>
    </dependency>

</dependencies>

<build>
    <plugins>
        <!-- Assembly plugin for building a jar with dependencies -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
            <configuration>
                <archive>
                    <manifest>
                    </manifest>
                </archive>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
        </plugin>

        <!-- Plugin needed for Maven to compile Scala sources -->
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

Insert Data

import org.apache.hudi.QuickstartUtils._
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

object InsertDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
        .setAppName(this.getClass.getSimpleName)
        .setMaster("local[*]")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate()

        val tableName = "hudi_trips_cow"
        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"
        val dataGen = new DataGenerator

        val inserts = convertToStringList(dataGen.generateInserts(10))
        val df = sparkSession.read.json(sparkSession.sparkContext.parallelize(inserts, 2))
        df.write.format("hudi").
        options(getQuickstartWriteConfigs).
        option(PRECOMBINE_FIELD.key(), "ts").
        option(RECORDKEY_FIELD.key(), "uuid").
        option(PARTITIONPATH_FIELD.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        mode(Overwrite).
        save(basePath)

        // Application finished; release resources
        sparkSession.stop()
    }
}

Query Data

import org.apache.spark.SparkConf
import org.apache.spark.sql._

object QueryDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
            .setAppName(this.getClass.getSimpleName)
            //      .setMaster("local[*]")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
            .config(sparkConf)
            .enableHiveSupport()
            .getOrCreate()

        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"

        val tripsSnapshotDF = sparkSession.
            read.
            format("hudi").
            load(basePath)

        //    Time-travel query, style 1
        //    sparkSession.read.
        //      format("hudi").
        //      option("as.of.instant", "20210728141108100").
        //      load(basePath)
        //
        //    Time-travel query, style 2
        //    sparkSession.read.
        //      format("hudi").
        //      option("as.of.instant", "2021-07-28 14:11:08.200").
        //      load(basePath)
        //
        //    Time-travel query, style 3: equivalent to "as.of.instant = 2021-07-28 00:00:00"
        //    sparkSession.read.
        //      format("hudi").
        //      option("as.of.instant", "2021-07-28").
        //      load(basePath)

        tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")

        sparkSession
            .sql("select fare, begin_lon, begin_lat, ts from  hudi_trips_snapshot where fare > 20.0")
            .show()
    }
}
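
The commented-out reads above show Hudi's time-travel syntax. A minimal standalone sketch of such a query is given below; the instant value is a placeholder and must be replaced with a real commit time from your table's timeline (for example a value from the _hoodie_commit_time column):

import org.apache.spark.SparkConf
import org.apache.spark.sql._

object TimeTravelQueryDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
            .setAppName(this.getClass.getSimpleName)
            .setMaster("local[*]")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
            .config(sparkConf)
            .getOrCreate()

        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"

        // Read the table as of the given instant (placeholder value, replace with a real commit time)
        sparkSession.read.
            format("hudi").
            option("as.of.instant", "20220728141108100").
            load(basePath).
            createOrReplaceTempView("hudi_trips_time_travel")

        sparkSession
            .sql("select fare, begin_lon, begin_lat, ts from hudi_trips_time_travel where fare > 20.0")
            .show()

        sparkSession.stop()
    }
}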

Update Data

import org.apache.hudi.QuickstartUtils._
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

object UpdateDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
        .setAppName(this.getClass.getSimpleName)
        .setMaster("local[*]")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate()

        val tableName = "hudi_trips_cow"
        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"

        val dataGen = new DataGenerator
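        // Note: generateUpdates only produces updates for keys created earlier by
        // generateInserts on this same DataGenerator instance, so a standalone run of
        // this demo also needs the insert step (see the sketch after this demo).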
        val updates = convertToStringList(dataGen.generateUpdates(10))
        val df = sparkSession.read.json(sparkSession.sparkContext.parallelize(updates, 2))
        df.write.format("hudi").
        options(getQuickstartWriteConfigs).
        option(PRECOMBINE_FIELD.key(), "ts").
        option(RECORDKEY_FIELD.key(), "uuid").
        option(PARTITIONPATH_FIELD.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        mode(Append).
        save(basePath)


        //    val tripsSnapshotDF = sparkSession.
        //      read.
        //      format("hudi").
        //      load(basePath)
        //    tripsSnapshotDF.createOrReplaceTempView("hudi_trips_snapshot")
        //
        //    sparkSession
        //      .sql("select fare, begin_lon, begin_lat, ts from  hudi_trips_snapshot where fare > 20.0")
        //      .show()
    }
}
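
Because the quickstart DataGenerator can only generate updates for keys it has itself produced via generateInserts, running the update with a brand-new DataGenerator instance will fail. A minimal sketch that performs the insert and the update with the same generator in one program, reusing the table name, path, and write options of the demos above:

import org.apache.hudi.QuickstartUtils._
import org.apache.spark.SparkConf
import org.apache.spark.sql._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

object InsertThenUpdateDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
            .setAppName(this.getClass.getSimpleName)
            .setMaster("local[*]")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
            .config(sparkConf)
            .getOrCreate()

        val tableName = "hudi_trips_cow"
        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"
        val dataGen = new DataGenerator

        // Step 1: insert, so that the generator has existing keys to update
        val inserts = convertToStringList(dataGen.generateInserts(10))
        sparkSession.read.json(sparkSession.sparkContext.parallelize(inserts, 2)).
            write.format("hudi").
            options(getQuickstartWriteConfigs).
            option(PRECOMBINE_FIELD.key(), "ts").
            option(RECORDKEY_FIELD.key(), "uuid").
            option(PARTITIONPATH_FIELD.key(), "partitionpath").
            option(TBL_NAME.key(), tableName).
            mode(Overwrite).
            save(basePath)

        // Step 2: update a random subset of the keys inserted above
        val updates = convertToStringList(dataGen.generateUpdates(10))
        sparkSession.read.json(sparkSession.sparkContext.parallelize(updates, 2)).
            write.format("hudi").
            options(getQuickstartWriteConfigs).
            option(PRECOMBINE_FIELD.key(), "ts").
            option(RECORDKEY_FIELD.key(), "uuid").
            option(PARTITIONPATH_FIELD.key(), "partitionpath").
            option(TBL_NAME.key(), tableName).
            mode(Append).
            save(basePath)

        sparkSession.stop()
    }
}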

Point-in-Time Query

import org.apache.hudi.DataSourceReadOptions._
import org.apache.spark.SparkConf
import org.apache.spark.sql._

object PointInTimeQueryDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
        .setAppName(this.getClass.getSimpleName)
        .setMaster("local[*]")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate()

        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"

        import sparkSession.implicits._
        // Register a snapshot view first so that the commit-time lookup below has a table to query
        sparkSession.read.format("hudi").load(basePath).createOrReplaceTempView("hudi_trips_snapshot")
        val commits = sparkSession.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(50)
        val beginTime = "000" // "000" stands for the earliest possible commit time
        val endTime = commits(commits.length - 2)

        val tripsIncrementalDF = sparkSession.read.format("hudi").
        option(QUERY_TYPE.key(), QUERY_TYPE_INCREMENTAL_OPT_VAL).
        option(BEGIN_INSTANTTIME.key(), beginTime).
        option(END_INSTANTTIME.key(), endTime).
        load(basePath)

        tripsIncrementalDF.createOrReplaceTempView("hudi_trips_point_in_time")

        sparkSession.
        sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from hudi_trips_point_in_time where fare > 20.0").
        show()
    }
}

Incremental Query

import org.apache.hudi.DataSourceReadOptions._
import org.apache.spark.SparkConf
import org.apache.spark.sql._
 
object IncrementalQueryDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
            .setAppName(this.getClass.getSimpleName)
            .setMaster("local[*]")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
            .config(sparkConf)
            .enableHiveSupport()
            .getOrCreate()
 
        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"
 
        import sparkSession.implicits._
        // Register a snapshot view first so that the commit-time lookup below has a table to query
        sparkSession.read.format("hudi").load(basePath).createOrReplaceTempView("hudi_trips_snapshot")
        val commits = sparkSession.sql("select distinct(_hoodie_commit_time) as commitTime from hudi_trips_snapshot order by commitTime").map(k => k.getString(0)).take(50)
        val beginTime = commits(commits.length - 2) // incremental read starts after the second-to-last commit
 
        val tripsIncrementalDF = sparkSession.read.format("hudi").
            option(QUERY_TYPE.key(), QUERY_TYPE_INCREMENTAL_OPT_VAL).
            option(BEGIN_INSTANTTIME.key(), beginTime).
            load(basePath)
 
        tripsIncrementalDF.createOrReplaceTempView("hudi_trips_incremental")
 
        sparkSession.sql("select `_hoodie_commit_time`, fare, begin_lon, begin_lat, ts from  hudi_trips_incremental where fare > 20.0").show()
 
    }

}

Delete Data

import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.QuickstartUtils._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql._

import scala.collection.JavaConversions._

object DeleteDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
        .setAppName(this.getClass.getSimpleName)
        .setMaster("local[*]")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate()

        val tableName = "hudi_trips_cow"
        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"
        val dataGen = new DataGenerator

        sparkSession.
        read.
        format("hudi").
        load(basePath).
        createOrReplaceTempView("hudi_trips_snapshot")

        // Row count before the delete
        println(sparkSession.sql("select uuid, partitionpath from hudi_trips_snapshot").count())

        val ds = sparkSession.sql("select uuid, partitionpath from hudi_trips_snapshot").limit(2)

        val deletes = dataGen.generateDeletes(ds.collectAsList())
        val df = sparkSession.read.json(sparkSession.sparkContext.parallelize(deletes, 2))

        df.write.format("hudi").
        options(getQuickstartWriteConfigs).
        option(OPERATION.key(), "delete").
        option(PRECOMBINE_FIELD.key(), "ts").
        option(RECORDKEY_FIELD.key(), "uuid").
        option(PARTITIONPATH_FIELD.key(), "partitionpath").
        option(TBL_NAME.key(), tableName).
        mode(Append).
        save(basePath)

        val roAfterDeleteViewDF = sparkSession.
        read.
        format("hudi").
        load(basePath)

        roAfterDeleteViewDF.createOrReplaceTempView("hudi_trips_snapshot")

        // The total row count should now be 2 less than before
        println(sparkSession.sql("select uuid, partitionpath from hudi_trips_snapshot").count())
    }
}

Insert Overwrite

import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.QuickstartUtils._
import org.apache.hudi.config.HoodieWriteConfig._
import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql._
 
import scala.collection.JavaConversions._
 
object InsertOverwriteDemo {
    def main(args: Array[String]): Unit = {
        // Create SparkSession
        val sparkConf = new SparkConf()
            .setAppName(this.getClass.getSimpleName)
            .setMaster("local[*]")
            .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        val sparkSession = SparkSession.builder()
            .config(sparkConf)
            .enableHiveSupport()
            .getOrCreate()
 
        val tableName = "hudi_trips_cow"
        val basePath = "hdfs://hadoop1:8020/tmp/hudi_trips_cow"
        val dataGen = new DataGenerator
 
        sparkSession.
            read.format("hudi").
            load(basePath).
            select("uuid", "partitionpath").
            sort("partitionpath", "uuid").
            show(100, false)
 
 
        val inserts = convertToStringList(dataGen.generateInserts(10))
        val df = sparkSession.read.json(sparkSession.sparkContext.parallelize(inserts, 2)).
            filter("partitionpath = 'americas/united_states/san_francisco'")
 
        df.write.format("hudi").
            options(getQuickstartWriteConfigs).
            option(OPERATION.key(), "insert_overwrite").
            option(PRECOMBINE_FIELD.key(), "ts").
            option(RECORDKEY_FIELD.key(), "uuid").
            option(PARTITIONPATH_FIELD.key(), "partitionpath").
            option(TBL_NAME.key(), tableName).
            mode(Append).
            save(basePath)
 
        sparkSession.
            read.format("hudi").
            load(basePath).
            select("uuid", "partitionpath").
            sort("partitionpath", "uuid").
            show(100, false)
 
    }
}
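
The insert_overwrite operation above replaces only the partitions present in the incoming DataFrame. Hudi also provides an insert_overwrite_table operation that replaces the whole table; a sketch of that write, reusing df, tableName, and basePath exactly as defined in InsertOverwriteDemo:

        // Same write as above, but replacing the entire table instead of only
        // the partitions present in df
        df.write.format("hudi").
            options(getQuickstartWriteConfigs).
            option(OPERATION.key(), "insert_overwrite_table").
            option(PRECOMBINE_FIELD.key(), "ts").
            option(RECORDKEY_FIELD.key(), "uuid").
            option(PARTITIONPATH_FIELD.key(), "partitionpath").
            option(TBL_NAME.key(), tableName).
            mode(Append).
            save(basePath)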

Submit and Run

Package the code into a jar, upload it to a jar directory on the cluster (the command below uses /opt/jars), and run the submit command, using QueryDemo as an example. QueryDemo does not set a master in code, so specify one with spark-submit (for example --master yarn) or rely on spark-defaults.conf:

spark-submit \
    --class com.atguigu.hudi.spark.QueryDemo \
    /opt/jars/spark-hudi-demo-1.0-SNAPSHOT-jar-with-dependencies.jar