Integrating Spark with Hudi
- Spark versions supported by Hudi:

Hudi | Supported Spark versions |
---|---|
0.12.x | 3.3.x, 3.2.x, 3.1.x |
0.11.x | 3.2.x (default build), 3.1.x |
0.10.x | 3.1.x (default build), 3.0.x |
0.7.0-0.9.0 | 3.0.x |
0.6.0 and prior | Not supported |
- Copy the compiled Hudi bundle jar (for this build, e.g. hudi-spark3.1-bundle_2.12-0.12.0.jar) into Spark's jars directory. After compiling Hudi, the jar is produced under /opt/module/hudi-0.12.0/packaging/hudi-spark-bundle/target.
Reading a Hudi Table

Maven dependencies:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.atguigu.hudi</groupId>
    <artifactId>spark-hudi-demo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <scala.version>2.12.10</scala.version>
        <scala.binary.version>2.12</scala.binary.version>
        <spark.version>3.1.1</spark.version>
        <hadoop.version>3.1.3</hadoop.version>
        <hudi.version>0.12.0</hudi.version>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <!-- Scala standard library -->
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <!-- Spark Core -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- Spark SQL -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- Spark Hive support (needed for enableHiveSupport) -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- Hadoop client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- Hudi bundle for Spark 3.1; provided at runtime by the jar copied into Spark's jars directory -->
        <dependency>
            <groupId>org.apache.hudi</groupId>
            <artifactId>hudi-spark3.1-bundle_${scala.binary.version}</artifactId>
            <version>${hudi.version}</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Assembly plugin: package a fat jar with all non-provided dependencies -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.0</version>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
            </plugin>
            <!-- scala-maven-plugin: compile Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
```
Reading Hudi data:
```scala
import org.apache.spark.sql.SparkSession

object ReadHudiTable {
  def main(args: Array[String]): Unit = {
    // Act as the HDFS user "root" so the job can read the warehouse path
    System.setProperty("HADOOP_USER_NAME", "root")

    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("task")
      // Hudi requires Kryo serialization
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    // Base path of the Hudi table on HDFS
    val path = "/user/hive/warehouse/ods_ds_hudi.db/user_info"

    // Snapshot query: load the latest view of the table
    val hudi_table = spark.read.format("hudi").load(path)
    hudi_table.show()

    spark.close()
  }
}
```
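
Besides the default snapshot query above, the Hudi datasource also supports incremental queries, which return only records committed after a given instant. A minimal sketch, reusing `spark` and `path` from the example above; the begin instant `20220101000000` is a placeholder you would replace with a real commit timestamp from the table's timeline:

```scala
// Incremental query sketch: only records written after the given commit
// instant are returned. "20220101000000" is a placeholder instant; look up
// an actual commit time from the table's timeline and substitute it.
val incremental = spark.read.format("hudi")
  .option("hoodie.datasource.query.type", "incremental")
  .option("hoodie.datasource.read.begin.instanttime", "20220101000000")
  .load(path)
incremental.show()
```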
Writing to a Hudi Table
```scala
import org.apache.hudi.QuickstartUtils._
import org.apache.spark.sql._
import scala.collection.JavaConversions._
import org.apache.spark.sql.SaveMode._
import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.config.HoodieWriteConfig._

object WriteHudiTable {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder()
      .master("local[*]")
      .appName("task")
      // Hudi requires Kryo serialization
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()

    val tableName = "hudi_trips_cow"
    val basePath = "hdfs://hadoop102:8020/tmp/hudi_trips_cow"

    // Generate 10 sample trip records with Hudi's quickstart data generator
    val dataGen = new DataGenerator
    val inserts = convertToStringList(dataGen.generateInserts(10))
    val df = spark.read.json(spark.sparkContext.parallelize(inserts, 2))

    // Write the DataFrame as a Hudi (copy-on-write) table
    df.write.format("hudi")
      .options(getQuickstartWriteConfigs)
      .option(PRECOMBINE_FIELD.key(), "ts")               // field used to pick the latest record per key
      .option(RECORDKEY_FIELD.key(), "uuid")              // record key field
      .option(PARTITIONPATH_FIELD.key(), "partitionpath") // partition path field
      .option(TBL_NAME.key(), tableName)
      .mode(Overwrite)                                    // Overwrite recreates the table
      .save(basePath)

    println("Write complete!")
    spark.close()
  }
}
```
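
A natural follow-up to the insert above is an update: with the Hudi datasource, writing in Append mode performs an upsert by default, so rewritten keys keep only their latest values (chosen by the precombine field `ts`). A short sketch following the quickstart pattern, reusing `spark`, `dataGen`, `tableName`, and `basePath` from the example above:

```scala
// Upsert sketch, reusing spark, dataGen, tableName, and basePath from above.
// generateUpdates produces new versions of previously inserted keys, so it
// must run after generateInserts; Append mode triggers Hudi's default
// upsert operation rather than recreating the table.
val updates = convertToStringList(dataGen.generateUpdates(10))
val updateDf = spark.read.json(spark.sparkContext.parallelize(updates, 2))

updateDf.write.format("hudi")
  .options(getQuickstartWriteConfigs)
  .option(PRECOMBINE_FIELD.key(), "ts")
  .option(RECORDKEY_FIELD.key(), "uuid")
  .option(PARTITIONPATH_FIELD.key(), "partitionpath")
  .option(TBL_NAME.key(), tableName)
  .mode(Append) // Append = upsert into the existing table
  .save(basePath)
```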