Import the Maven coordinates
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-client</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.kudu</groupId>
    <artifactId>kudu-spark2_2.11</artifactId>
    <version>1.9.0</version>
</dependency>
<dependency>
    <groupId>org.scala-lang</groupId>
    <artifactId>scala-library</artifactId>
    <version>2.11.12</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.5</version>
</dependency>
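With the dependencies in place, opening a bare KuduClient session is a quick way to confirm that the jars resolve and the cluster is reachable. A minimal sketch, assuming a Kudu master at node2:7051 as in the examples below:

import org.apache.kudu.client.KuduClient
import scala.collection.JavaConverters._

object KuduSmokeTest {
  def main(args: Array[String]): Unit = {
    // Build a synchronous client against the Kudu master.
    val client = new KuduClient.KuduClientBuilder("node2:7051").build()
    try {
      // Listing the existing tables is enough to verify connectivity.
      val tables = client.getTablesList.getTablesList.asScala
      println(s"tables: ${tables.mkString(", ")}")
    } finally {
      client.close()
    }
  }
}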
Creating and dropping tables via the RDD API
import org.apache.kudu.client.CreateTableOptions
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object TableRDDOperating {

  // Create a Kudu table with a hash-partitioned primary key.
  def createKuduTable(tableName: String, kuduContext: KuduContext): Unit = {
    // Table schema: primary-key columns must be non-nullable.
    val schema: StructType = StructType(
      Seq(
        StructField("id", IntegerType, nullable = false),
        StructField("name", StringType, nullable = true),
        StructField("age", IntegerType, nullable = true),
        StructField("gender", StringType, nullable = true)
      )
    )
    // Primary-key columns.
    val keys: Seq[String] = Seq("id")
    // Partitioning and replication: 3 hash buckets on "id", 1 replica.
    import scala.collection.JavaConverters._
    val options: CreateTableOptions = new CreateTableOptions()
    options.addHashPartitions(keys.asJava, 3)
    options.setNumReplicas(1)
    val kuduTable = kuduContext.createTable(tableName, schema, keys, options)
    println(s"TableId: ${kuduTable.getTableId}")
  }

  // Drop the table if it exists.
  def dropKuduTable(tableName: String, kuduContext: KuduContext): Unit = {
    if (kuduContext.tableExists(tableName)) {
      kuduContext.deleteTable(tableName)
    }
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[2]")
      .config("spark.sql.shuffle.partitions", "2")
      .getOrCreate()

    // KuduContext takes the Kudu master address(es) and the SparkContext.
    val kuduContext: KuduContext = new KuduContext("node2:7051", spark.sparkContext)
    println(s"KuduContext: ${kuduContext}")

    createKuduTable("kudu_users", kuduContext)
    dropKuduTable("kudu_users", kuduContext)

    spark.stop()
  }
}
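Hash partitioning is not the only layout Kudu offers: tables can also be range-partitioned on the key. KuduContext.createTable takes a Spark StructType, so the range bounds (which need a Kudu-native Schema to build PartialRow values) are easier to express with the raw client. A sketch only; the table name kudu_users_range and the [0, 1000) id bound are invented for illustration:

import org.apache.kudu.{ColumnSchema, Schema, Type}
import org.apache.kudu.client.{CreateTableOptions, KuduClient}
import scala.collection.JavaConverters._

object RangePartitionSketch {
  def main(args: Array[String]): Unit = {
    val client = new KuduClient.KuduClientBuilder("node2:7051").build()
    try {
      // Kudu-native schema: key columns come first and are non-nullable.
      val schema = new Schema(Seq(
        new ColumnSchema.ColumnSchemaBuilder("id", Type.INT32).key(true).build(),
        new ColumnSchema.ColumnSchemaBuilder("name", Type.STRING).nullable(true).build()
      ).asJava)
      val options = new CreateTableOptions()
        .setRangePartitionColumns(Seq("id").asJava)
        .setNumReplicas(1)
      // One range tablet covering ids [0, 1000); rows outside it are rejected.
      val lower = schema.newPartialRow(); lower.addInt("id", 0)
      val upper = schema.newPartialRow(); upper.addInt("id", 1000)
      options.addRangePartition(lower, upper)
      client.createTable("kudu_users_range", schema, options)
    } finally {
      client.close()
    }
  }
}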
Data CRUD operations via the RDD API
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object DataRDDOperating {

  // Insert rows; insertRows fails if a primary key already exists.
  def insertData(spark: SparkSession, kuduContext: KuduContext, tableName: String): Unit = {
    val usersDF: DataFrame = spark.createDataFrame(
      Seq(
        (1001, "zhangsan", 23, "男"),
        (1002, "lisi", 22, "男"),
        (1003, "xiaohong", 24, "女"),
        (1004, "zhaoliu2", 33, "男")
      )
    ).toDF("id", "name", "age", "gender")
    kuduContext.insertRows(usersDF, tableName)
  }

  // Scan the table as an RDD[Row], projecting only the listed columns.
  def selectData(spark: SparkSession, kuduContext: KuduContext, tableName: String): Unit = {
    val kuduRDD: RDD[Row] = kuduContext.kuduRDD(
      spark.sparkContext,
      tableName,
      columnProjection = Seq("id", "name", "age", "gender")
    )
    kuduRDD.foreachPartition { iter =>
      println(s"partitionId = ${TaskContext.getPartitionId()}")
      iter.foreach { row => println(row.toString()) }
    }
  }

  // Update existing rows, matched by primary key.
  def updateData(spark: SparkSession, context: KuduContext, tableName: String): Unit = {
    val usersDF: DataFrame = spark.createDataFrame(
      Seq((1001, "xqh", 21, "男"))
    ).toDF("id", "name", "age", "gender")
    context.updateRows(usersDF, tableName)
  }

  // Upsert: update the row if the key exists, insert it otherwise.
  def upsertData(spark: SparkSession, context: KuduContext, tableName: String): Unit = {
    val usersDF: DataFrame = spark.createDataFrame(
      Seq(
        (1001, "zhangsanfeng", 24, "男"),
        (1005, "tianqi", 33, "男")
      )
    ).toDF("id", "name", "age", "gender")
    context.upsertRows(usersDF, tableName)
  }

  // Delete rows by primary key; the DataFrame only needs the key column(s).
  def deleteData(spark: SparkSession, context: KuduContext, tableName: String): Unit = {
    import spark.implicits._
    val usersDF: DataFrame = spark.sparkContext.parallelize(Seq(1001)).toDF("id")
    context.deleteRows(usersDF, tableName)
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[2]")
      .config("spark.sql.shuffle.partitions", "2")
      .getOrCreate()
    val kuduContext: KuduContext = new KuduContext("node2:7051", spark.sparkContext)
    val tableName = "kudu_users"

    insertData(spark, kuduContext, tableName)
    selectData(spark, kuduContext, tableName)
    updateData(spark, kuduContext, tableName)
    upsertData(spark, kuduContext, tableName)
    deleteData(spark, kuduContext, tableName)

    spark.stop()
  }
}
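To check what the sequence above actually left behind, counting over just the key column is cheap, since Kudu scans only the projected column. A small sketch, reusing the kudu_users table from the example:

import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.SparkSession

object CountRowsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("CountRowsSketch").master("local[2]").getOrCreate()
    val kuduContext = new KuduContext("node2:7051", spark.sparkContext)
    // Scan only the key column and count what survived the CRUD run.
    val remaining = kuduContext.kuduRDD(spark.sparkContext, "kudu_users", Seq("id")).count()
    println(s"rows remaining: $remaining")
    spark.stop()
  }
}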
Operating on Kudu with Spark SQL
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions.udf

object KuduSparkSQLOperation {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName.stripSuffix("$"))
      .master("local[2]")
      .config("spark.sql.shuffle.partitions", "2")
      .getOrCreate()
    import spark.implicits._

    // Load the Kudu table as a DataFrame.
    val kuduDF: DataFrame = spark.read
      .format("kudu")
      .option("kudu.master", "node2:7051")
      .option("kudu.table", "kudu_users")
      .load()

    // Normalize the gender values: 男 -> M, 女 -> F, anything else -> X.
    val gender_udf = udf(
      (gender: String) => {
        gender match {
          case "男" => "M"
          case "女" => "F"
          case _    => "X"
        }
      }
    )
    val etlDF: DataFrame = kuduDF.select(
      $"id", $"name", $"age",
      gender_udf($"gender").as("gender")
    )

    // Write back as an upsert; the Kudu data source only supports SaveMode.Append.
    etlDF
      .coalesce(1)
      .write
      .mode(SaveMode.Append)
      .format("kudu")
      .option("kudu.master", "node2:7051")
      .option("kudu.table", "kudu_users")
      .option("kudu.operation", "upsert")
      .save()

    spark.stop()
  }
}
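The example above stays in the DataFrame API; to run literal SQL, register the Kudu-backed DataFrame as a temporary view first. A sketch, assuming the kudu_users table from earlier (the view name kudu_users_view is made up):

import org.apache.spark.sql.SparkSession

object KuduTempViewSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("KuduTempViewSketch").master("local[2]").getOrCreate()
    // Load the Kudu table and expose it to the SQL engine as a temp view.
    spark.read
      .format("kudu")
      .option("kudu.master", "node2:7051")
      .option("kudu.table", "kudu_users")
      .load()
      .createOrReplaceTempView("kudu_users_view")
    // Plain SQL now runs against Kudu, with column projections pushed down.
    spark.sql("SELECT gender, COUNT(*) AS cnt FROM kudu_users_view GROUP BY gender").show()
    spark.stop()
  }
}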