1. POM dependencies
<repositories>
    <repository>
        <id>cloudera</id>
        <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
    </repository>
</repositories>

<dependencies>
    <dependency>
        <groupId>org.apache.kudu</groupId>
        <artifactId>kudu-client-tools</artifactId>
        <version>1.6.0-cdh5.14.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kudu</groupId>
        <artifactId>kudu-client</artifactId>
        <version>1.6.0-cdh5.14.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.kudu/kudu-spark2 -->
    <dependency>
        <groupId>org.apache.kudu</groupId>
        <artifactId>kudu-spark2_2.11</artifactId>
        <version>1.6.0-cdh5.14.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.1.0</version>
    </dependency>
</dependencies>
2. Example code
// Kudu-Spark example
import java.util

import org.apache.kudu.client.CreateTableOptions
import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.junit.Test

class KuduSpark {
  // Create the SparkSession
  val spark: SparkSession = SparkSession.builder().appName("kudu").master("local[6]").getOrCreate()
  import spark.implicits._

  // The SparkContext is needed as a constructor argument for the KuduContext
  private val sc: SparkContext = spark.sparkContext
  sc.setLogLevel("warn")

  // Comma-separated addresses of the Kudu masters
  val MASTER = "node01:7051,node02:7051,node03:7051"

  // Create the KuduContext
  val kuduContext = new KuduContext(MASTER, sc)

  // Table name
  val TABLENAME = "kudu_spark01"

  // Table schema
  val schema = StructType(
    StructField("id", IntegerType, nullable = false) ::
    StructField("name", StringType, nullable = false) ::
    StructField("age", IntegerType, nullable = false) :: Nil)

  // Primary key columns
  val key = Seq("id")

  // Table-creation options
  val options = new CreateTableOptions
  // addHashPartitions expects a java.util.List, so the Scala Seq must be converted first
  val list: util.List[String] = scala.collection.JavaConversions.seqAsJavaList(key)
  // Hash-partition on the key into three buckets
  options.addHashPartitions(list, 3)
  // Set the replica count (Kudu requires an odd number of replicas)
  options.setNumReplicas(3)
  // 1. Create the table
  @Test
  def createTable(): Unit = {
    // createTable takes: table name, schema, key columns, options
    // Only create the table if it does not already exist
    if (!kuduContext.tableExists(TABLENAME)) {
      kuduContext.createTable(TABLENAME, schema, key, options)
    }
  }
  // 2. Insert data
  @Test
  def insertData2Kudu(): Unit = {
    val df: DataFrame = Seq((1, "aa", 20), (2, "bb", 30), (3, "cc", 40)).toDF("id", "name", "age")
    // Two arguments: the DataFrame and the table name
    kuduContext.upsertRows(df, TABLENAME)
  }
  // 3. Query data
  @Test
  def queryData(): Unit = {
    val col = Seq("id", "name", "age")
    val rdd = kuduContext.kuduRDD(sc, TABLENAME, col)
    rdd.foreach(println(_))
  }
  // 4. Delete data
  @Test
  def deleteData(): Unit = {
    // The DataFrame must contain only the primary key columns
    val df: DataFrame = Seq(3).toDF("id")
    // Delete the rows matching the given keys
    kuduContext.deleteRows(df, TABLENAME)
  }
  // 5. Update data
  @Test
  def updateData(): Unit = {
    val df: DataFrame = Seq((100, "zz", 66), (1, "update", 66)).toDF("id", "name", "age")
    // upsertRows(): updates the row if it exists, inserts it otherwise
    // updateRows(): updates the row if it exists, throws an error otherwise
    kuduContext.upsertRows(df, TABLENAME)
  }
  // 6. Drop the table
  @Test
  def dropTable(): Unit = {
    kuduContext.deleteTable(TABLENAME)
  }
  // 7. Read and write Kudu data through the Spark DataFrame API
  @Test
  def test(): Unit = {
    // 1. Read data
    // The .kudu shorthand requires this import
    import org.apache.kudu.spark.kudu._
    val read = spark.read
      .option("kudu.master", MASTER)
      .option("kudu.table", TABLENAME)
      .kudu
    // 2. Write data
    // To avoid write failures, write into a separate table, which must be created first
    val tableName = "spark01"
    kuduContext.createTable(tableName, schema, key, options)
    // Note: writing through the DataFrame API only supports Append mode
    read.write.mode(SaveMode.Append)
      .option("kudu.master", MASTER)
      .option("kudu.table", tableName)
      .kudu
  }
}
Notes:
1. Import the Kudu helper functions:
import org.apache.kudu.spark.kudu._
2. The primary key columns are a Seq and must be converted to a Java list:
val key = Seq("id")
val list: util.List[String] = scala.collection.JavaConversions.seqAsJavaList(key)
3. The CreateTableOptions must specify the hash-partition columns and the replica count.
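For reference, a minimal sketch of the setup from notes 2 and 3 in one place. Both CreateTableOptions calls return the options object, so they can be chained; the bucket and replica counts (3 and 3) follow the example above, and scala.collection.JavaConverters with key.asJava is the non-deprecated alternative to JavaConversions in Scala 2.11:

import java.util
import org.apache.kudu.client.CreateTableOptions
import scala.collection.JavaConverters._

val key = Seq("id")
// Convert the Scala Seq to the java.util.List that addHashPartitions expects
val list: util.List[String] = key.asJava

val options = new CreateTableOptions()
  .addHashPartitions(list, 3) // partition columns and bucket count
  .setNumReplicas(3)          // replica count; Kudu requires an odd number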