Introduction to Kudu
Differences between Kudu, HDFS, and HBase
In short, Kudu sits between the two: it offers HBase-style fast random reads and writes together with HDFS-style fast sequential scans for analytics.
Kudu Installation and Deployment
Option 1: Docker deployment
https://kudu.apache.org/docs/quickstart.html
Option 2: RPM deployment
Kudu package download address:
https://archive.cloudera.com/cdh5/redhat/7/x86_64/cdh/5.15.1/RPMS/x86_64/
API Operations: Inserting Data, Deleting Tables, and Querying Data
<dependency>
  <groupId>org.apache.kudu</groupId>
  <artifactId>kudu-client</artifactId>
  <version>1.7.0</version>
</dependency>
package com.imooc.bigdata.chapter07

import java.util

import org.apache.kudu.{ColumnSchema, Schema, Type}
import org.apache.kudu.client._

object KuduAPIApp {

  def main(args: Array[String]): Unit = {
    val KUDU_MASTERS = "hadoop000"
    val client: KuduClient = new KuduClient.KuduClientBuilder(KUDU_MASTERS).build()

    val tableName = "ods"

    // createTable(client, tableName)
    // insertRows(client, tableName)
    // deleteTable(client, "ods")
    // deleteTable(client, "province_city_stat")
    deleteTable(client, "ods_20181007")
    deleteTable(client, "province_city_stat_20181007")
    deleteTable(client, "app_stat_20181007")
    deleteTable(client, "area_stat_20181007")
    // deleteTable(client, "region_stat")
    // deleteTable(client, "app_stat")
    // createTable(client, tableName)
    // query(client, tableName)
    // println("======")
    // alterRow(client, tableName)
    // println(".........")
    // query(client, tableName)
    // val newTableName = "pk_kudu"
    // renameTable(client, tableName, newTableName)

    client.close()
  }
  /**
   * Rename a table
   */
  def renameTable(client: KuduClient, tableName: String, newTableName: String) = {
    val options: AlterTableOptions = new AlterTableOptions()
    options.renameTable(newTableName)
    client.alterTable(tableName, options)
  }
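Renaming is only one of the schema changes AlterTableOptions supports. As a hedged sketch, here is how adding a nullable column might look; the column name "remark" is made up for illustration:

  def addColumn(client: KuduClient, tableName: String): Unit = {
    val options: AlterTableOptions = new AlterTableOptions()
    // add a new nullable STRING column; existing rows will carry NULL for it
    options.addNullableColumn("remark", Type.STRING)
    client.alterTable(tableName, options)
  }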
  /**
   * Update table data
   */
  def alterRow(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)
    val session: KuduSession = client.newSession()

    val update: Update = table.newUpdate()
    val row: PartialRow = update.getRow
    row.addString("word", "pk-10") // primary key identifying the row to update
    row.addInt("cnt", 8888)        // new value for the cnt column
    session.apply(update)
  }
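Deleting a single row follows the same session pattern as the update above; a minimal sketch, assuming the word/cnt schema used throughout this example:

  def deleteRow(client: KuduClient, tableName: String): Unit = {
    val table: KuduTable = client.openTable(tableName)
    val session: KuduSession = client.newSession()

    val delete: Delete = table.newDelete()
    // for a delete, only the primary key columns need to be set
    delete.getRow.addString("word", "pk-10")
    session.apply(delete)
  }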
  /**
   * Query data
   */
  def query(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName)
    val scanner: KuduScanner = client.newScannerBuilder(table).build()
    while (scanner.hasMoreRows) {
      val iterator: RowResultIterator = scanner.nextRows()
      while (iterator.hasNext) {
        val result: RowResult = iterator.next()
        println(result.getString("word") + " => " + result.getInt("cnt"))
      }
    }
  }
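The scan above returns every row in the table. Kudu can also push filters down to the tablet servers via KuduPredicate; a hedged sketch of a filtered variant (the threshold 105 is arbitrary):

  def queryWithPredicate(client: KuduClient, tableName: String): Unit = {
    import org.apache.kudu.client.KuduPredicate.ComparisonOp

    val table: KuduTable = client.openTable(tableName)
    // the predicate is evaluated server-side, so only matching rows cross the network
    val predicate = KuduPredicate.newComparisonPredicate(
      table.getSchema.getColumn("cnt"), ComparisonOp.GREATER, 105)

    val scanner: KuduScanner = client.newScannerBuilder(table)
      .addPredicate(predicate)
      .build()

    while (scanner.hasMoreRows) {
      val iterator: RowResultIterator = scanner.nextRows()
      while (iterator.hasNext) {
        val result: RowResult = iterator.next()
        println(result.getString("word") + " => " + result.getInt("cnt"))
      }
    }
  }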
  /**
   * Delete a table
   */
  def deleteTable(client: KuduClient, tableName: String) = {
    client.deleteTable(tableName)
  }
  /**
   * Insert data
   *
   * Homework: research how to do batch inserts on your own
   */
  def insertRows(client: KuduClient, tableName: String) = {
    val table: KuduTable = client.openTable(tableName) // look up the Kudu table by name
    val session: KuduSession = client.newSession()     // analogous to a JPA/Hibernate session

    for (i <- 1 to 10) {
      val insert: Insert = table.newInsert()
      val row: PartialRow = insert.getRow
      row.addString("word", s"pk-$i")
      row.addInt("cnt", 100 + i)
      session.apply(insert)
    }
  }
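For the batch-insert homework, one possible answer is to switch the session into MANUAL_FLUSH mode so that operations are buffered client-side and sent in one round trip; a minimal sketch, assuming the same word/cnt schema (row count and buffer size are arbitrary):

  def batchInsertRows(client: KuduClient, tableName: String): Unit = {
    import org.apache.kudu.client.SessionConfiguration.FlushMode

    val table: KuduTable = client.openTable(tableName)
    val session: KuduSession = client.newSession()
    session.setFlushMode(FlushMode.MANUAL_FLUSH) // buffer mutations instead of sending one by one
    session.setMutationBufferSpace(1000)         // max number of buffered operations

    for (i <- 1 to 100) {
      val insert: Insert = table.newInsert()
      val row: PartialRow = insert.getRow
      row.addString("word", s"pk-batch-$i")
      row.addInt("cnt", 100 + i)
      session.apply(insert) // buffered, not yet sent
    }

    session.flush() // send all buffered inserts at once
    session.close()
  }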
  /**
   * Create a table
   */
  def createTable(client: KuduClient, tableName: String): Unit = {
    import scala.collection.JavaConverters._

    val columns = List(
      new ColumnSchema.ColumnSchemaBuilder("word", Type.STRING).key(true).build(),
      new ColumnSchema.ColumnSchemaBuilder("cnt", Type.INT32).build()
    ).asJava

    val schema = new Schema(columns)

    val options: CreateTableOptions = new CreateTableOptions()
    // set the number of replicas
    options.setNumReplicas(1)

    val parcols: util.LinkedList[String] = new util.LinkedList[String]()
    parcols.add("word")
    options.addHashPartitions(parcols, 3)

    client.createTable(tableName, schema, options)
  }
}
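createTable above uses hash partitioning; Kudu also supports range partitioning. A hedged sketch of a variant that could sit alongside the methods above (the split points "a" and "n" are made up for illustration, and inserts outside the added range would be rejected):

  def createRangePartitionedTable(client: KuduClient, tableName: String): Unit = {
    import scala.collection.JavaConverters._

    val columns = List(
      new ColumnSchema.ColumnSchemaBuilder("word", Type.STRING).key(true).build(),
      new ColumnSchema.ColumnSchemaBuilder("cnt", Type.INT32).build()
    ).asJava
    val schema = new Schema(columns)

    val options: CreateTableOptions = new CreateTableOptions()
    options.setNumReplicas(1)
    // partition the key column by range instead of by hash
    options.setRangePartitionColumns(List("word").asJava)

    // one tablet covers "a" <= word < "n" (lower bound inclusive, upper exclusive)
    val lower = schema.newPartialRow()
    lower.addString("word", "a")
    val upper = schema.newPartialRow()
    upper.addString("word", "n")
    options.addRangePartition(lower, upper)

    client.createTable(tableName, schema, options)
  }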
Integrating Spark SQL with Kudu
<dependency>
  <groupId>org.apache.kudu</groupId>
  <artifactId>kudu-spark2_2.11</artifactId>
  <version>1.7.0</version>
</dependency>
package com.imooc.bigdata.chapter07

import java.util.Properties

import com.typesafe.config.ConfigFactory
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object SparkKuduApp {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .master("local").getOrCreate()

    import spark.implicits._

    val config = ConfigFactory.load()
    val url = config.getString("db.default.url")
    val user = config.getString("db.default.user")
    val password = config.getString("db.default.password")
    val driver = config.getString("db.default.driver")
    val database = config.getString("db.default.database")
    val table = "wc"

    val connectionProperties = new Properties()
    connectionProperties.put("user", user)
    connectionProperties.put("password", password)
    connectionProperties.put("driver", driver) // register the JDBC driver class explicitly

    // TODO... the code above is the Load step
    // TODO... this is the business logic you need to develop
    // 1. read data from MySQL
    val jdbcDF: DataFrame = spark.read
      .jdbc(url, s"$database.$table", connectionProperties).filter($"cnt" > 11)

    // TODO... the code below is the Sink step
    val kuduMasters = "hadoop000"

    // how a custom data source implements load/save
    // 2. write to Kudu
    jdbcDF.write.mode(SaveMode.Append).format("org.apache.kudu.spark.kudu")
      .option("kudu.master", kuduMasters)
      .option("kudu.table", "pk")
      .save()

    // read data back from Kudu
    spark.read.format("org.apache.kudu.spark.kudu")
      .option("kudu.master", kuduMasters)
      .option("kudu.table", "app_stat_20181007")
      .load().show()

    spark.stop()
  }
}
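The DataFrame reader/writer above is one integration path; the kudu-spark module also provides a KuduContext for table management and direct writes. A minimal sketch of code that could be appended at the end of main, reusing spark, kuduMasters, and jdbcDF from above; it assumes the wc table's key column is word, and note that Kudu requires key columns to be non-nullable in the DataFrame schema:

    import org.apache.kudu.client.CreateTableOptions
    import org.apache.kudu.spark.kudu.KuduContext
    import scala.collection.JavaConverters._

    // a KuduContext wraps a Kudu client tied to this Spark application
    val kuduContext = new KuduContext(kuduMasters, spark.sparkContext)

    // create the target table from the DataFrame schema if it does not exist yet
    if (!kuduContext.tableExists("pk")) {
      kuduContext.createTable("pk", jdbcDF.schema, Seq("word"),
        new CreateTableOptions()
          .setNumReplicas(1)
          .addHashPartitions(List("word").asJava, 3))
    }

    // write the DataFrame rows through the Kudu client directly
    kuduContext.insertRows(jdbcDF, "pk")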