1. Create an HBase table with Spark and define its table properties
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object HBaseCreateTable {
def main(args: Array[String]) {
val TABLE_NAME = "test_yuan"
val hBaseConf = HBaseConfiguration.create()
hBaseConf.set(HConstants.ZOOKEEPER_QUORUM, "bq2.bq.cn,bq1.bq.cn")
hBaseConf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
val connect = ConnectionFactory.createConnection(hBaseConf)
val admin = connect.getAdmin
try {
if (admin.tableExists(TableName.valueOf(TABLE_NAME))) {
admin.disableTable(TableName.valueOf(TABLE_NAME))
admin.deleteTable(TableName.valueOf(TABLE_NAME))
}
// 2. Build the table descriptor
val h_table = new HTableDescriptor(TableName.valueOf(TABLE_NAME))
val column = new HColumnDescriptor("base".getBytes())
//column.setBlockCacheEnabled(true)
//column.setBlocksize(2222222)
// Add the column families
h_table.addFamily(column)
h_table.addFamily(new HColumnDescriptor("gps".getBytes()))
// 3. Create the table
admin.createTable(h_table)
val table = connect.getTable(TableName.valueOf(TABLE_NAME))
// Insert 5 rows of test data
for (i <- 1 to 5) {
// The row key
val put = new Put(Bytes.toBytes("row" + i))
// Columns must be added to an existing column family; the qualifier itself does not need to exist beforehand.
put.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"), Bytes.toBytes("value " + i))
put.addColumn(Bytes.toBytes("base"), Bytes.toBytes("famm"), Bytes.toBytes("value " + i))
table.put(put)
}
table.close()
} catch {
case ex: Exception => ex.printStackTrace()
} finally {
releaseConn(admin, connect)
}
}
def releaseConn(admin: Admin, connect: Connection): Unit = {
try {
if (admin != null) {
admin.close()
}
if (connect != null) {
connect.close()
}
} catch {
case ex: Exception => ex.printStackTrace()
}
}
}
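Note that HTableDescriptor and HColumnDescriptor are deprecated in the HBase 2.x client. A minimal sketch of the same table declaration with the builder API, assuming an HBase 2.x dependency and the table and family names used above:

// Hedged alternative for HBase 2.x: declare the table with TableDescriptorBuilder / ColumnFamilyDescriptorBuilder.
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, TableDescriptorBuilder}

val tableDesc = TableDescriptorBuilder.newBuilder(TableName.valueOf(TABLE_NAME))
.setColumnFamily(ColumnFamilyDescriptorBuilder.of("base"))
.setColumnFamily(ColumnFamilyDescriptorBuilder.of("gps"))
.build()
admin.createTable(tableDesc)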
2. Read data from HBase and write it to ES
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark.rdd.EsSpark

object HbaseToES {
def main(args: Array[String]): Unit = {
val zookeeper_quorum = "bq2.bq.cn,bq1.bq.cn"
val zookeeper_client_port = "2181"
val config = ConfigUtil.getConfig
val sparkConf = new SparkConf().setAppName("HbaseToES")
.set("es.nodes", config.getString("app.es.ips"))
.set("es.port", config.getString("app.es.port"))
.set("es.index.auto.create", "true")
.set("es.net.http.auth.user", config.getString("app.es.es_user_name"))
.set("es.net.http.auth.pass", config.getString("app.es.es_user_pass"))
val ssc = SparkSession.builder().appName("SparkFromHBase").master("local[*]").config(sparkConf).getOrCreate()
val sc = ssc.sparkContext
val tableName = "test_yuan"
val hBaseConf = HBaseConfiguration.create()
hBaseConf.set(HConstants.ZOOKEEPER_QUORUM, zookeeper_quorum)
hBaseConf.set(HConstants.ZOOKEEPER_CLIENT_PORT, zookeeper_client_port)
hBaseConf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.INPUT_TABLE, tableName)
// Read the table into an RDD; TableInputFormat here is the one from the org.apache.hadoop.hbase.mapreduce package
val hbaseRDD = sc.newAPIHadoopRDD(hBaseConf, classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
val result = hbaseRDD.map(x => x._2).map { result =>
(result.getRow,
result.getValue(Bytes.toBytes("base"), Bytes.toBytes("name")),
result.getValue(Bytes.toBytes("base"), Bytes.toBytes("address")),
result.getValue(Bytes.toBytes("gps"), Bytes.toBytes("geohash")))
}.map(row => testInsert(new String(row._1), new String(row._2), new String(row._3), new String(row._4)))
println("数据量 " + result.count())
//result.take(10).foreach(println)
EsSpark.saveToEs(result, "test/hbase")
}
case class testInsert(row_id: String,
name: String,
address: String,
geohash: String)
}
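Result.getValue returns null when a column is absent from a row, so new String(row._3) above throws a NullPointerException for rows that lack base:address or gps:geohash. A minimal null-safe sketch of the same mapping step, reusing hbaseRDD and testInsert from the example above:

// Sketch: decode each cell defensively; an absent column becomes an empty string instead of a NullPointerException.
val safeResult = hbaseRDD.map(_._2).map { result =>
val row_id = Bytes.toString(result.getRow)
val name = Option(result.getValue(Bytes.toBytes("base"), Bytes.toBytes("name"))).map(b => Bytes.toString(b)).getOrElse("")
val address = Option(result.getValue(Bytes.toBytes("base"), Bytes.toBytes("address"))).map(b => Bytes.toString(b)).getOrElse("")
val geohash = Option(result.getValue(Bytes.toBytes("gps"), Bytes.toBytes("geohash"))).map(b => Bytes.toString(b)).getOrElse("")
testInsert(row_id, name, address, geohash)
}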
3. Write data from Hive to HBase
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import scala.util.Try

object HiveToHBase {
def main(args: Array[String]): Unit = {
val zookeeper_quorum = "bq2.bq.cn,bq1.bq.cn"
val zookeeper_client_port = "2181"
val TABLE_NAME = "test_yuan"TABLE_NAME)
val sparkConf = new SparkConf().setAppName("HiveToHBase")
.setMaster("local[*]")
val ssc = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
val dataFrame = ssc.sql("select mobiletelephone,customername,address,gps,geohash from graph.user_07_10 where mobiletelephone is not null limit 10")
dataFrame.show(10)
dataFrame.rdd.map(x => {
val phone = Try(x(0).asInstanceOf[String]).getOrElse("0")
val name = Try(x(1).asInstanceOf[String]).getOrElse("")
val address = Try(x(2).asInstanceOf[String]).getOrElse("")
val gps = Try(x(3).asInstanceOf[String]).getOrElse("")
val geohash = Try(x(4).asInstanceOf[String]).getOrElse("")
// Row key: the phone number
// Column family, qualifier, value
val p = new Put(Bytes.toBytes(phone))
p.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"), Bytes.toBytes(name))
p.addColumn(Bytes.toBytes("base"), Bytes.toBytes("address"), Bytes.toBytes(address))
p.addColumn(Bytes.toBytes("gps"), Bytes.toBytes("gps"), Bytes.toBytes(gps))
p.addColumn(Bytes.toBytes("gps"), Bytes.toBytes("geohash"), Bytes.toBytes(geohash))
p
}).foreachPartition(iter => {
// Initialize the JobConf; TableOutputFormat must be the one from the org.apache.hadoop.hbase.mapred package!
val jobConf = new JobConf(HBaseConfiguration.create())
jobConf.set("hbase.zookeeper.quorum", zookeeper_quorum)
jobConf.set("hbase.zookeeper.property.clientPort", zookeeper_client_port)
// Use the MapReduce OutputFormat
jobConf.setOutputFormat(classOf[TableOutputFormat])
val table = new HTable(jobConf, TableName.valueOf(TABLE_NAME))
import scala.collection.JavaConversions._
table.put(seqAsJavaList(iter.toSeq))
table.close()
})
}
}
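Opening an HTable and calling put inside each partition works, but another common Spark pattern is to write the RDD of Puts through TableOutputFormat directly with saveAsHadoopDataset. A minimal sketch, assuming the result of the map above is kept in a hypothetical putRDD variable and the other names are the same as in the example:

// Sketch: write an RDD of Puts via the mapred TableOutputFormat instead of per-partition HTable.put.
import org.apache.hadoop.hbase.io.ImmutableBytesWritable

val jobConf = new JobConf(HBaseConfiguration.create())
jobConf.set("hbase.zookeeper.quorum", zookeeper_quorum)
jobConf.set("hbase.zookeeper.property.clientPort", zookeeper_client_port)
jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE_NAME)
jobConf.setOutputFormat(classOf[TableOutputFormat])
// TableOutputFormat expects (ImmutableBytesWritable, Put) pairs; the key is ignored when writing.
putRDD.map(put => (new ImmutableBytesWritable, put)).saveAsHadoopDataset(jobConf)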