This is mainly about operating HBase from Spark.
When integrating Spark with HBase, missing-class errors can be fixed by setting SPARK_CLASSPATH in spark-env.sh:
SPARK_CLASSPATH=/opt/hbase/lib/*
If you get java.lang.NoSuchMethodError or java.lang.NoSuchFieldError, it is most likely a jar conflict: keep the dependency jars in a dedicated directory and make sure no jar appears twice.
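Note that SPARK_CLASSPATH has been deprecated in newer Spark releases; on those versions the equivalent (shown here as an untested alternative) is the extraClassPath pair in spark-defaults.conf:
spark.driver.extraClassPath /opt/hbase/lib/*
spark.executor.extraClassPath /opt/hbase/lib/*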
The program is as follows:
import org.apache.spark._
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, HColumnDescriptor}
import org.apache.hadoop.hbase.client.{HBaseAdmin, HTable, Put, Delete, Result}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
object SparkHBase1 extends Serializable {
  def main(args: Array[String]) {
    val sc = new SparkContext("spark://centos.host1:7077", "SparkHBase")

    // Point the HBase client at ZooKeeper and the HMaster, and load hbase-site.xml
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.zookeeper.quorum", "centos.host1")
    conf.set("hbase.master", "centos.host1:60000")
    conf.addResource("/home/hadoop/software/hbase-0.92.2/conf/hbase-site.xml")
    conf.set(TableInputFormat.INPUT_TABLE, "user")
    // Create the "user" table with a "basic" column family if it does not exist yet
    val admin = new HBaseAdmin(conf)
    if (!admin.isTableAvailable("user")) {
      println("Table does not exist, creating it")
      val tableDesc = new HTableDescriptor("user")
      tableDesc.addFamily(new HColumnDescriptor("basic".getBytes()))
      admin.createTable(tableDesc)
    }
    // Put: write five rows into the "basic" column family
    val table = new HTable(conf, "user")
    for (i <- 1 to 5) {
      val put = new Put(Bytes.toBytes("row" + i))
      put.add(Bytes.toBytes("basic"), Bytes.toBytes("name"), Bytes.toBytes("value " + i))
      table.put(put)
    }
    table.flushCommits()
    // Delete: remove the whole row "row1"
    val delete = new Delete(Bytes.toBytes("row1"))
    table.delete(delete)
    // Scan: expose the table as an RDD of (ImmutableBytesWritable, Result) pairs
    val hbaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    hbaseRDD.cache()
    val count = hbaseRDD.count()
    println("HBase RDD Count:" + count)
    // Walk every row and print each KeyValue it contains
    val res = hbaseRDD.take(count.toInt)
    for (j <- 1 to count.toInt) {
      println("j: " + j)
      val rs = res(j - 1)._2
      val kvs = rs.raw
      for (kv <- kvs)
        println("rowkey:" + new String(kv.getRow()) +
          " cf:" + new String(kv.getFamily()) +
          " column:" + new String(kv.getQualifier()) +
          " value:" + new String(kv.getValue()))
    }
    System.exit(0)
  }
}
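The HTable loop above pushes every Put through the HBase client on the driver; writes can also go out through Spark itself. Below is a minimal sketch (not part of the original program) using the old-API TableOutputFormat with saveAsHadoopDataset, assuming the same conf, "user" table, and "basic" column family as above:

import org.apache.spark.SparkContext._ // pair-RDD save methods on older Spark
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.mapred.JobConf

// Reuse the HBase configuration from above and point the output format at "user"
val jobConf = new JobConf(conf)
jobConf.setOutputFormat(classOf[TableOutputFormat])
jobConf.set(TableOutputFormat.OUTPUT_TABLE, "user")

// TableOutputFormat expects (key, Put) pairs; the key itself is ignored
val puts = sc.parallelize(6 to 10).map { i =>
  val put = new Put(Bytes.toBytes("row" + i))
  put.add(Bytes.toBytes("basic"), Bytes.toBytes("name"), Bytes.toBytes("value " + i))
  (new ImmutableBytesWritable, put)
}
puts.saveAsHadoopDataset(jobConf)

This distributes the writes across the executors instead of funneling them all through the driver.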
The corresponding spark-shell session is as follows (first part):
scala> import org.apache.spark._
import org.apache.spark._
scala> import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.rdd.NewHadoopRDD
scala> import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configuration
scala> import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseConfiguration
scala> import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
scala> val configuration = HBaseConfiguration.create(); // initialize the configuration
configuration: org.apache.hadoop.conf.Configuration = Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hbase-default.xml, hbase-site.xml
scala> configuration.set("hbase.zookeeper.property.clientPort", "2181"); // set the ZooKeeper client port
scala> configuration.set("hbase.zookeeper.quorum", "localhost"); // set the ZooKeeper quorum
scala> configuration.set("hbase.master", "localhost:60000"); // set the HBase master
scala> configuration.addResource("/home/victor/software/hbase/conf/hbase-site.xml") // load the HBase configuration file
scala> import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.client.HBaseAdmin
scala> val hadmin = new HBaseAdmin(configuration);
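The transcript stops here. Mirroring the program above, a sketch of how the session would typically continue (not from the original log):

scala> hadmin.isTableAvailable("user")

scala> configuration.set(TableInputFormat.INPUT_TABLE, "user")

scala> val hbaseRDD = sc.newAPIHadoopRDD(configuration, classOf[TableInputFormat],
     |   classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
     |   classOf[org.apache.hadoop.hbase.client.Result])

scala> hbaseRDD.count()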