Reading data from HBase in parallel with Spark, and submitting via spark-shell

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.sql.SparkSession
import org.slf4j.LoggerFactory
import org.apache.hadoop.hbase.client.{Scan,Result}
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp
import org.apache.hadoop.hbase.filter.{ FilterList, SingleColumnValueFilter,SubstringComparator}

// Kryo serialization is also set via --conf at submit time (see the spark-shell command below);
// setting it here only takes effect if done before the SparkContext is created.
System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

val ss: SparkSession = SparkSession.builder().enableHiveSupport().getOrCreate()
val sc = ss.sparkContext

val hbaseColumns = Array("c:tt", "c:ad_final_url","c:url","c:delivery_countries","c:la","c:ad_picture", "c:campaign_source", "c:ad_merchant_id")

// SCAN_COLUMNS expects a space-separated list of "family:qualifier" names
val queryColumns = hbaseColumns.mkString(" ")

val hBaseConfiguration = HBaseConfiguration.create()
hBaseConfiguration.set(TableInputFormat.INPUT_TABLE, "table_name")   // the HBase table to read
hBaseConfiguration.set(TableInputFormat.SCAN_COLUMNS, queryColumns)
// Build the filter: (campaign_source == "k" OR campaign_source == "ag")
//               AND (delivery_countries contains "ww" OR delivery_countries contains "sg")
val filterListOR1 = new FilterList(FilterList.Operator.MUST_PASS_ONE)
val filterListOR2 = new FilterList(FilterList.Operator.MUST_PASS_ONE)
val filterListAND = new FilterList(FilterList.Operator.MUST_PASS_ALL)
val scan = new Scan()

val filter1 = new SingleColumnValueFilter(Bytes.toBytes("c"), Bytes.toBytes("campaign_source"), CompareOp.EQUAL, Bytes.toBytes("k"))
val filter2 = new SingleColumnValueFilter(Bytes.toBytes("c"), Bytes.toBytes("campaign_source"), CompareOp.EQUAL, Bytes.toBytes("ag"))
// Note: by default SingleColumnValueFilter also lets through rows that do not contain the column at all;
// call setFilterIfMissing(true) on each filter if such rows should be excluded.
filterListOR1.addFilter(filter1)
filterListOR1.addFilter(filter2)

val filter3 = new SingleColumnValueFilter(Bytes.toBytes("c"), Bytes.toBytes("delivery_countries"), CompareOp.EQUAL, new SubstringComparator("ww"))
val filter4 = new SingleColumnValueFilter(Bytes.toBytes("c"), Bytes.toBytes("delivery_countries"), CompareOp.EQUAL, new SubstringComparator("sg"))
filterListOR2.addFilter(filter3)
filterListOR2.addFilter(filter4)

filterListAND.addFilter(filterListOR1)
filterListAND.addFilter(filterListOR2)

// Serialize the Scan (with its filters) and hand it to TableInputFormat
scan.setFilter(filterListAND)
hBaseConfiguration.set(TableInputFormat.SCAN, Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray))

val sqlContext = ss.sqlContext
import sqlContext.implicits._

// newAPIHadoopRDD creates one input split (and thus one Spark partition) per HBase region,
// which is where the parallel read comes from
val hbaseRDD = sc.newAPIHadoopRDD(hBaseConfiguration, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])

val dataRDD = hbaseRDD.map({ case (_, result) =>
    val item_id = Bytes.toString(result.getRow)
    val ad_final_url = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("ad_final_url")))
    val image = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("ad_picture")))
    val campaign_source = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("campaign_source")))
    val delivery_countries = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("delivery_countries")))
    val language = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("la")))
    val title = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("tt")))
    val url = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("url")))
    val ad_merchant_id = Bytes.toString(result.getValue(Bytes.toBytes("c"), Bytes.toBytes("ad_merchant_id")))
    // Join the fields with "\001" (Hive's default field delimiter) so the output can be loaded as a Hive text table
    Array(item_id, ad_final_url, image, campaign_source, delivery_countries, language, title, url, ad_merchant_id).mkString("\001")
})



// Quick sanity checks:
//dataRDD.toDF().show(20, false)
//println(dataRDD.count())

dataRDD.saveAsTextFile("hdfs://path")
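
If the result is to be used with Spark SQL rather than dumped as \001-delimited text, the same RDD can be turned into a DataFrame. A minimal sketch, assuming the ss and dataRDD defined above and the sqlContext implicits already imported; the column names and the target Hive table here are illustrative, not from the original job:

// Illustrative column names, matching the field order concatenated above
val cols = Seq("item_id", "ad_final_url", "image", "campaign_source",
  "delivery_countries", "language", "title", "url", "ad_merchant_id")

val df = dataRDD
  .map(_.split("\001", -1))                                    // limit -1 keeps trailing empty fields
  .map(a => (a(0), a(1), a(2), a(3), a(4), a(5), a(6), a(7), a(8)))
  .toDF(cols: _*)

df.show(20, false)
//df.write.mode("overwrite").saveAsTable("some_db.hbase_ad_snapshot")   // hypothetical Hive table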

Querying directly from the HBase shell

scan 'table_name', {LIMIT => 5}

scan 'table_name', {COLUMNS => ['c:tt', 'c:ad_final_url'], LIMIT => 20}

Submitting with spark-shell

spark-shell \
--name "fetch_hbase_test" \
--master yarn \
--deploy-mode client \
--num-executors 4 \
--executor-cores 2 \
--executor-memory 3G \
--driver-memory 5G \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=10000 \
--conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \
--conf spark.shuffle.memoryFraction=0.3 \
--conf spark.sql.shuffle.partitions=1000 \
--conf spark.default.parallelism=1000 \
--jars $path/hbase-client-1.0.2.jar \
--conf spark.hbase.obtainToken.enabled=true \
--conf spark.yarn.security.credentials.hbase.enabled=true \
--files $path/conf/hdfs-site.xml
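
Once the shell is up, the Scala code above can be pasted in directly (the :paste REPL command handles multi-line blocks), or saved to a file and preloaded with spark-shell's -i option, e.g. -i fetch_hbase.scala (the file name is just an example).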

Usually adding hbase-client-1.0.2.jar is enough, but in some environments (for reasons that are unclear) the job only works with the following jars. Note that --jars takes a single comma-separated list; repeating the flag does not accumulate jars:

--jars $path/spark-hbase_2.11-2.3.2.jar,$path/hbase-hadoop-compat-1.3.1.jar,$path/hbase-common-1.3.1.jar,$path/hbase-client-1.3.1-6407.jar \