/**
 * Spark job that performs a full scan of one HBase table, replays each
 * row's cells from the "f", "h" and "mtdt" column families into a Put,
 * and writes the result into a second HBase table.
 *
 * args(0) = input table name, args(1) = output table name.
 */
object WriteHbase {

  def main(args: Array[String]): Unit = {
    // Build the HBase configuration and wire in the input/output tables.
    val conf = HBaseConfiguration.create()
    val tablenamein = args(0)
    val tablenameout = args(1)
    // Table the scan reads from.
    conf.set(TableInputFormat.INPUT_TABLE, tablenamein)
    // Table the Puts are written to.
    conf.set(TableOutputFormat.OUTPUT_TABLE, tablenameout)

    val sparkConf = new SparkConf()
    sparkConf.setAppName("read and write for hbase ")
    sparkConf.setMaster("local[3]")
    val sc = new SparkContext(sparkConf)

    // Configure the output format and target table on the Hadoop job.
    val newAPIJobConfiguration1 = Job.getInstance(conf)
    newAPIJobConfiguration1.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, tablenameout)
    newAPIJobConfiguration1.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

    // Full scan of the input HBase table.
    val rdd = sc.newAPIHadoopRDD(
      conf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // FIX: drop rows missing any required column family BEFORE converting.
    // checkNotEmptyKs existed but was never called, even though the original
    // comment described this exact filtering step.
    val final_rdd = rdd.filter(checkNotEmptyKs).map(forDatas)

    // forDatas returns null for rows that produced an empty Put; drop them.
    val save_rdd = final_rdd.filter(checkNull)

    // FIX: write save_rdd. The original saved final_rdd, discarding the
    // null-filter above — any null tuple would crash the save stage.
    save_rdd.saveAsNewAPIHadoopDataset(newAPIJobConfiguration1.getConfiguration)
    sc.stop()
  }

  /**
   * Returns true only when the row has at least one cell in each of the
   * required column families "f", "h" and "mtdt".
   *
   * Robustness fix: Result.getFamilyMap can return null when the family
   * is absent; the original called .asScala.isEmpty directly and would NPE.
   */
  def checkNotEmptyKs(f: ((ImmutableBytesWritable, Result))): Boolean = {
    val r = f._2
    Seq("f", "h", "mtdt").forall { family =>
      val cells = Option(r.getFamilyMap(Bytes.toBytes(family)))
      cells.exists(m => !m.asScala.isEmpty)
    }
  }

  /**
   * Converts one scanned row into a write operation, copying the cells of
   * the "f", "mtdt" and "h" families into a Put keyed by the same rowkey.
   *
   * Returns null when the row yields an empty Put — callers must filter
   * nulls afterwards (see checkNull).
   */
  def forDatas(f: (ImmutableBytesWritable, Result)): (ImmutableBytesWritable, Put) = {
    val r = f._2 // the scanned Result for this row
    val put: Put = new Put(r.getRow)
    val ks = Bytes.toBytes("f")
    val ks1 = Bytes.toBytes("h")
    val ks2 = Bytes.toBytes("mtdt")
    // FIX: the original's third loop iterated the "h" family but wrote the
    // cells with put.add(ks2, ...), silently storing "h" data under "mtdt".
    // Each family now writes back under its own name via a shared helper
    // (the original had three copy-pasted loops).
    copyFamily(r, put, ks)
    copyFamily(r, put, ks2)
    copyFamily(r, put, ks1)
    if (put.isEmpty) null else (new ImmutableBytesWritable(), put)
  }

  /**
   * Copies every cell of `family` from the Result into the Put.
   *
   * Values are passed through as raw bytes: the original round-tripped them
   * through Bytes.toString/Bytes.toBytes, which corrupts any value that is
   * not valid UTF-8 text.
   */
  private def copyFamily(r: Result, put: Put, family: Array[Byte]): Unit = {
    // getFamilyMap may return null for an absent family — treat as empty.
    Option(r.getFamilyMap(family)).foreach { m =>
      m.asScala.foreach { case (qualifier, value) =>
        // NOTE(review): Put.add is deprecated in HBase 1.x+ in favour of
        // addColumn; kept as `add` to match the HBase version this file
        // already compiles against.
        put.add(family, qualifier, value)
      }
    }
  }

  /** Drops the null tuples produced by forDatas for rows with empty Puts. */
  def checkNull(f: ((ImmutableBytesWritable, Put))): Boolean = f != null
}
Reading from HBase with Spark
Most recent recommended article published 2023-08-14 20:48:56