Using Kerberos-authenticated HBase in a Spark environment

With Hadoop's MapReduce computing framework, storing data on a Kerberos-secured HDFS needs no extra handling: YARN performs the authentication internally, so reads and writes work directly.

Using Kerberos-authenticated HBase from Spark is a problem that is both troublesome and simple. Troublesome because Chinese-language articles on the subject are few and cover only scattered pieces of the puzzle, and the official documentation is not complete either, so if you just go and use it you will still stumble into pits of your own. Simple because the amount of code is small and not hard to understand. The Kerberos authentication in this article uses the keytab method.

1. The write path to Kerberos-authenticated HBase

The configuration files are packed into the jar during the Maven build, and the driver distributes that jar to each executor's cache. The .properties configuration files can still be used, because under the hood they are read out of the jar via reflection (the classloader). A keytab, however, is a standalone file that is not handled as a properties resource: the program can only read it from a directory on disk, so the traditional Kerberos login in the code fails with a file-not-found error.
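
For reference, this is roughly how a properties file packaged in the jar is read through the classloader; the file name here is illustrative:

import java.util.Properties

val props = new Properties()
// getResourceAsStream resolves the file inside the jar on the classpath
props.load(getClass.getResourceAsStream("/uidss.properties"))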

There are two ways to solve this: 1. On the SparkContext, call sc.addFile(...) to distribute the file to the executors, and call SparkFiles.get(...) at the point where the HBase I/O code runs to get the file back. 2. Pass --files xx.keytab on the spark-submit command line to ship the file to the executors, then use the same reflection trick used to locate the jar to build the keytab's path and read the keytab file.
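
A minimal sketch of both approaches, assuming sc is the SparkContext, rdd is the data being written, and the keytab is named dev_yx.keytab (paths are illustrative):

// Approach 1: ship the keytab with addFile and resolve it on the executors with SparkFiles.get
sc.addFile("/opt/keytab/dev_yx.keytab")
rdd.foreachPartition { rows =>
  val keytabPath = SparkFiles.get("dev_yx.keytab")   // executor-local copy of the keytab
  // pass keytabPath to the Kerberos login before creating the HBase connection
}

// Approach 2: distribute the keytab with spark-submit and rebuild its path from the classpath root,
// which is what KerberorsJavaUtil.getHBaseAuthentication below does:
//   spark-submit --files dev_yx.keytab --class <main class> <app jar> ...
var keyFilePath = getClass.getResource("/").getPath
if (keyFilePath.startsWith("file")) keyFilePath = keyFilePath.substring(5)
keyFilePath = keyFilePath + "./" + "dev_yx.keytab"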

Finally, pass the authenticated user in when the connection is created; that connection can then be used to write to HBase.

2. The read path from Kerberos-authenticated HBase

Obtain the authenticated user the same way as in the write path, but the connection used for reading HBase has to be created inside the TableInputFormat passed to

sc.newAPIHadoopRDD(hconf, classOf[TableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result])

That is, subclass TableInputFormat and override the connection creation in its code, then plug the resulting MyTableInputFormat into this call.

3. The code involved in the authentication process is as follows:

// This class handles the HBase I/O operations; it calls the relevant authentication methods of the Kerberos utility class (KerberorsJavaUtil).

import java.util.Properties

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.HConnectionManager
import org.apache.hadoop.hbase.client.HTableInterface
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkContext
import org.apache.spark.SparkFiles
import org.apache.spark.rdd.RDD

import cn.ctyun.UIDSS.hgraph.HGraphUtil
import cn.ctyun.UIDSS.utils.Hash
import cn.ctyun.UIDSS.utils.KerberorsJavaUtil
import cn.ctyun.UIDSS.utils.Logging

object HBaseIO extends Logging {

  def getGraphTableRDD(sc: SparkContext, props: Properties): RDD[(ImmutableBytesWritable, Result)] = {
    val hconf = HBaseConfiguration.create()

    //set zookeeper quorum
    hconf.set("hbase.zookeeper.quorum", props.getProperty("hbaseZkIp"));
    //set zookeeper port
    hconf.set("hbase.zookeeper.property.clientPort", props.getProperty("hbaseZkPort"));    
    hconf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    hconf.set("hbase.zookeeper.property.maxClientCnxns", props.getProperty("hbase_zookeeper_property_maxClientCnxns"));
    hconf.set("hbase.client.retries.number", props.getProperty("hbase_client_retries_number"));    
    hconf.addResource("core-site.xml")
    hconf.addResource("hbase-site.xml")
    hconf.addResource("hdfs-site.xml")

    //set which table to scan
    //=== use MyTableInputFormat, a TableInputFormat subclass with Kerberos authentication added ===
    hconf.set(MyTableInputFormat.INPUT_TABLE, props.getProperty("hbaseTableName"))

    //println(getNowDate() + " ****** Start reading from HBase   ******")
    val rdd = sc.newAPIHadoopRDD(hconf, classOf[MyTableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result]).cache()
    //println(getNowDate() + " ****** Finished reading from HBase   ******")

    // iterate and print the results (debug)
    //    rdd.foreach {
    //      case (_, result) =>
    //        val key = Bytes.toString(result.getRow.drop(2))
    //        //println("Row key:" + key)
    //        for (c <- result.rawCells()) {
    //          val dst = Bytes.toString(c.getQualifier)
    //          var value = 0
    //          try {
    //            value = Bytes.toInt(c.getValue)
    //          } catch {
    //            case _: Throwable =>
    //          }
    //          //println("        column is: " + dst + " ;  value is: " + value)
    //        }
    //    }
    rdd
  }

  def saveToGraphTable(sc: SparkContext, props: Properties, rddToSave: RDD[((String, String), String)]): Int = {
    info("------Writing data to Graph table start--------")
    var rddToSavePartition = rddToSave
    
    val partNumHBaseO = props.getProperty("rddPartNumHBaseO").toInt
//    if (partNumHBaseO > 0) {
//      rddToSavePartition = rddToSave.repartition(partNumHBaseO)
//      val cnt= rddToSavePartition.count().toString() 
//      info(" ******  Writing " + cnt + " rows to HBase ******")
//      println(" ******  Writing " + cnt + " rows to HBase ******")
//    } 
    
    // write out each partition in parallel
    info("------foreachPartition write data start--------")
    rddToSavePartition.foreachPartition {
      // all rows within one partition
      case (rows) =>
        //println("        column is: " + this.getClass.getClassLoader().getResource(""))
        val hconf = HBaseConfiguration.create()
        info("---------each partition create HBaseConfiguration-----------")
        //set zookeeper quorum
        hconf.set("hbase.zookeeper.quorum", props.getProperty("hbaseZkIp"))
        //set zookeeper port
        hconf.set("hbase.zookeeper.property.clientPort", props.getProperty("hbaseZkPort"))           
        hconf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        hconf.set("hbase.zookeeper.property.maxClientCnxns", props.getProperty("hbase_zookeeper_property_maxClientCnxns"))
        hconf.set("hbase.client.retries.number", props.getProperty("hbase_client_retries_number"))
        hconf.set("hbase.client.pause", "1000")
        hconf.set("zookeeper.recovery.retry", "3")
        
        hconf.addResource("core-site.xml")
        hconf.addResource("hbase-site.xml")
        hconf.addResource("hdfs-site.xml")   
        //=========get HBase authenticated user==========
        val loginedUser = KerberorsJavaUtil.getAuthenticatedUser(hconf,props,props.getProperty("keytabFile"))
        val connection = HConnectionManager.createConnection(hconf,loginedUser)
        info("------HBase connection is created--------")
        val htable: HTableInterface = connection.getTable(TableName.valueOf(props.getProperty("hbaseTableName")))

        // batched writes
        val flushInBatch = props.getProperty("flushInBatch")
        val sWaitForHBase = props.getProperty("waitForHBase")
        val batchSize = props.getProperty("batchSize")
        
        var waitForHBase = 0
        if (flushInBatch != null && "1".compareToIgnoreCase(flushInBatch) == 0) {
          htable.setAutoFlushTo(false);
          htable.setWriteBufferSize(1024 * 1024 * batchSize.toInt);
          if (sWaitForHBase != null && sWaitForHBase.toInt > 0) {
            waitForHBase = sWaitForHBase.toInt
          }
        }

        //println(getNowDate() + " ****** Start writing to HBase   ******")

        var rowCount = 0 
        
//        for (row <- rows.toArray) (
        info("------HBase write data start--------")
        for (row <- rows) (
          {
            // row: ((row key, column), value)
            var src: String =  Hash.getHashString(row._1._1) + row._1._1 
            var dst: String = row._1._2
            var prop: Int = row._2.toInt
            //println("Row is: " + src + " ;column is: " + dst + " ; value is: " + prop)

            val put = new Put(Bytes.toBytes(src))
            put.add(HGraphUtil.COLUMN_FAMILY, Bytes.toBytes(dst), Bytes.toBytes(prop))
            put.setWriteToWAL(false)
            htable.put(put)
            
            rowCount = rowCount +1

            // throttle the write rate
            if ((rowCount % 1000)==0 && waitForHBase >0) { Thread.sleep(waitForHBase)}
          })
        //println(getNowDate() + " ****** Finished writing to HBase   ******")  
        try{
          info("=======prepare to flushCommits======")
          htable.flushCommits()
          info("=======flushCommits finished======")
        }catch {
          case e: Exception =>
          info("=======flushCommits failed=======")
        }
        htable.close();          
        //println(getNowDate() + " ****** Flushed  to HBase   ******")
        info("------HBase write data finished--------")
    }
    1
  }
}
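
For completeness, here is a driver-side sketch of how the pieces fit together. HBaseIO, getGraphTableRDD, saveToGraphTable and the property keys are the ones defined above; the object name, app name and properties file name are illustrative:

import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}

object GraphJob {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("UIDSS"))

    // job properties are packaged in the jar and read from the classpath
    val props = new Properties()
    props.load(getClass.getResourceAsStream("/uidss.properties"))

    // the keytab itself is shipped with: spark-submit --files dev_yx.keytab ... (approach 2 above)

    // read: scan the graph table through the Kerberos-aware MyTableInputFormat
    val graphRdd = HBaseIO.getGraphTableRDD(sc, props)
    println("rows read: " + graphRdd.count())

    // write: ((rowKey, column), value) records go back through saveToGraphTable
    // HBaseIO.saveToGraphTable(sc, props, someRdd)

    sc.stop()
  }
}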
// This is the Kerberos utility class; it performs the Kerberos authentication for the user.

import java.io.IOException;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.log4j.Logger;

public class KerberorsJavaUtil {
	private static final Logger LOG = Logger.getLogger(KerberorsJavaUtil.class);
	
	public static void getHBaseAuthentication(Configuration hconf,Properties props,String keytabFile){
		//get the path of the keytab file that was shipped with spark-submit "--files"
		String keyFilePath = KerberorsJavaUtil.class.getResource("/").getPath();
		LOG.info("=====file path====="+keyFilePath);
	    if(keyFilePath.startsWith("file")){
	    	keyFilePath = keyFilePath.substring(5);
	    }
	    //loginUserFromKeytab expects keyFilePath in a form like "AAA/XXX/./keyFile"
	    keyFilePath = keyFilePath+"./"+keytabFile;
		LOG.info("------Start Get HBaseAuthentication-----");
		System.setProperty("java.security.krb5.conf",props.getProperty("krb5ConfDir"));
		hconf.set("hbase.security.authentication","kerberos");  
		hconf.set("hadoop.security.authentication","Kerberos");
		//the HMaster principal (hbase.master.kerberos.principal from hbase-site.xml)
		hconf.set("hbase.master.kerberos.principal",props.getProperty("masterPrin")); 
		//the RegionServer principal (hbase.regionserver.kerberos.principal from hbase-site.xml)
		hconf.set("hbase.regionserver.kerberos.principal",props.getProperty("regionPrin"));  
		UserGroupInformation.setConfiguration(hconf);  
	    try {
	    	//Kerberos login: specify the principal (user) and the path to the keytab file.
	    	LOG.info("------dev_yx.keytab path is---"+keyFilePath);
	    	UserGroupInformation.loginUserFromKeytab(props.getProperty("userName"),keyFilePath);
	    	LOG.info("------Get HBaseAuthentication Successed-----");
	    } catch (Exception e) {  
	        LOG.error("Get HBaseAuthentication Failed",e);  
	    }
	   
	}
	
	public static User getAuthenticatedUser(Configuration hconf,Properties props,String keytabFile){
		getHBaseAuthentication(hconf,props,keytabFile);		
		User loginedUser = null;
	    try {
	    	LOG.info("=====put the logined userinfomation to user====");
			loginedUser = User.create(UserGroupInformation.getLoginUser());
		} catch (IOException e) {
			LOG.error("===fialed put the logined userinfomation to user===",e);
		}	    
	    return loginedUser;
	}
	
}
// This class extends TableInputFormatBase and overrides the connection creation using the Kerberos utility class, so that reads from HBase go through an authenticated user.


import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.classification.InterfaceStability;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableInputFormatBase;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.util.StringUtils;

import cn.ctyun.UIDSS.utils.KerberorsJavaUtil;

/**
 * Convert HBase tabular data into a format that is consumable by Map/Reduce.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class MyTableInputFormat extends TableInputFormatBase
implements Configurable {

  @SuppressWarnings("hiding")
  private static final Log LOG = LogFactory.getLog(MyTableInputFormat.class);

  /** Job parameter that specifies the input table (same key as the stock TableInputFormat). */
  public static final String INPUT_TABLE = "hbase.mapreduce.inputtable";

  private Configuration conf = null;

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public void setConf(Configuration configuration) {
    this.conf = configuration;
    // Keep the scan setup minimal here: a full scan of the configured table.
    // The stock TableInputFormat additionally honours its SCAN_* options from the configuration.
    setScan(new Scan());
  }

  @Override
  protected void initialize(JobContext context) throws IOException {
    TableName tableName = TableName.valueOf(conf.get(INPUT_TABLE));
    // Load the job properties from the classpath; the resource name below is illustrative,
    // use whatever properties file the job actually packages in its jar.
    Properties props = new Properties();
    try {
      props.load(MyTableInputFormat.class.getResourceAsStream("/uidss.properties"));
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
    }
    // The Kerberos-relevant change: obtain an authenticated user first and create the
    // connection with it, so that scans issued from the executors pass authentication.
    User loginedUser = KerberorsJavaUtil.getAuthenticatedUser(conf, props, props.getProperty("keytabFile"));
    try {
      initializeTable(ConnectionFactory.createConnection(new Configuration(conf), loginedUser), tableName);
    } catch (Exception e) {
      LOG.error(StringUtils.stringifyException(e));
    }
  }
}