How to fetch data from HBase through Kerberos authentication

        I was recently working on a project that needed to pull data from HBase first and then compute metrics on it.

        However, the whole Hadoop ecosystem on the cluster has Kerberos authentication enabled, so this post records how the problems met during Kerberos authentication were solved.

        The code first has to obtain an HBase connection, and that connection must pass Kerberos authentication first (pay close attention to the kerberos() method). The connection is then used to scan the HBase tables, whose names are generated from the current time; filters on the rowkey and column are added so that only the relevant data is read out.

        Note: when testing the HBase connection in cluster mode, be sure to use UGI's doAs() method and create the connection inside it; this avoids GSS errors when the connection fails to authenticate (see the end of the code, and the minimal sketch right below).
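        As a minimal, stand-alone sketch of that pattern (it reuses the same placeholder principal "USER_xxx", keytab path and ZooKeeper quorum that appear in the full class below, so adjust them to your own environment):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.security.UserGroupInformation;

import java.io.IOException;
import java.security.PrivilegedAction;

public class KerberosHbaseSketch {

    public static Connection connect() throws IOException {
        final Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node01,node02,node03");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        conf.set("hadoop.security.authentication", "kerberos");
        conf.set("hbase.security.authentication", "kerberos");
        // krb5.conf copied down from the cluster
        System.setProperty("java.security.krb5.conf", "/etc/krb5.conf");

        UserGroupInformation.setConfiguration(conf);
        // Log in from the keytab and get a UGI back ...
        UserGroupInformation ugi = UserGroupInformation
                .loginUserFromKeytabAndReturnUGI("USER_xxx", "/app/keytab/USER_SYX.keytab");
        // ... then create the HBase connection inside doAs(), so the RPC layer
        // picks up the Kerberos credentials and no GSS error is thrown
        return ugi.doAs(new PrivilegedAction<Connection>() {
            @Override
            public Connection run() {
                try {
                    return ConnectionFactory.createConnection(conf);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        });
    }
}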

package com.bigdata.process;

/**
 * author: DahongZhou
 * Date:
 */


import com.bigdata.bean.HbaseBean;
import com.bigdata.utils.db.GenerateTableName;
import com.bigdata.utils.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.filter.*;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

import java.io.IOException;
import java.security.PrivilegedAction;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Executors;

/**
 * Scan the data in HBase and wrap it into a DataFrame
 */
public class HbaseProcessed {
    static Logger logger = Logger.getLogger(HbaseProcessed.class);
    private static Connection connection;
    static Configuration conf = null;
    static String tableName1 = "AMR:DWD_AMR_CQ_METER-E-DAY_201806";
    static String tableName_pre = "AMR:DWD_AMR_CQ_METER-E-DAY_";
    static String hbaseStart = "201701";
    static DataFrame dfs = null;
    static JavaSparkContext javaSparkContext = null;
    static SQLContext sqlContext = null;
    static DataFrame temp_df = null;


    /**
     * Union the data of all HBase tables into one overall DataFrame
     *
     * @param javaSparkContext
     * @param sqlContext
     * @return the DataFrame holding the HBase table data
     * @throws ParseException
     * @throws IOException
     */
    public static DataFrame getDfs(JavaSparkContext javaSparkContext, SQLContext sqlContext,
                                   String rowKey, String colunm) throws ParseException, IOException {
        // Create an empty DataFrame used to union the DataFrames of all HBase tables
        ArrayList<String> tempList = new ArrayList<>();
        JavaRDD<String> temp_rdd = javaSparkContext.parallelize(tempList, 1);
        temp_df = sqlContext.createDataFrame(temp_rdd, HbaseBean.class);

        // Assign the empty DataFrame (which carries the schema) to dfs, otherwise unionAll will fail
        dfs = temp_df;

        // Get all table names
        ArrayList<String> tables = GenerateTableName.getTables();

        // Scan each table and union every HBase table's DataFrame into one large DataFrame
        for (int i = 0; i < tables.size(); i++) {
            DataFrame df = generateDataFrame(javaSparkContext, sqlContext, tables.get(i), rowKey, colunm);
            dfs = dfs.unionAll(df);
        }

        return dfs;
    }


    /**
     *
     * @param javaSparkContext
     * @param sqlContext
     * @param tableName
     * @return
     * @throws IOException
     */
//    public static DataFrame getHbaseDataFrame(JavaSparkContext javaSparkContext, SQLContext sqlContext, String tableName) throws IOException {
//        return generateDataFrame(javaSparkContext, sqlContext, tableName);
//    }

    /**
     * Filter an HBase table with a Scan, put the results into an RDD and generate a DataFrame
     *
     * @param javaSparkContext
     * @param sqlContext
     * @param tableName
     * @return the DataFrame of the filtered HBase table data
     * @throws IOException
     */
    public static DataFrame generateDataFrame(JavaSparkContext javaSparkContext, SQLContext sqlContext,
                                              String tableName, String rowKey, String colunm) throws IOException {
        DataFrame df = null;
        // 1) Create an HBase connection
        // By default the hbase-site.xml configuration file is read

        connection = getConnection();
        //String tableName = "AMR:DWD_AMR_CQ_METER-E-DAY_201912";
        String split = ","; // separator used to join the HBase data read out: rowkey, column name (meter id) and value (energy reading)


        /**
         * The concrete rowkeys are not known, so the whole table is scanned. With a huge data volume
         * this is very expensive, so it is normally combined with a start/stop row range or filters.
         */
        logger.warn("----------- scanning the whole table -------------");
        Scan scan = new Scan();

        // Restrict the scan to a rowkey range built by concatenation
        String[] rowkeyDateArr = gennerateRowkeyDateArr(tableName);
        scan.setStartRow(Bytes.toBytes(rowKey + "." + rowkeyDateArr[0]));
        scan.setStopRow(Bytes.toBytes(rowKey + "." + rowkeyDateArr[1]));

        // Set a filter on the scan: only rows whose rowkey ends with "PAP_E" are returned
        Filter suffix_filter = new RowFilter(CompareFilter.CompareOp.EQUAL,
                new RegexStringComparator(".*PAP_E$"));


        // Keep rows whose rowkey contains the passed-in user number, i.e. its transformer-district code and power-supply-unit code
        ArrayList<Filter> listForFilters = new ArrayList<Filter>();

        listForFilters.add(suffix_filter);      // add the rowkey suffix filter


        Filter filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new SubstringComparator(rowKey));
        listForFilters.add(filter); // add the substring filter for the transformer-district code and power-supply-unit code

        SubstringComparator substringComparator = new SubstringComparator(colunm);
        Filter columnFilter = new QualifierFilter(CompareFilter.CompareOp.EQUAL, substringComparator);
        listForFilters.add(columnFilter);

        Filter filters = new FilterList(FilterList.Operator.MUST_PASS_ALL, listForFilters);
        Scan scans = scan.setFilter(filters);

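        // Scan tuning: setMaxResultSize caps the bytes returned per call, setCaching(100) fetches
        // 100 rows per RPC, and setBatch(5) limits each Result to at most 5 cells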
        scans.setMaxResultSize(50);
        scans.setCaching(100);
        scans.setBatch(5);
        scan.setCacheBlocks(false);// no read cache
        scan.addFamily(Bytes.toBytes("DATA"));


        logger.warn("开始获取表。。。");
        //获取表
        Table table = connection.getTable(TableName.valueOf(tableName));
        logger.warn("connection.toString() is :==>" + connection.toString());
        logger.warn("table.toString() is :==>" + table.toString());

        ResultScanner scanner = null;
        boolean tableExist = isTableExist(tableName, connection);
        if (tableExist) {  // only proceed if the table exists
            logger.warn(tableName + " isTableExist:  " + tableExist);
            scanner = table.getScanner(scans);
            logger.warn(" scanner 对象:  " + scanner);

            Result result = null;
            ArrayList<String> list = new ArrayList<>();
            while ((result = scanner.next()) != null) {
                logger.warn(" result 对象:  " + result);

                String rowkey = "";
                // String family = "";
                String column = "";
                String value = "";
                List<Cell> cells = result.listCells();  //获取到hbase表的Cells
                for (Cell cell : cells) {   //遍历Cells,并封装到List<String>集合中
                    // 列簇、列名、值、rowkey
                    // 打印rowkey,family,qualifier,value
                    rowkey = Bytes.toString(CellUtil.cloneRow(cell));
                    //family = Bytes.toString(CellUtil.cloneFamily(cell));
                    column = Bytes.toString(CellUtil.cloneQualifier(cell));
                    value = Bytes.toString(CellUtil.cloneValue(cell));
                    if (column.isEmpty() || column.equals("null")) {
                        column = "0";//空值设置为"0"
                    }
                    if (value.isEmpty() || value.equals("null")) {
                        value = "0";//空值设置为"0"
                    }
                    list.add(rowkey + split + column + split + value);

                    //System.out.println(rowkey + split + family + split + column + split + value);
                }
            }

            logger.warn("hbase表" + tableName + "数据:" + list);

            // Use the size of the list to set the number of RDD partitions (ideally the partition count equals the number of CPU cores in the cluster)
            int list_size = list.size();
            int partitions = 1;  // number of partitions, defaults to 1
            int per_partitions = 1000;  // number of records per partition
            if (list_size > per_partitions) {
                partitions = (int) (list_size / per_partitions) + 1;
            }
            // System.out.println(tableName + " table has " + list_size + " rows in total");
            // System.out.println("number of RDD partitions: " + partitions);
            // Build the RDD from the list
            JavaRDD<String> rdd = javaSparkContext.parallelize(list, partitions);
            // System.out.println("rdd count "+rdd.count());

            // Wrap the RDD into HbaseBean objects; mapPartitions is used here (processing per partition improves performance)
            JavaRDD<HbaseBean> rowRDD = rdd.mapPartitions(new FlatMapFunction<Iterator<String>, HbaseBean>() {
                @Override
                public Iterable<HbaseBean> call(Iterator<String> stringIterator) throws Exception {
                    ArrayList<HbaseBean> arrayList = new ArrayList<HbaseBean>();

                    while (stringIterator.hasNext()) {  // iterate over each line of the RDD, wrap it into an HbaseBean and add it to the ArrayList<HbaseBean>
                        String next = stringIterator.next();
                        String parts[] = next.split(",");
                        HbaseBean bean = new HbaseBean();
                        bean.setRowkey(parts[0]);
                        // bean.setFamily(parts[1]);
                        bean.setDlb_id(parts[1]);
                        bean.setDlb_dl(parts[2]);
                        arrayList.add(bean);
                    }
                    return arrayList;
                }
            });
            // System.out.println("rowRDD count "+rowRDD.count());

            // System.out.println(tableName+"---------out tableName---------");
//            if (!GenerateTableName.nowMonthTable(tableName)) {
//                // If the HBase table is from a previous month, checkpoint its RDD so it can be reused later
//                // System.out.println(tableName+"---------inner tableName---------");
//                // cache before checkpointing to improve performance, because checkpoint recomputes the RDD in a separate job before storing it to disk
//                rowRDD.cache();
//                rowRDD.checkpoint();
//                rowRDD.unpersist();
//            }

            // Create the DataFrame of the HBase table
            df = sqlContext.createDataFrame(rowRDD, HbaseBean.class);
            if (df == null) {
                df = temp_df;   // fall back to the empty DataFrame that carries the schema, otherwise unionAll would fail
            }
        } else df = temp_df;


        // df.cache();
        // connection.close();
        // df.show(100,false);
//        scanner.close();
//        table.close();
        return df;
    }


    /**
     * Check whether a table exists
     *
     * @param tableName
     * @return whether the table exists
     * @throws IOException
     */

    public static boolean isTableExist(String tableName, Connection connection) throws IOException {
        connection = getConnection();
        try (Admin admin = connection.getAdmin()) {
            return admin.tableExists(TableName.valueOf(tableName));
        }
    }

    /**
     * Kerberos authentication: the user is USER_SYX, the keytab is /app/keytab/USER_SYX.keytab on the server, and the krb5 config file is /etc/krb5.conf on the server
     *
     * @return
     */
    public static Configuration kerberos() throws IOException {
        Configuration HBASE_CONFIG = new Configuration();
        HBASE_CONFIG.set("hbase.zookeeper.quorum", "node01,node02,node03");
        HBASE_CONFIG.set("hbase.zookeeper.property.clientPort", "2181");
        HBASE_CONFIG.set("hbase.rpc.timeout", "1800000");
        HBASE_CONFIG.set("hbase.master.kerberos.principal", "hbase/_HOST@tsq.com");
        HBASE_CONFIG.set("hbase.regionserver.kerberos.principal", "hbase/_HOST@tsq.com");
        HBASE_CONFIG.set("hbase.security.authentication", "kerberos");
        HBASE_CONFIG.set("hadoop.security.authentication", "kerberos");
        HBASE_CONFIG.set("zookeeper.znode.parent", "/hbase");
        conf = HBaseConfiguration.create(HBASE_CONFIG);

        // Adding the hbase-site.xml config file from HDFS does not work, because reading it from HDFS would itself require passing Kerberos authentication first
//         hbase_site_file = "hdfs://node01:8020/user/hbase/hbase-site.xml";
//         krb5Path = "hdfs://node01:8020/user/hbase/krb5.conf";
//         keytabPath = "hdfs://node01:8020/user/hbase/USER_SYX.keytab";

//        String principal = "USER_xxx@tsq.com";
//        String principal = "hbase/USER_xxx@tsq.com";
        String principal = "USER_xxx";
//        String principal = "_HOST@tsq.com";


        // --files uploads the files to the .sparkStaging/applicationId directory on HDFS; System.getenv("SPARK_YARN_STAGING_DIR") returns that directory

        // krb5.conf here was copied down from the remote server
        System.setProperty("java.security.krb5.conf", "/etc/krb5.conf");

        // The keytab was also copied down from the remote server; it stores the credential information,
        // so there is no need to type a password interactively
//        conf.set("keytab.file", "/app/keytab/USER_SYX.keytab");
        // This can be understood as the user name, i.e. the principal
//        conf.set("kerberos.principal", "hbase/USER_SYX@tsq.com");
        UserGroupInformation.setConfiguration(conf);
        logger.warn("-------------kerberos conf添加完成");

        // The directory returned by SparkFiles.get is a local directory on the driver node
//        String xmlFile = SparkFiles.get("/app/test/config/hbase-site.xml");
//        String confFile = SparkFiles.get("/app/test/config/krb5.conf");
//        String keytabFile = SparkFiles.get("/app/test/config/USER_SYX.keytab");
//        logger.warn(xmlFile + "=====" + confFile + "=====" + keytabFile);


        try {
            UserGroupInformation.loginUserFromKeytab(principal, "/app/keytab/USER_SYX.keytab");
            logger.warn("-------------loginUserFromKeytab完成");

        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
        logger.warn("********** HBase Succeeded in authenticating through Kerberos! **********");

        return conf;
    }

    /**
     * Establish a connection to HBase
     *
     * @return
     * @throws IOException
     */
    public static Connection getConnection() throws IOException {
        if (connection == null) {
            // The cluster has Kerberos authentication enabled
            conf = kerberos();// if the cluster does not use Kerberos, simply comment this line out

            // A Kerberos-secured cluster must be given the already-authenticated conf
//            conf.set("hbase.zookeeper.quorum", "node01,node02,node03");
//            conf.set("hbase.zookeeper.property.clientPort", "2181");


            // When testing the HBase connection in cluster mode, the connection must be created inside UGI's doAs(); this prevents GSS errors when authentication fails
            UserGroupInformation ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI("USER_xxx", "/app/keytab/USER_SYX.keytab");

            connection = ugi.doAs(new PrivilegedAction<Connection>() {
                                      @Override
                                      public Connection run() {
                                          try {
                                              HbaseProcessed.connection = ConnectionFactory.createConnection(conf, Executors.newFixedThreadPool(4));
                                          } catch (IOException e) {
                                              logger.warn("PrivilegedAction!! ===>" + e);
                                          }
                                          return HbaseProcessed.connection;
                                      }
                                  }
            );

        }
        return connection;
    }


    /**
     * Build the scan's start rowkey and stop rowkey from the HBase table name
     *
     * @param tableName the HBase table name
     * @return an array of {start rowkey, stop rowkey}
     */
    private static String[] gennerateRowkeyDateArr(String tableName) {
        // AMR:DWD_AMR_CQ_METER-E-DAY_201911
        String[] tableNameArr = tableName.split("_");
        String year_month = tableNameArr[tableNameArr.length - 1];
        String year = year_month.substring(0, 4);
        String month = year_month.substring(4);

        System.out.println("year month is " + year + month);
        //NA.0410354369.01202019000000.PAP_E
        //the start rowkey uses the time format: month day year 000000
        //the stop rowkey uses the time format: month day year 999999
        String startRow = month + "00" + year + "000000" + ".PAP_E";
        String stopRow = month + "31" + year + "999999" + ".PAP_E";
        String[] rowkeyRangeArr = {startRow, stopRow};
        System.out.println("rowkey range is " + startRow + " -> " + stopRow);
        return rowkeyRangeArr;
    }

}
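
        For completeness, here is a rough driver sketch (not part of the original class) showing how getDfs could be called; the app name and the rowKey/column filter values are made-up placeholders:

package com.bigdata.process;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class HbaseProcessedDriver {

    public static void main(String[] args) throws Exception {
        // Spark 1.x style contexts, matching the DataFrame/SQLContext API used above
        SparkConf sparkConf = new SparkConf().setAppName("HbaseKerberosScan");
        JavaSparkContext jsc = new JavaSparkContext(sparkConf);
        SQLContext sqlContext = new SQLContext(jsc);

        // rowKey and column are placeholder filter values (a user id and a meter id)
        DataFrame dfs = HbaseProcessed.getDfs(jsc, sqlContext, "0410354369", "0000000001");
        dfs.show(20, false);

        jsc.stop();
    }
}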

