Reading and writing HBase with PySpark


When reading from HBase, the values come back as raw byte strings (shown in hex), and none of the approaches I tried could turn them back into Chinese text when converting to a DataFrame. I went through many decode/encode variants without finding one that works; if you read this article and have solved this problem, please let me know, thanks.

To read from HBase, symlink the commonly used jars from HBase's lib directory into Spark's jars directory.

If Spark cannot read Hive tables (spark.sql failures), link the hive-site.xml file into Spark's conf directory.

spark-examples_2.11-1.6.0-typesafe-001.jar download: click here

#Author:Dengwenxing
# -*- coding: utf-8 -*-
# @Time     :2019/12/30 15:09
# @Site     : 
# @File     : hbaseReader.py
# @Software :

import sys, os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as fun
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time, copy, re, math
from datetime import date
from datetime import datetime, timedelta
import json
import logging
from random import randint

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

# Python 2 only: force the default string encoding to utf-8
reload(sys)
sys.setdefaultencoding('utf-8')

warehouse_location = '/user/hive/warehouse/'
conf = SparkConf().set('spark.driver.maxResultSize', '10g')
conf.set('spark.yarn.executor.memoryOverhead', '30g')
conf.set('spark.yarn.am.cores', 5)
conf.set('spark.executor.memory', '40g')
conf.set('spark.executor.instances', 50)
conf.set('spark.executor.cores', 8)
conf.set('spark.executor.extraJavaOptions', '-XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UseG1GC')
conf.set("spark.sql.warehouse.dir", warehouse_location)

spark = SparkSession \
    .builder \
    .config(conf=conf) \
    .enableHiveSupport() \
    .getOrCreate()
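
# Note: enableHiveSupport() needs hive-site.xml to be visible to Spark
# (see the setup note above about linking it into Spark's conf directory),
# otherwise Hive tables cannot be read through spark.sql.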

def setHbaseConf(ips=None, tableName=None, Znode="/hbase-unsecure", using="input", rowStart=None, rowEnd=None):
    '''
    :param ips: [ip1, ip2, ...] ZooKeeper quorum hosts
    :param tableName: HBase table name
    :param Znode: HBase znode parent in ZooKeeper
    :param using: "input" builds a read conf, anything else builds a write conf
    :param rowStart: scan start rowkey (read only)
    :param rowEnd: scan stop rowkey (read only)
    :return: hbaseConf dict
    '''
    if not ips:
        print("ips is null")
        sys.exit(1)
    if not tableName:
        print("tablename is null")
        sys.exit(1)
    ips = ','.join(ips)
    if using == "input":
        hbaseConf = {
            "hbase.zookeeper.quorum": ips,
            "hbase.mapreduce.inputtable": tableName,
            "zookeeper.znode.parent": Znode
        }
        # optional rowkey range for the scan (TableInputFormat scan keys)
        if rowStart is not None and rowEnd is not None:
            hbaseConf["hbase.mapreduce.scan.row.start"] = rowStart
            hbaseConf["hbase.mapreduce.scan.row.stop"] = rowEnd
        return hbaseConf
    else:
        hbaseConf = {
            "hbase.zookeeper.quorum": ips,
            "hbase.mapred.outputtable": tableName,
            "zookeeper.znode.parent": Znode,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"
        }
        return hbaseConf
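
# Usage sketch (hypothetical quorum IPs and rowkey bounds, for illustration only):
#   read_conf  = setHbaseConf(ips=["10.0.0.1", "10.0.0.2"],
#                             tableName="default:vertex_person",
#                             rowStart="1", rowEnd="5")
#   write_conf = setHbaseConf(ips=["10.0.0.1", "10.0.0.2"],
#                             tableName="default:vertex_person", using="output")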

def hbaseRDD2DF():
    # placeholder, not implemented
    pass


def result2df(rdd):
    ''' Convert the (rowkey, value-string) RDD produced by the HBase converters into a DataFrame. '''
    def result2Dict(cells):
        # each cell is a JSON string like {"qualifier": ..., "value": ..., ...}
        result = {}
        rows = [json.loads(i) for i in cells]
        for row in rows:
            result[row["qualifier"]] = row["value"]
        return result

    def maketuple(record):
        # record is (rowkey, {qualifier: value, ...}); sort the qualifiers so the
        # value order matches the column order built below
        rowkey = [record[0]]
        values = [record[1][key] for key in sorted(record[1])]
        return tuple(rowkey + values)

    res1 = rdd.map(lambda kv: (kv[0], kv[1].split('\n'))) \
              .map(lambda kv: (kv[0], result2Dict(kv[1])))
    # assume every row carries the same qualifiers as the first one
    columns = ["rowkey"] + sorted(res1.map(lambda x: list(x[1].keys())).take(1)[0])
    res = res1.map(maketuple).toDF(columns)
    return res
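
# For reference: HBaseResultToStringConverter (from the spark-examples jar) returns
# each row's cells as newline-separated JSON objects, roughly of the form
#   {"row": "...", "columnFamily": "info", "qualifier": "xm", "value": "...", ...}
# result2df only relies on the "qualifier" and "value" fields, producing one
# DataFrame column per qualifier plus the leading "rowkey" column.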




def hbaseSimpleReader():
    ips = ["***", "***"]   # ZooKeeper quorum IPs (placeholders)
    hbaseConf = setHbaseConf(ips=ips, tableName="default:vertex_person")
    # converters from the spark-examples jar that turn the HBase key/Result
    # objects into plain strings Python can handle
    keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"

    hbase_rdd = spark.sparkContext.newAPIHadoopRDD(
        "org.apache.hadoop.hbase.mapreduce.TableInputFormat",
        "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "org.apache.hadoop.hbase.client.Result",
        keyConverter=keyConv,
        valueConverter=valueConv,
        conf=hbaseConf
    )

    return result2df(hbase_rdd)
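
# Example use (illustrative):
#   person_df = hbaseSimpleReader()
#   person_df.printSchema()
#   person_df.show(5, truncate=False)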

def hbaseSimpleWriter(rdd):
    ips = ["***", "***"]   # ZooKeeper quorum IPs (placeholders)
    hbaseConf = setHbaseConf(ips=ips, tableName="vertex_person", using="output")
    keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
    valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
    rdd.saveAsNewAPIHadoopDataset(conf=hbaseConf, keyConverter=keyConv, valueConverter=valueConv)
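
# hbaseSimpleWriter expects each RDD element shaped for StringListToPutConverter:
#   (rowkey, [rowkey, column_family, qualifier, value])   -- all values as strings
# e.g. a single hypothetical record:
#   ("3xxxx", ["3xxxx", "info", "xm", "zhangsan"])
# run() below builds such an RDD from an ORC DataFrame via flatMap.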

def run():
    df = spark.read.orc("path")
    def formatHbaseOutput(row):
        ''' Each element written to HBase must be (rowkey, [rowkey, col_family, column, value]) '''
        cf = 'info'
        cols = ["zjhm", "xm", "age"]
        # salt the rowkey with a random digit prefix to spread writes across regions
        rowkey = str(randint(1, 9)) + row.zjhm
        # StringListToPutConverter expects every field as a string
        zjhm = (rowkey, [rowkey, cf, cols[0], row.zjhm])
        xm = (rowkey, [rowkey, cf, cols[1], row.xm])
        age = (rowkey, [rowkey, cf, cols[2], str(row.age)])
        return [zjhm, xm, age]
    rdd = df.rdd.flatMap(formatHbaseOutput)
    hbaseSimpleWriter(rdd)




if __name__ == '__main__':
    logger.info( '================start time:%s' % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    hbaseSimpleReader()
    run()

    logger.info('=================end time:%s' % (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    #spark-submit --py-files spark-examples_2.11-1.6.0-typesafe-001.jar --master yarn --deploy-mode  cluster --name hbaseReader  --driver-memory 40G --queue bbd_01 hbaseReader.py

 
