python hive集群链接方式

291 篇文章 2 订阅
8 篇文章 0 订阅
# 环境pip install  pyhive thrift kazoo sasl thrift_sasl  (yum -y install cyrus-sasl cyrus-sasl-devel cyrus-sasl-lib)
# 1.普通地址 jdbc:hive2://****:10000/default
# 2.基于zk的高可用地址 jdbc:hive2://****:2181,****:2181,*****:2181/default;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2

# 例子1:

import random
import re
from kazoo.client import KazooClient
import pandas as pd
from sqlalchemy import create_engine


def connect(jdbc_url, username, password, database_name):
    """
    # 使用jdbc地址连接hive
    """
    hive_server_urls = []
    database = "default"
    if jdbc_url.__contains__("zooKeeperNamespace"):
        namespace = "hiveserver2"
        pattern = re.compile(r'/([a-zA-Z0-9_-]+);')
        match = pattern.match(jdbc_url)
        if match is not None:
            database = match.group(1)
        zk_hosts = jdbc_url.replace("jdbc:hive2://", "", 1)
        index1 = zk_hosts.index("/")
        index2 = zk_hosts.index(";")
        if index1 != -1 and index1 < index2:
            zk_hosts = zk_hosts[0:index1]
        else:
            zk_hosts = zk_hosts[0: index2]
        splits = zk_hosts.split(";")
        for param in splits:
            if param.startswith("zooKeeperNamespace"):
                namespace = param.split("=")[1]
                break
        hive_server_urls = get_hive_server2_urls(zk_hosts, namespace)
    else:
        pattern = re.compile(r'jdbc:hive2://(.+)?/([a-zA-Z0-9_-]+)')
        match = pattern.match(jdbc_url)
        if match is not None:
            hive_server_urls.append(match.group(1))
            database = match.group(2)
        else:
            pattern = re.compile(r'jdbc:hive2://(.+)?/*')
            match = pattern.match(jdbc_url)
            if match is not None:
                hive_server_urls.append(match.group(1))
    if hive_server_urls.__len__() == 0:
        raise Exception("jdbc_url 格式错误")
    hive_count = hive_server_urls.__len__()
    random.seed()
    while hive_count > 0:
        hive_count -= 1
        index = random.randint(0, hive_count)
        host_str = hive_server_urls.pop(index).split(":")
        try:
            source_db_info = {'user': username,
                              'host': host_str[0],
                              'port': host_str[1],
                              'db': database_name
                              }
            con_source = create_engine(
                'hive://%(user)s@%(host)s:%(port)s/%(db)s' %
                source_db_info, encoding='utf-8')
            return con_source
        except Exception as e:
            if hive_count > 1:
                print("ERROR : Can not connect " + host_str[0] + ":" + host_str[1] + " .try another server...\n")
            else:
                print("ERROR : Can not connect hive_server2, please check the connection config or the hive_server")
                raise e


def get_hive_server2_urls(zk_hosts, namespace):
    """
    # 从zk获取 hive_server2地址 ***:2181,***:2181/default
    """
    zk_client = KazooClient(hosts=zk_hosts)
    zk_client.start()
    result = zk_client.get_children(namespace)
    zk_client.stop()
    length = result.__len__()
    host_list = list()
    for i in range(0, length):
        host_list.append(result[i].split("=")[1].split(";")[0])
    return host_list


connect = connect(
    "jdbc:hive2://***:2181,"
    "***:2181/default;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2",
    'dolphinscheduler', "asasassghgha", '**')
sql = "select description from %s where is_deleted = 0" % "***"
result1 = pd.read_sql_query(sql, connect)
print(result1)

第一步 正在表达式抽取zk

第二部 通过zk获取单点链接地址

第三步 用循环的方式挨个链接单点地址,正确则返回,错误则循环知道访问完所有单点

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值