python hive集群链接方式

最新推荐文章于 2023-08-25 08:49:28 发布

愚昧之山绝望之谷开悟之坡

最新推荐文章于 2023-08-25 08:49:28 发布

阅读量988

点赞数 1

分类专栏：工具 python hive 文章标签： hive python big data

本文链接：https://blog.csdn.net/qq_15821487/article/details/120153612

版权

python 同时被 3 个专栏收录

344 篇文章 10 订阅

订阅专栏

工具

291 篇文章 2 订阅

订阅专栏

hive

8 篇文章 0 订阅

订阅专栏

# 环境pip install  pyhive thrift kazoo sasl thrift_sasl  (yum -y install cyrus-sasl cyrus-sasl-devel cyrus-sasl-lib)
# 1.普通地址 jdbc:hive2://****:10000/default
# 2.基于zk的高可用地址 jdbc:hive2://****:2181,****:2181,*****:2181/default;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2

# 例子1:

import random
import re
from kazoo.client import KazooClient
import pandas as pd
from sqlalchemy import create_engine


def connect(jdbc_url, username, password, database_name):
    """
    # 使用jdbc地址连接hive
    """
    hive_server_urls = []
    database = "default"
    if jdbc_url.__contains__("zooKeeperNamespace"):
        namespace = "hiveserver2"
        pattern = re.compile(r'/([a-zA-Z0-9_-]+);')
        match = pattern.match(jdbc_url)
        if match is not None:
            database = match.group(1)
        zk_hosts = jdbc_url.replace("jdbc:hive2://", "", 1)
        index1 = zk_hosts.index("/")
        index2 = zk_hosts.index(";")
        if index1 != -1 and index1 < index2:
            zk_hosts = zk_hosts[0:index1]
        else:
            zk_hosts = zk_hosts[0: index2]
        splits = zk_hosts.split(";")
        for param in splits:
            if param.startswith("zooKeeperNamespace"):
                namespace = param.split("=")[1]
                break
        hive_server_urls = get_hive_server2_urls(zk_hosts, namespace)
    else:
        pattern = re.compile(r'jdbc:hive2://(.+)?/([a-zA-Z0-9_-]+)')
        match = pattern.match(jdbc_url)
        if match is not None:
            hive_server_urls.append(match.group(1))
            database = match.group(2)
        else:
            pattern = re.compile(r'jdbc:hive2://(.+)?/*')
            match = pattern.match(jdbc_url)
            if match is not None:
                hive_server_urls.append(match.group(1))
    if hive_server_urls.__len__() == 0:
        raise Exception("jdbc_url 格式错误")
    hive_count = hive_server_urls.__len__()
    random.seed()
    while hive_count > 0:
        hive_count -= 1
        index = random.randint(0, hive_count)
        host_str = hive_server_urls.pop(index).split(":")
        try:
            source_db_info = {'user': username,
                              'host': host_str[0],
                              'port': host_str[1],
                              'db': database_name
                              }
            con_source = create_engine(
                'hive://%(user)s@%(host)s:%(port)s/%(db)s' %
                source_db_info, encoding='utf-8')
            return con_source
        except Exception as e:
            if hive_count > 1:
                print("ERROR : Can not connect " + host_str[0] + ":" + host_str[1] + " .try another server...\n")
            else:
                print("ERROR : Can not connect hive_server2, please check the connection config or the hive_server")
                raise e


def get_hive_server2_urls(zk_hosts, namespace):
    """
    # 从zk获取 hive_server2地址 ***:2181,***:2181/default
    """
    zk_client = KazooClient(hosts=zk_hosts)
    zk_client.start()
    result = zk_client.get_children(namespace)
    zk_client.stop()
    length = result.__len__()
    host_list = list()
    for i in range(0, length):
        host_list.append(result[i].split("=")[1].split(";")[0])
    return host_list


connect = connect(
    "jdbc:hive2://***:2181,"
    "***:2181/default;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2",
    'dolphinscheduler', "asasassghgha", '**')
sql = "select description from %s where is_deleted = 0" % "***"
result1 = pd.read_sql_query(sql, connect)
print(result1)

第一步正在表达式抽取zk

第二部通过zk获取单点链接地址

第三步用循环的方式挨个链接单点地址，正确则返回，错误则循环知道访问完所有单点

愚昧之山绝望之谷开悟之坡

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
python hive集群链接方式

# 环境pip install pyhive thrift kazoo sasl thrift_sasl (yum -y install cyrus-sasl cyrus-sasl-devel cyrus-sasl-lib)# 1.普通地址 jdbc:hive2://****:10000/default# 2.基于zk的高可用地址 jdbc:hive2://****:2181,****:2181,*****:2181/default;serviceDiscoveryMode=zooKeeper;zo
复制链接

扫一扫