# 环境pip install pyhive thrift kazoo sasl thrift_sasl (yum -y install cyrus-sasl cyrus-sasl-devel cyrus-sasl-lib)
# 1.普通地址 jdbc:hive2://****:10000/default
# 2.基于zk的高可用地址 jdbc:hive2://****:2181,****:2181,*****:2181/default;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2
# 例子1:
import random
import re
from kazoo.client import KazooClient
import pandas as pd
from sqlalchemy import create_engine
def connect(jdbc_url, username, password, database_name):
"""
# 使用jdbc地址连接hive
"""
hive_server_urls = []
database = "default"
if jdbc_url.__contains__("zooKeeperNamespace"):
namespace = "hiveserver2"
pattern = re.compile(r'/([a-zA-Z0-9_-]+);')
match = pattern.match(jdbc_url)
if match is not None:
database = match.group(1)
zk_hosts = jdbc_url.replace("jdbc:hive2://", "", 1)
index1 = zk_hosts.index("/")
index2 = zk_hosts.index(";")
if index1 != -1 and index1 < index2:
zk_hosts = zk_hosts[0:index1]
else:
zk_hosts = zk_hosts[0: index2]
splits = zk_hosts.split(";")
for param in splits:
if param.startswith("zooKeeperNamespace"):
namespace = param.split("=")[1]
break
hive_server_urls = get_hive_server2_urls(zk_hosts, namespace)
else:
pattern = re.compile(r'jdbc:hive2://(.+)?/([a-zA-Z0-9_-]+)')
match = pattern.match(jdbc_url)
if match is not None:
hive_server_urls.append(match.group(1))
database = match.group(2)
else:
pattern = re.compile(r'jdbc:hive2://(.+)?/*')
match = pattern.match(jdbc_url)
if match is not None:
hive_server_urls.append(match.group(1))
if hive_server_urls.__len__() == 0:
raise Exception("jdbc_url 格式错误")
hive_count = hive_server_urls.__len__()
random.seed()
while hive_count > 0:
hive_count -= 1
index = random.randint(0, hive_count)
host_str = hive_server_urls.pop(index).split(":")
try:
source_db_info = {'user': username,
'host': host_str[0],
'port': host_str[1],
'db': database_name
}
con_source = create_engine(
'hive://%(user)s@%(host)s:%(port)s/%(db)s' %
source_db_info, encoding='utf-8')
return con_source
except Exception as e:
if hive_count > 1:
print("ERROR : Can not connect " + host_str[0] + ":" + host_str[1] + " .try another server...\n")
else:
print("ERROR : Can not connect hive_server2, please check the connection config or the hive_server")
raise e
def get_hive_server2_urls(zk_hosts, namespace):
"""
# 从zk获取 hive_server2地址 ***:2181,***:2181/default
"""
zk_client = KazooClient(hosts=zk_hosts)
zk_client.start()
result = zk_client.get_children(namespace)
zk_client.stop()
length = result.__len__()
host_list = list()
for i in range(0, length):
host_list.append(result[i].split("=")[1].split(";")[0])
return host_list
connect = connect(
"jdbc:hive2://***:2181,"
"***:2181/default;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2",
'dolphinscheduler', "asasassghgha", '**')
sql = "select description from %s where is_deleted = 0" % "***"
result1 = pd.read_sql_query(sql, connect)
print(result1)
python hive集群链接方式
最新推荐文章于 2023-08-25 08:49:28 发布