# ---- Method 1: direct connection to HiveServer2 via PyHive ----
# Connect to HiveServer2 directly with PyHive and run a few smoke-test queries.
from pyhive import hive
import thrift        # imported for side effects: pyhive's SASL transport needs them
import sasl
import thrift_sasl

print("hello dog")

# auth='NONE' -> unauthenticated HiveServer2 (dev/test setup).
conn = hive.Connection(host='master', port=10000, database='hive_dlp_data_test', auth='NONE')
cursor = conn.cursor()

# List the tables in the connected database.
cursor.execute('show tables')
print(cursor.fetchall())
print("hello AAA")

# Probe one hourly partition of the sink table.
cursor.execute("select * from hive_dlp_data_test.sink_dlp_hive where ts_date='20211021' and ts_hour='19' limit 6")
print(cursor.fetchall())
print("BBBB")

cursor.execute("SELECT * FROM sink_dlp_hive WHERE ts_date = '20211021' and ts_hour = '19' LIMIT 2")
print(cursor.fetchall())
print("CCC")

cursor.execute("SELECT * FROM sink_dlp_hive WHERE ts_date = '20211021' and ts_hour = '19' LIMIT 2")
print(cursor.fetchall())

print("DDD test count :")
cursor.execute("SELECT * FROM sink_dlp_hive WHERE ts_date = '20211021' and ts_hour = '19' LIMIT 2")
# BUG FIX: the original executed this query but never fetched, so nothing was
# printed under the "DDD test count" label.
print(cursor.fetchall())

# BUG FIX: close the cursor and connection so the HiveServer2 session is
# released (the original leaked both).
cursor.close()
conn.close()
# ---- Method 2: direct connection via PyHive, reading into pandas ----
import pandas as pd
from pyhive import hive
import thrift
import sasl
import thrift_sasl
# Connection setup shared by fetch_data() and the driver loop below.
database_name= "hive_dlp_data_test"
table_name = "sink_dlp_hive"
# auth='NONE' -> unauthenticated HiveServer2 (dev/test setup).
conn = hive.Connection(host='master', port=10000, database=database_name, auth='NONE')
cursor = conn.cursor()
# Sanity checks: list tables and databases before querying data.
cursor.execute('show tables')
print("tables:",cursor.fetchall())
cursor.execute('show databases')
print("databases:",cursor.fetchall())
# Count rows in one hourly partition.
# NOTE(review): partition values are unquoted (numeric literals) here, while
# the Method 1 queries quote them as strings -- confirm the ts_date/ts_hour
# column types; Hive will coerce, but one style should be picked.
cursor.execute('SELECT COUNT(*) FROM hive_dlp_data_test.sink_dlp_hive WHERE TS_DATE = 20211021 AND TS_HOUR = 18')
print("count test:",cursor.fetchall())
def fetch_data(database_name, table_name, ts_date, ts_hour, limit=None):
    """Read one (ts_date, ts_hour) partition of *table_name* into a DataFrame.

    Uses the module-level ``conn``/``cursor`` (already bound to a database;
    *database_name* is only echoed for logging).

    Parameters
    ----------
    database_name : str   -- logged for traceability only.
    table_name : str      -- Hive table to read.
    ts_date : str         -- date partition value, e.g. '20211021'.
    ts_hour : str         -- hour partition value, e.g. '19'.
    limit : int or None   -- optional row LIMIT; None means no limit.

    Returns
    -------
    pandas.DataFrame with the table's real column names, or ``None`` when the
    query fails (the error is printed, not raised).
    """
    print("databases:", database_name)
    try:
        # DESCRIBE EXTENDED yields (col_name, type, comment) rows; keep the
        # names so we can replace read_sql_query's fully-qualified headers.
        cursor.execute('describe extended ' + table_name)
        colnames = [row[0] for row in cursor.fetchall()]

        lmt_cmd = ' LIMIT ' + str(limit) if limit else ''
        # Quote the partition values to match the working queries in Method 1,
        # which compare ts_date/ts_hour against quoted strings.
        select_query = (
            'SELECT * FROM ' + table_name
            + " WHERE ts_date = '" + str(ts_date) + "'"
            + " and ts_hour = '" + str(ts_hour) + "'"
            + lmt_cmd
        )
        print("select_query:", select_query)

        df = pd.read_sql_query(select_query, conn)
        print(type(df))
        # Truncate in case DESCRIBE returned extra (detail) rows beyond the
        # actual column count.
        df.columns = colnames[:df.shape[1]]
        return df
    except Exception as e:
        print("ts_date, ts_hour:", ts_date, ts_hour)
        print("except ERROR:", e)
        # BUG FIX: return None explicitly so callers can test for failure
        # instead of tripping over an implicit None (the original crashed the
        # driver loop at df.shape).
        return None
# Driver: pull every hourly partition for 2021-10-20 .. 2021-10-21.
for day in range(20, 22):
    for hour in range(0, 24):
        # Hive hour partitions are zero-padded two-digit strings.
        hour_str = '%02d' % hour
        print("date,hour:", day, hour)
        df = fetch_data(database_name, table_name, '202110' + str(day), hour_str)
        # BUG FIX: fetch_data returns None on failure; the original crashed
        # with AttributeError on df.shape in that case.
        if df is None:
            print("result:", day, hour, "fetch failed")
        else:
            print("result:", day, hour, df.shape)

if conn:
    conn.close()
# ---- Method 3: access Hive through PyFlink's HiveCatalog ----
from pyflink.table import *
from pyflink.table.catalog import HiveCatalog
# Build a batch TableEnvironment (blink planner) and register a HiveCatalog.
env_settings = EnvironmentSettings.new_instance().in_batch_mode().use_blink_planner().build()
tableEnv = BatchTableEnvironment.create(environment_settings=env_settings)
print("AAA")

catalog_name = "myhive"
default_database = "hive_tmp"
hive_conf_dir = "/run/software/hive/conf"  # local path containing hive-site.xml
print("BBB")

hive_catalog = HiveCatalog(catalog_name, default_database, hive_conf_dir)
tableEnv.register_catalog(catalog_name, hive_catalog)
print("CCC")

# Set the HiveCatalog as the current catalog of the session.
tableEnv.use_catalog(catalog_name)
print("DDD")

# Switch to the Hive SQL dialect so Hive-specific statements
# (SHOW PARTITIONS, MSCK REPAIR TABLE) parse correctly.
tableEnv.get_config().set_sql_dialect(SqlDialect.HIVE)
print(tableEnv.list_databases())
# BUG FIX: TableResult.print() prints the rows itself and returns None; the
# original wrapped it in print(...) and emitted a spurious "None" line.
tableEnv.execute_sql("SHOW TABLES").print()
print("EEE")

print("FFF")
tableEnv.execute_sql("show partitions sink_parent_info").print()
# If partitions written outside of Hive are missing, repair metadata first:
#   MSCK REPAIR TABLE hive_tmp.sink_parent_info
print("FFF")
print("GGG")
tableEnv.execute_sql("show partitions hive_tmp.sink_parent_info").print()