背景介绍
因厂里新换了大数据平台,现需进行数据迁移改造,并进行新旧平台的数据验证,现使用python进行hive连接,进行数据总数、分区、字段等指标的验证。
本文进行新旧hive方面的数据比较,如涉及其他数据库方面,修改连接方式即可;
本文仅从数据总量方面验证,分区、字段等指标请自行添加修改;
代码示例
#!/bin/env python
# -*- coding: utf-8 -*-
#########################################################
# 在新hadoop平台上执行
# hive_new:新hadoop平台
# hive_old:旧hadoop平台
#
# read_file:存储在新平台hdfs中的txt文件,用于读取比对表与相关指标
# 文件表头为 “业务,项目,比对表,总数,分区,字段”
# 使用逗号分隔
#
# insert_sql:将结果数据插入新平台结果表
#
#########################################################
import sys
sys.path.append('/data/pip')
from pyhive import hive
from datetime import datetime
import subprocess
def now_time():
# 获取当前时间并格式化
format_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
return format_time
# 读取文件
class read_file:
def getValueList(file_path):
line_list = []
cat = subprocess.Popen(["hdfs", "dfs", "-text", file_path], stdout=subprocess.PIPE, encoding='utf-8')
for line in cat.stdout:
if line != "":
line_list.append(line)
print( now_time() + f" ==== {file_path}内容为:" + str(line_list))
return line_list
# Hive连接
class HiveConn:
def __init__(self, host, port, username, database, auth):
print( now_time() + " ==== " + f"创建Hive连接!--- host:port --- {host}:{port}")
self.conn = hive.Connection(host=host,
port=port, # HiveServer2的默认端口
username=username,
database=database,
auth=auth,
# kerberos_service_name='hive'
) # 如果使用Kerberos,则可能是'KERBEROS'
def select_sql(self, sql):
cur = self.conn.cursor()
cur.execute(sql)
return cur.fetchall()
def insert_sql(self, sql):
cur = self.conn.cursor()
cur.execute(sql)
def close(self):
print( now_time() + " ==== " + "关闭Hive连接.")
self.conn.close()
def run():
try:
line_list = read_file.getValueList("hdfs://xxx.txt")
# 获取 比对表名
table_list = []
line_n = 0
for line_str in line_list:
line_n += 1
if line_n != 1:
line_v = line_str.split(",")
c_n = 0
for col_v in line_v:
c_n += 1
if c_n == 3:
table_list.append(col_v)
hive_new = HiveConn(host="xxx", port=xxx, username="xxx", database="xxx", auth="NONE")
hive_old = HiveConn(host="xxx", port=xxx, username="xxx", database="xxx", auth="NONE")
json_list = []
for table_name in table_list:
# sql = 'show databases'
sql = f"select count(*) from {table_name}"
print(now_time() + " ==== " + "执行 sql..." + sql)
result_new = hive_new.select_sql(sql)[0][0]
print(now_time() + " ==== " + "hive_new 执行结果...." + str(result_new))
result_old = hive_old.select_sql(sql)[0][0]
print( now_time() + " ==== " + "hive_old 执行结果...." + str(result_old))
result_json = {
"table": table_name,
"new": result_new,
"old": result_old
}
json_list.append(result_json)
insert_sql = "insert overwrite table test.check_data_result values "
for cnt in range(1, len(json_list) + 1):
if cnt != len(json_list):
insert_sql += f"('{json_list[cnt-1]['table']}' , '{json_list[cnt-1]['new']}', '{json_list[cnt-1]['old']}' , '{now_time()}'),"
else:
insert_sql += f"('{json_list[cnt-1]['table']}' , '{json_list[cnt-1]['new']}', '{json_list[cnt-1]['old']}' , '{now_time()}')"
print(now_time() + " ==== " + insert_sql)
hive_new.insert_sql(insert_sql)
print(now_time() + " ==== " + "hive_new insert执行完成....")
hive_new.close()
hive_old.close()
print( now_time() + " ==== " + "over....")
except Exception as e:
print(" Exception.... " + e)
sys.exit(1)
if __name__ == '__main__':
run()