Python script: get_hive_count.py
def get_total_everyDay(from_table, hive_from):
    """Count rows per partition date for a Hive table and write the result as Parquet.

    Runs a group-by-count over ``part_date`` on the given Hive table and
    writes the (part_date, count) result to HDFS as a single Parquet file,
    so two clusters' counts can later be compared file-to-file.

    Args:
        from_table: Fully qualified Hive table name, e.g. ``"db.table"``.
            Assumes exactly one dot separates database and table — the
            table part (``split(".")[1]``) is used in the output file name.
        hive_from: Label for the source cluster/environment (e.g. ``"sh"``
            or ``"bj"``); used in the Spark app name and the output path.

    Side effects:
        Creates (or reuses) a SparkSession on YARN and overwrites
        ``/user/abc/count_compare/<hive_from>_count_<table>.parquet``.
    """
    # getOrCreate() reuses an existing session if one is already active.
    spark = (SparkSession.builder
             .master("yarn")
             .appName("get %s hive count" % hive_from)
             .enableHiveSupport()
             .getOrCreate())
    # Per-partition-date row counts, ordered so outputs are directly diffable.
    sql = "select part_date, count(*) as count from %s group by part_date order by part_date" % from_table
    # repartition(1): emit a single Parquet part file to simplify comparison.
    spark.sql(sql).repartition(1).write.parquet(
        "/user/abc/count_compare/%s_count_%s.parquet" % (
            hive_from, from_table.split(".")[1]),
        mode="overwrite")
    print("get %s finish-table hive count OK!" % from_table)
A second script, equal_compare.py, reads a Parquet result file for comparison:
#对结果文件进行比较,有part_date字段的表 def check_result_for_tableHavePartDate(sh_hive_count_path, bj_hive_count_path, from_table): print("%s******************start of comparison!******************" % from_table) spark = SparkSession.builder.