# The first argument is the name of the database.
# The second argument is the name of the table.
#!/bin/python
#[START] DESCRIPTION
#@param the first argument is the name of the database
#@param the second argument is the name of the table
#[END] DESCRIPTION
#[START]conf
# Path of the Hive executable; run_hive_cmd launches it with the "-e" option.
hive_bin = "/home/work/lib/hive_udw2/bin/hive"
#[END]conf
import sys
import subprocess
# Two positional arguments are required: the database name and the table name.
arglen = len(sys.argv)
if arglen < 3:
    # Report on stderr and leave through sys.exit(); the bare exit() helper is
    # installed by the site module for interactive use and is not guaranteed
    # to exist (e.g. under "python -S").
    sys.stderr.write("too less arguments\n")
    sys.exit(1)
db = sys.argv[1]
table = sys.argv[2]
def run_hive_cmd(hive_cmd):
    """Run a HiveQL string with ``hive_bin -e`` and return its stdout lines.

    The statement is handed to the executable as one argument, without a
    shell in between, so quotes and other shell metacharacters inside
    ``hive_cmd`` are passed through unchanged.

    If the process exits with a non-zero status, the status and the captured
    stderr text are written to this script's stderr; whatever was printed on
    stdout is still returned.

    @param hive_cmd HiveQL text to execute
    @return list of stdout lines, each without its trailing newline
    """
    # Shell-style rendering, kept only for the log line below.
    cmd = "%s -e '%s'" % (hive_bin, hive_cmd)
    print("run sehll command : %s" % (cmd))
    res = subprocess.Popen(
        [hive_bin, "-e", hive_cmd],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        close_fds=True,
        # Text-mode pipes: output is str on both Python 2 and Python 3.
        universal_newlines=True
    )
    # communicate() drains stdout and stderr together and waits for the
    # process to finish. Reading only stdout can block forever once the
    # child has filled the unread stderr pipe.
    stdout_text, stderr_text = res.communicate()
    if res.returncode != 0:
        sys.stderr.write(
            "hive exited with status %s\n%s" % (res.returncode, stderr_text)
        )

    # Rebuild the newline-terminated lines that readlines() would have
    # produced, so the debug dump below keeps its previous shape.
    pieces = stdout_text.split("\n")
    unterminated_tail = pieces.pop()
    out = [piece + "\n" for piece in pieces]
    if unterminated_tail:
        out.append(unterminated_tail)
    print("output of shell command is ")
    print(out)
    return [line.rstrip("\n") for line in out]
def get_fields(rows):
    """Extract the column names from the lines printed by ``DESC <table>``.

    The name is taken as the first whitespace-delimited token of each line,
    so space-padded and tab-separated layouts are both accepted. Blank lines
    and lines whose first token starts with "#" are skipped, and a name that
    appears more than once is reported only the first time.
    NOTE(review): the skip/dedup rules target the "# Partition Information"
    section Hive appends for partitioned tables -- confirm against the Hive
    version in use.

    @param rows iterable of DESC output lines
    @return list of column names in first-seen order
    """
    ans = []
    seen = set()
    for row in rows:
        tokens = row.split()
        if not tokens or tokens[0].startswith("#"):
            continue
        name = tokens[0]
        if name not in seen:
            seen.add(name)
            ans.append(name)
    return ans
def get_group(db, table, field):
    """Print how many rows of ``table`` hold each distinct value of ``field``.

    Runs ``SELECT field, count(1) ... GROUP BY field`` through run_hive_cmd
    and prints one "value num" line per result row, framed by [START]/[END]
    marker lines.

    @param db    database selected with USE before the query
    @param table table to aggregate
    @param field column whose value distribution is reported
    @return None; the report is written to stdout
    """
    hive_cmd = "USE %s ; SELECT %s, count(1) FROM %s GROUP BY %s" % (db, field, table, field)
    ans = run_hive_cmd(hive_cmd)
    print("[START] output group of field %s (database is %s, table is %s)" % (field, db, table))
    print("%s %s" % ('value', 'num'))
    for row in ans:
        # The count is the last column, so split once from the right on any
        # whitespace: this accepts tab or space separators and keeps values
        # that themselves contain spaces in one piece.
        parts = row.rsplit(None, 1)
        if not parts:
            # Blank line: nothing to report.
            continue
        if len(parts) == 1:
            # Only one token: presumably the grouped value is an empty
            # string and the token is its count -- verify against real output.
            value, num = "", parts[0]
        else:
            value, num = parts
        print("%s %s" % (value, num))
    print("[END] output group of field %s (database is %s, table is %s)" % (field, db, table))
# Describe the table to learn its columns, then report the value
# distribution of every column in turn.
describe_output = run_hive_cmd("USE %s ; DESC %s" % (db, table))
fields = get_fields(describe_output)
for field_name in fields:
    get_group(db, table, field_name)