使用到的Url为:
/clusters/{clusterName}/services/{serviceName}/impalaQueries
需要导入的函数库:
#encoding:utf-8
import urllib.request
import base64
import json
import time
import datetime
import pytz
import pymysql
用到的全局变量:
user='admin'
password='admin'
# url = 'http://10.100.100.11:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=2018-06-25T08%3A00%3A00' \
# '.000Z&to=2018-06-25T09%3A00%3A00.000Z&filter= '
LOCAL_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
LOCAL_FORMAT_ = "%Y-%m-%d %H:%M:%S"
#mysql连接参数
sql_host = '127.0.0.1'
sql_port = 3306
sql_user = 'root'
sql_pass = 'root'
sql_db = 'test'
db_connect = None
#sql语句
sql_insert = 'insert into impala_query_record values (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")'
sql_select = 'select * from impala_query_record where queryId = \"%s\"'
对其进行初始化
'''
动态生成url
'''
def getCurrentUrl():
now_ = local_to_utc(time.time()).replace(':','%3A')
before_ = local_to_utc((datetime.datetime.now()-datetime.timedelta(hours=1)).timestamp()).replace(':','%3A')
return 'http://host:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=' + before_ +'&to='+now_+'&filter= '
local_to_utc:将时间戳转换为utc格式(url中的请求时间必须为utc格式)
'''
将时间戳转换为utc格式的时间
@:param
local_ts:时间戳
utc_format:utc时间格式模板
'''
def local_to_utc(local_ts, utc_format='%Y-%m-%dT%H:%M:%SZ'):
local_tz = pytz.timezone('Asia/Shanghai')
local_format = "%Y-%m-%d %H:%M:%S"
time.localtime(local_ts)
time_str = time.strftime(local_format, time.localtime(local_ts))
dt = datetime.datetime.strptime(time_str, local_format)
local_dt = local_tz.localize(dt, is_dst=None)
utc_dt = local_dt.astimezone(pytz.utc)
return utc_dt.strftime(utc_format)
得到url后可以发起http请求获取导出的数据:
'''
添加请求header并通过url导出json数据
'''
def exporyData(url):
//根据username 和 password 生成 base64string
base64string = base64.encodestring(('%s:%s' % (user, password)).encode()).decode().replace('\n', '')
request = urllib.request.Request(url)
//将base64string添加到http请求头中
request.add_header("Authorization","Basic %s" % base64string)
result = urllib.request.urlopen(request)
html = result.read()
result.close()
data=json.loads(html.decode('utf-8'))
return data
这里需要着重理解的是 base64 ,因为该 API 使用了 http 基本认证,需要在 Authorization header 放入基于 base64 加密的用户名和密码,中间用“:”连接,服务器将 Authorization header 中的用户名密码取出,进行验证,如果验证通过,将根据请求,发送资源给客户端。
base64 模块真正用的上的方法只有8个,分别是 encode, decode, encodestring, decodestring, b64encode,b64decode, urlsafe_b64decode,urlsafe_b64encode。他们8个可以两两分为4组,encode, decode 一组,专门用来编码和 解码文件的,也可以对 StringIO 里的数据做编解码;encodestring,decodestring 一组,专门用来编码和解码字符串; b64encode和b64decode 一组,用来编码和解码字符串,并且有一个替换符号字符的功能。这个功能是这样的:因为 base64 编码后的字符除 了英文字母和数字外还有三个字符 + / =, 其中 = 只是为了补全编码后的字符数为 4 的整数,而 + 和 / 在一些情况下需要被替换的, b64encode 和 b64decode 正是提供了这样的功能。至于什么情况下 + 和 / 需要被替换,最常见的就是对 url 进行 base64 编码的时候。urlsafe_b64encode 和 urlsafe_b64decode 一组,这个就是用来专门对 url 进行 base64 编解码的,实际上也是调用的前一组函数。
通过上面函数可以导出一个map函数,接下来对map函数进行解析
因为导出数据中的时间参数为utc格式,需要将其转换为本地时间
'''
将utc格式的时间转换为毫秒级时间戳
@:param
utc_time_str:utc格式时间
utc_format:utc时间格式模板
'''
def utc2local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
if utc_time_str == None:
return None
local_tz = pytz.timezone('Asia/Shanghai')
utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)
local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
time_str = local_dt.strftime(LOCAL_FORMAT)
return int(datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()*1000)
将时间戳转换为本地时间
#因为是毫秒级,故需要/1000变为秒级
time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
执行Mysql插入逻辑,将数据持久化到磁盘
'''
向MySQL中插入数据
'''
def insert(queryId,statement,queryType,queryState,startTime,endTime,database,session_id,session_type,memory_per_node_peak_node,coordinator_hostId,oom,client_fetch_wait_time,memory_aggregate_peak,planning_wait_time,hdfs_bytes_read,query_duration):
global db_connect
#判断queryid是否存在
select = sql_select % queryId
insert = sql_insert % (
queryId, statement.replace("\"","\'"), queryType, queryState, startTime, endTime, database, session_id, session_type,
memory_per_node_peak_node, coordinator_hostId, oom, client_fetch_wait_time, memory_aggregate_peak,
planning_wait_time, hdfs_bytes_read,query_duration)
if db_connect == None:
openMysqlConnect()
cursor = db_connect.cursor()
print(insert)
execute = cursor.execute(select)
if execute == 0:
cursor.execute(insert)
db_connect.commit()
此时数据已成功持久化,接下来可以通过mysql对数据进行分析操作。
main函数:
'''
主函数
'''
if __name__ == '__main__':
url = getCurrentUrl()
data = exporyData(url)
#查询正确信息
queries = data['queries']
#遍历信息,将数据格式化
for list in queries:
startTime = list.get('startTime')
endTime = list.get('endTime')
#获取时间戳
startTimestamp = utc2local(startTime)
endTimestamp = utc2local(endTime)
durationTime = getDurationTime(startTimestamp,endTimestamp)
time.localtime(startTimestamp/1000)
startTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTimestamp/1000))
attributes_ = list['attributes']
#插入语句
insert(list.get('queryId'), list.get('statement'), list.get('queryType'), list.get('queryState'),
startTime, endTime, list.get('database'), attributes_.get('session_id'),
attributes_.get('session_type'), attributes_.get('memory_per_node_peak_node'),
list.get('coordinator').get('hostId'), attributes_.get('oom'), attributes_.get('client_fetch_wait_time'),
attributes_.get('memory_aggregate_peak'), attributes_.get('planning_wait_time'), attributes_.get('hdfs_bytes_read')
,durationTime)
全部的逻辑如下:
#encoding:utf-8
import urllib.request
import base64
import json
import time
import datetime
import pytz
import pymysql
user='admin'
password='admin'
# url = 'http://10.100.100.11:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=2018-06-25T08%3A00%3A00' \
# '.000Z&to=2018-06-25T09%3A00%3A00.000Z&filter= '
LOCAL_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
LOCAL_FORMAT_ = "%Y-%m-%d %H:%M:%S"
#mysql连接参数
sql_host = '127.0.0.1'
sql_port = 3306
sql_user = 'root'
sql_pass = 'root'
sql_db = 'test'
db_connect = None
#sql语句
sql_insert = 'insert into impala_query_record values (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")'
sql_select = 'select * from impala_query_record where queryId = \"%s\"'
'''
将utc格式的时间转换为时间戳
@:param
utc_time_str:utc格式时间
utc_format:utc时间格式模板
'''
def utc2local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
if utc_time_str == None:
return None
local_tz = pytz.timezone('Asia/Shanghai')
utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)
local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
time_str = local_dt.strftime(LOCAL_FORMAT)
return int(datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()*1000)
'''
将时间戳转换为utc格式的时间
@:param
local_ts:时间戳
utc_format:utc时间格式模板
'''
def local_to_utc(local_ts, utc_format='%Y-%m-%dT%H:%M:%SZ'):
local_tz = pytz.timezone('Asia/Shanghai')
local_format = "%Y-%m-%d %H:%M:%S"
time.localtime(local_ts)
time_str = time.strftime(local_format, time.localtime(local_ts))
dt = datetime.datetime.strptime(time_str, local_format)
local_dt = local_tz.localize(dt, is_dst=None)
utc_dt = local_dt.astimezone(pytz.utc)
return utc_dt.strftime(utc_format)
'''
动态生成url
'''
def getCurrentUrl():
now_ = local_to_utc(time.time()).replace(':','%3A')
before_ = local_to_utc((datetime.datetime.now()-datetime.timedelta(hours=1)).timestamp()).replace(':','%3A')
return 'http://host:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=' + before_ +'&to='+now_+'&filter= '
'''
添加请求header并通过url导出json数据
'''
def exporyData(url):
base64string = base64.encodestring(('%s:%s' % (user, password)).encode()).decode().replace('\n', '')
request = urllib.request.Request(url)
request.add_header("Authorization","Basic %s" % base64string)
result = urllib.request.urlopen(request)
html = result.read()
result.close()
data=json.loads(html.decode('utf-8'))
return data
'''
获取时间戳的差值
'''
def getDurationTime(start, end):
if(start == None or end == None):
return None
return end - start
'''
连接mysql
'''
def openMysqlConnect():
global db_connect
db_connect = pymysql.connect(host=sql_host, port=sql_port, user=sql_user, passwd=sql_pass, db=sql_db, charset='utf8')
'''
向MySQL中插入数据
'''
def insert(queryId,statement,queryType,queryState,startTime,endTime,database,session_id,session_type,memory_per_node_peak_node,coordinator_hostId,oom,client_fetch_wait_time,memory_aggregate_peak,planning_wait_time,hdfs_bytes_read,query_duration):
global db_connect
#判断queryid是否存在
select = sql_select % queryId
insert = sql_insert % (
queryId, statement.replace("\"","\'"), queryType, queryState, startTime, endTime, database, session_id, session_type,
memory_per_node_peak_node, coordinator_hostId, oom, client_fetch_wait_time, memory_aggregate_peak,
planning_wait_time, hdfs_bytes_read,query_duration)
if db_connect == None:
openMysqlConnect()
cursor = db_connect.cursor()
print(insert)
execute = cursor.execute(select)
if execute == 0:
cursor.execute(insert)
db_connect.commit()
'''
主函数
'''
if __name__ == '__main__':
url = getCurrentUrl()
data = exporyData(url)
#查询正确信息
queries = data['queries']
#遍历信息,将数据格式化
for list in queries:
startTime = list.get('startTime')
endTime = list.get('endTime')
#获取时间戳
startTimestamp = utc2local(startTime)
endTimestamp = utc2local(endTime)
durationTime = getDurationTime(startTimestamp,endTimestamp)
time.localtime(startTimestamp/1000)
startTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTimestamp/1000))
attributes_ = list['attributes']
#插入语句
insert(list.get('queryId'), list.get('statement'), list.get('queryType'), list.get('queryState'),
startTime, endTime, list.get('database'), attributes_.get('session_id'),
attributes_.get('session_type'), attributes_.get('memory_per_node_peak_node'),
list.get('coordinator').get('hostId'), attributes_.get('oom'), attributes_.get('client_fetch_wait_time'),
attributes_.get('memory_aggregate_peak'), attributes_.get('planning_wait_time'), attributes_.get('hdfs_bytes_read')
,durationTime)