python 处理 Impala 日志

2 篇文章 0 订阅
1 篇文章 0 订阅

使用到的Url为:
/clusters/{clusterName}/services/{serviceName}/impalaQueries

需要导入的函数库:

#encoding:utf-8
import urllib.request
import base64
import json
import time
import datetime
import pytz
import pymysql

用到的全局变量:

user='admin'
password='admin'
# url = 'http://10.100.100.11:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=2018-06-25T08%3A00%3A00' \
#       '.000Z&to=2018-06-25T09%3A00%3A00.000Z&filter= '
LOCAL_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
LOCAL_FORMAT_ = "%Y-%m-%d %H:%M:%S"
#mysql连接参数
sql_host = '127.0.0.1'
sql_port = 3306
sql_user = 'root'
sql_pass = 'root'
sql_db = 'test'
db_connect = None

#sql语句
sql_insert = 'insert into impala_query_record values (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")'
sql_select = 'select * from impala_query_record where queryId = \"%s\"'

对其进行初始化

'''
    动态生成url
'''
def getCurrentUrl():
    now_ = local_to_utc(time.time()).replace(':','%3A')
    before_ = local_to_utc((datetime.datetime.now()-datetime.timedelta(hours=1)).timestamp()).replace(':','%3A')
    return 'http://host:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=' + before_ +'&to='+now_+'&filter= '

local_to_utc:将时间戳转换为utc格式(url中的请求时间必须为utc格式)

'''
    将时间戳转换为utc格式的时间
     @:param
        local_ts:时间戳
        utc_format:utc时间格式模板
'''
def local_to_utc(local_ts, utc_format='%Y-%m-%dT%H:%M:%SZ'):
    local_tz = pytz.timezone('Asia/Shanghai')
    local_format = "%Y-%m-%d %H:%M:%S"
    time.localtime(local_ts)
    time_str = time.strftime(local_format, time.localtime(local_ts))
    dt = datetime.datetime.strptime(time_str, local_format)
    local_dt = local_tz.localize(dt, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    return utc_dt.strftime(utc_format)

得到url后可以发起http请求获取导出的数据:

'''
    添加请求header并通过url导出json数据
'''
def exporyData(url):
    //根据username 和 password 生成 base64string
    base64string = base64.encodestring(('%s:%s' % (user, password)).encode()).decode().replace('\n', '')
    request = urllib.request.Request(url)
    //将base64string添加到http请求头中
    request.add_header("Authorization","Basic %s" % base64string)
    result = urllib.request.urlopen(request)
    html = result.read()
    result.close()
    data=json.loads(html.decode('utf-8'))
    return data

这里需要着重理解的是 base64 ,因为该 API 使用了 http 基本认证,需要在 Authorization header 放入基于 base64 加密的用户名和密码,中间用“:”连接,服务器将 Authorization header 中的用户名密码取出,进行验证,如果验证通过,将根据请求,发送资源给客户端。

base64 模块真正用的上的方法只有8个,分别是 encode, decode, encodestring, decodestring, b64encode,b64decode, urlsafe_b64decode,urlsafe_b64encode。他们8个可以两两分为4组,encode, decode 一组,专门用来编码和 解码文件的,也可以对 StringIO 里的数据做编解码;encodestring,decodestring 一组,专门用来编码和解码字符串; b64encode和b64decode 一组,用来编码和解码字符串,并且有一个替换符号字符的功能。这个功能是这样的:因为 base64 编码后的字符除 了英文字母和数字外还有三个字符 + / =, 其中 = 只是为了补全编码后的字符数为 4 的整数,而 + 和 / 在一些情况下需要被替换的, b64encode 和 b64decode 正是提供了这样的功能。至于什么情况下 + 和 / 需要被替换,最常见的就是对 url 进行 base64 编码的时候。urlsafe_b64encode 和 urlsafe_b64decode 一组,这个就是用来专门对 url 进行 base64 编解码的,实际上也是调用的前一组函数。

通过上面函数可以导出一个map函数,接下来对map函数进行解析

因为导出数据中的时间参数为utc格式,需要将其转换为本地时间

'''
    将utc格式的时间转换为毫秒级时间戳
    @:param
        utc_time_str:utc格式时间
        utc_format:utc时间格式模板
'''
def utc2local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
    if utc_time_str == None:
        return None
    local_tz = pytz.timezone('Asia/Shanghai')
    utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    time_str = local_dt.strftime(LOCAL_FORMAT)
    return int(datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()*1000)

将时间戳转换为本地时间

#因为是毫秒级,故需要/1000变为秒级
time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))

执行Mysql插入逻辑,将数据持久化到磁盘

'''
    向MySQL中插入数据
'''
def insert(queryId,statement,queryType,queryState,startTime,endTime,database,session_id,session_type,memory_per_node_peak_node,coordinator_hostId,oom,client_fetch_wait_time,memory_aggregate_peak,planning_wait_time,hdfs_bytes_read,query_duration):
     global db_connect
     #判断queryid是否存在
     select = sql_select % queryId
     insert = sql_insert % (
     queryId, statement.replace("\"","\'"), queryType, queryState, startTime, endTime, database, session_id, session_type,
     memory_per_node_peak_node, coordinator_hostId, oom, client_fetch_wait_time, memory_aggregate_peak,
     planning_wait_time, hdfs_bytes_read,query_duration)
     if db_connect == None:
         openMysqlConnect()
     cursor = db_connect.cursor()
     print(insert)
     execute = cursor.execute(select)
     if execute == 0:
        cursor.execute(insert)
     db_connect.commit()

此时数据已成功持久化,接下来可以通过mysql对数据进行分析操作。

main函数:

'''
    主函数
'''
if __name__ == '__main__':
    url = getCurrentUrl()
    data = exporyData(url)
    #查询正确信息
    queries = data['queries']
    #遍历信息,将数据格式化
    for list in  queries:
        startTime = list.get('startTime')
        endTime = list.get('endTime')
        #获取时间戳
        startTimestamp = utc2local(startTime)
        endTimestamp = utc2local(endTime)
        durationTime = getDurationTime(startTimestamp,endTimestamp)
        time.localtime(startTimestamp/1000)
        startTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
        endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTimestamp/1000))
        attributes_ = list['attributes']
        #插入语句
        insert(list.get('queryId'), list.get('statement'), list.get('queryType'), list.get('queryState'),
                   startTime, endTime, list.get('database'), attributes_.get('session_id'),
                   attributes_.get('session_type'), attributes_.get('memory_per_node_peak_node'),
                   list.get('coordinator').get('hostId'), attributes_.get('oom'), attributes_.get('client_fetch_wait_time'),
                   attributes_.get('memory_aggregate_peak'), attributes_.get('planning_wait_time'), attributes_.get('hdfs_bytes_read')
               ,durationTime)

全部的逻辑如下:

#encoding:utf-8
import urllib.request
import base64
import json
import time
import datetime
import pytz
import pymysql


user='admin'
password='admin'
# url = 'http://10.100.100.11:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=2018-06-25T08%3A00%3A00' \
#       '.000Z&to=2018-06-25T09%3A00%3A00.000Z&filter= '
LOCAL_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
LOCAL_FORMAT_ = "%Y-%m-%d %H:%M:%S"
#mysql连接参数
sql_host = '127.0.0.1'
sql_port = 3306
sql_user = 'root'
sql_pass = 'root'
sql_db = 'test'
db_connect = None

#sql语句
sql_insert = 'insert into impala_query_record values (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")'
sql_select = 'select * from impala_query_record where queryId = \"%s\"'


'''
    将utc格式的时间转换为时间戳
    @:param
        utc_time_str:utc格式时间
        utc_format:utc时间格式模板
'''
def utc2local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
    if utc_time_str == None:
        return None
    local_tz = pytz.timezone('Asia/Shanghai')
    utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    time_str = local_dt.strftime(LOCAL_FORMAT)
    return int(datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()*1000)


'''
    将时间戳转换为utc格式的时间
     @:param
        local_ts:时间戳
        utc_format:utc时间格式模板
'''
def local_to_utc(local_ts, utc_format='%Y-%m-%dT%H:%M:%SZ'):
    local_tz = pytz.timezone('Asia/Shanghai')
    local_format = "%Y-%m-%d %H:%M:%S"
    time.localtime(local_ts)
    time_str = time.strftime(local_format, time.localtime(local_ts))
    dt = datetime.datetime.strptime(time_str, local_format)
    local_dt = local_tz.localize(dt, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    return utc_dt.strftime(utc_format)


'''
    动态生成url
'''
def getCurrentUrl():
    now_ = local_to_utc(time.time()).replace(':','%3A')
    before_ = local_to_utc((datetime.datetime.now()-datetime.timedelta(hours=1)).timestamp()).replace(':','%3A')
    return 'http://host:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=' + before_ +'&to='+now_+'&filter= '


'''
    添加请求header并通过url导出json数据
'''
def exporyData(url):
    base64string = base64.encodestring(('%s:%s' % (user, password)).encode()).decode().replace('\n', '')
    request = urllib.request.Request(url)
    request.add_header("Authorization","Basic %s" % base64string)
    result = urllib.request.urlopen(request)
    html = result.read()
    result.close()
    data=json.loads(html.decode('utf-8'))
    return data



'''
    获取时间戳的差值
'''
def getDurationTime(start, end):
    if(start == None or end == None):
        return None
    return end - start

'''
    连接mysql
'''
def openMysqlConnect():
    global db_connect
    db_connect = pymysql.connect(host=sql_host, port=sql_port, user=sql_user, passwd=sql_pass, db=sql_db, charset='utf8')


'''
    向MySQL中插入数据
'''
def insert(queryId,statement,queryType,queryState,startTime,endTime,database,session_id,session_type,memory_per_node_peak_node,coordinator_hostId,oom,client_fetch_wait_time,memory_aggregate_peak,planning_wait_time,hdfs_bytes_read,query_duration):
     global db_connect
     #判断queryid是否存在
     select = sql_select % queryId
     insert = sql_insert % (
     queryId, statement.replace("\"","\'"), queryType, queryState, startTime, endTime, database, session_id, session_type,
     memory_per_node_peak_node, coordinator_hostId, oom, client_fetch_wait_time, memory_aggregate_peak,
     planning_wait_time, hdfs_bytes_read,query_duration)
     if db_connect == None:
         openMysqlConnect()
     cursor = db_connect.cursor()
     print(insert)
     execute = cursor.execute(select)
     if execute == 0:
        cursor.execute(insert)
     db_connect.commit()

'''
    主函数
'''
if __name__ == '__main__':
    url = getCurrentUrl()
    data = exporyData(url)
    #查询正确信息
    queries = data['queries']
    #遍历信息,将数据格式化
    for list in  queries:
        startTime = list.get('startTime')
        endTime = list.get('endTime')
        #获取时间戳
        startTimestamp = utc2local(startTime)
        endTimestamp = utc2local(endTime)
        durationTime = getDurationTime(startTimestamp,endTimestamp)
        time.localtime(startTimestamp/1000)
        startTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
        endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTimestamp/1000))
        attributes_ = list['attributes']
        #插入语句
        insert(list.get('queryId'), list.get('statement'), list.get('queryType'), list.get('queryState'),
                   startTime, endTime, list.get('database'), attributes_.get('session_id'),
                   attributes_.get('session_type'), attributes_.get('memory_per_node_peak_node'),
                   list.get('coordinator').get('hostId'), attributes_.get('oom'), attributes_.get('client_fetch_wait_time'),
                   attributes_.get('memory_aggregate_peak'), attributes_.get('planning_wait_time'), attributes_.get('hdfs_bytes_read')
               ,durationTime)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值