python 处理 Impala 日志

最新推荐文章于 2021-11-21 10:38:27 发布

qq_34124060

最新推荐文章于 2021-11-21 10:38:27 发布

阅读量900

点赞数

分类专栏： python clouder manager Impala Mysql

本文链接：https://blog.csdn.net/qq_34124060/article/details/80818757

版权

python 同时被 3 个专栏收录

2 篇文章 0 订阅

订阅专栏

Mysql

2 篇文章 0 订阅

订阅专栏

clouder manager

1 篇文章 0 订阅

订阅专栏

使用到的Url为：
/clusters/{clusterName}/services/{serviceName}/impalaQueries

需要导入的函数库：

#encoding:utf-8
import urllib.request
import base64
import json
import time
import datetime
import pytz
import pymysql

用到的全局变量：

user='admin'
password='admin'
# url = 'http://10.100.100.11:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=2018-06-25T08%3A00%3A00' \
#       '.000Z&to=2018-06-25T09%3A00%3A00.000Z&filter= '
LOCAL_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
LOCAL_FORMAT_ = "%Y-%m-%d %H:%M:%S"
#mysql连接参数
sql_host = '127.0.0.1'
sql_port = 3306
sql_user = 'root'
sql_pass = 'root'
sql_db = 'test'
db_connect = None

#sql语句
sql_insert = 'insert into impala_query_record values (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")'
sql_select = 'select * from impala_query_record where queryId = \"%s\"'

对其进行初始化

'''
    动态生成url
'''
def getCurrentUrl():
    now_ = local_to_utc(time.time()).replace(':','%3A')
    before_ = local_to_utc((datetime.datetime.now()-datetime.timedelta(hours=1)).timestamp()).replace(':','%3A')
    return 'http://host:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=' + before_ +'&to='+now_+'&filter= '

local_to_utc:将时间戳转换为utc格式（url中的请求时间必须为utc格式）

'''
    将时间戳转换为utc格式的时间
     @:param
        local_ts:时间戳
        utc_format:utc时间格式模板
'''
def local_to_utc(local_ts, utc_format='%Y-%m-%dT%H:%M:%SZ'):
    local_tz = pytz.timezone('Asia/Shanghai')
    local_format = "%Y-%m-%d %H:%M:%S"
    time.localtime(local_ts)
    time_str = time.strftime(local_format, time.localtime(local_ts))
    dt = datetime.datetime.strptime(time_str, local_format)
    local_dt = local_tz.localize(dt, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    return utc_dt.strftime(utc_format)

得到url后可以发起http请求获取导出的数据:

'''
    添加请求header并通过url导出json数据
'''
def exporyData(url):
    //根据username 和 password 生成 base64string
    base64string = base64.encodestring(('%s:%s' % (user, password)).encode()).decode().replace('\n', '')
    request = urllib.request.Request(url)
    //将base64string添加到http请求头中
    request.add_header("Authorization","Basic %s" % base64string)
    result = urllib.request.urlopen(request)
    html = result.read()
    result.close()
    data=json.loads(html.decode('utf-8'))
    return data

这里需要着重理解的是 base64 ，因为该 API 使用了 http 基本认证，需要在 Authorization header 放入基于 base64 加密的用户名和密码，中间用“：”连接，服务器将 Authorization header 中的用户名密码取出，进行验证，如果验证通过，将根据请求，发送资源给客户端。

base64 模块真正用的上的方法只有8个，分别是 encode, decode, encodestring, decodestring, b64encode,b64decode, urlsafe_b64decode,urlsafe_b64encode。他们8个可以两两分为4组，encode, decode 一组，专门用来编码和解码文件的,也可以对 StringIO 里的数据做编解码；encodestring,decodestring 一组，专门用来编码和解码字符串； b64encode和b64decode 一组，用来编码和解码字符串，并且有一个替换符号字符的功能。这个功能是这样的：因为 base64 编码后的字符除了英文字母和数字外还有三个字符 + / =, 其中 = 只是为了补全编码后的字符数为 4 的整数，而 + 和 / 在一些情况下需要被替换的， b64encode 和 b64decode 正是提供了这样的功能。至于什么情况下 + 和 / 需要被替换，最常见的就是对 url 进行 base64 编码的时候。urlsafe_b64encode 和 urlsafe_b64decode 一组，这个就是用来专门对 url 进行 base64 编解码的，实际上也是调用的前一组函数。

通过上面函数可以导出一个map函数，接下来对map函数进行解析

因为导出数据中的时间参数为utc格式，需要将其转换为本地时间

'''
    将utc格式的时间转换为毫秒级时间戳
    @:param
        utc_time_str:utc格式时间
        utc_format:utc时间格式模板
'''
def utc2local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
    if utc_time_str == None:
        return None
    local_tz = pytz.timezone('Asia/Shanghai')
    utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    time_str = local_dt.strftime(LOCAL_FORMAT)
    return int(datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()*1000)

将时间戳转换为本地时间

#因为是毫秒级，故需要/1000变为秒级
time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))

执行Mysql插入逻辑，将数据持久化到磁盘

'''
    向MySQL中插入数据
'''
def insert(queryId,statement,queryType,queryState,startTime,endTime,database,session_id,session_type,memory_per_node_peak_node,coordinator_hostId,oom,client_fetch_wait_time,memory_aggregate_peak,planning_wait_time,hdfs_bytes_read,query_duration):
     global db_connect
     #判断queryid是否存在
     select = sql_select % queryId
     insert = sql_insert % (
     queryId, statement.replace("\"","\'"), queryType, queryState, startTime, endTime, database, session_id, session_type,
     memory_per_node_peak_node, coordinator_hostId, oom, client_fetch_wait_time, memory_aggregate_peak,
     planning_wait_time, hdfs_bytes_read,query_duration)
     if db_connect == None:
         openMysqlConnect()
     cursor = db_connect.cursor()
     print(insert)
     execute = cursor.execute(select)
     if execute == 0:
        cursor.execute(insert)
     db_connect.commit()

此时数据已成功持久化，接下来可以通过mysql对数据进行分析操作。

main函数：

'''
    主函数
'''
if __name__ == '__main__':
    url = getCurrentUrl()
    data = exporyData(url)
    #查询正确信息
    queries = data['queries']
    #遍历信息，将数据格式化
    for list in  queries:
        startTime = list.get('startTime')
        endTime = list.get('endTime')
        #获取时间戳
        startTimestamp = utc2local(startTime)
        endTimestamp = utc2local(endTime)
        durationTime = getDurationTime(startTimestamp,endTimestamp)
        time.localtime(startTimestamp/1000)
        startTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
        endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTimestamp/1000))
        attributes_ = list['attributes']
        #插入语句
        insert(list.get('queryId'), list.get('statement'), list.get('queryType'), list.get('queryState'),
                   startTime, endTime, list.get('database'), attributes_.get('session_id'),
                   attributes_.get('session_type'), attributes_.get('memory_per_node_peak_node'),
                   list.get('coordinator').get('hostId'), attributes_.get('oom'), attributes_.get('client_fetch_wait_time'),
                   attributes_.get('memory_aggregate_peak'), attributes_.get('planning_wait_time'), attributes_.get('hdfs_bytes_read')
               ,durationTime)

全部的逻辑如下：

#encoding:utf-8
import urllib.request
import base64
import json
import time
import datetime
import pytz
import pymysql


user='admin'
password='admin'
# url = 'http://10.100.100.11:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=2018-06-25T08%3A00%3A00' \
#       '.000Z&to=2018-06-25T09%3A00%3A00.000Z&filter= '
LOCAL_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
LOCAL_FORMAT_ = "%Y-%m-%d %H:%M:%S"
#mysql连接参数
sql_host = '127.0.0.1'
sql_port = 3306
sql_user = 'root'
sql_pass = 'root'
sql_db = 'test'
db_connect = None

#sql语句
sql_insert = 'insert into impala_query_record values (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")'
sql_select = 'select * from impala_query_record where queryId = \"%s\"'


'''
    将utc格式的时间转换为时间戳
    @:param
        utc_time_str:utc格式时间
        utc_format:utc时间格式模板
'''
def utc2local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
    if utc_time_str == None:
        return None
    local_tz = pytz.timezone('Asia/Shanghai')
    utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    time_str = local_dt.strftime(LOCAL_FORMAT)
    return int(datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S.%f').timestamp()*1000)


'''
    将时间戳转换为utc格式的时间
     @:param
        local_ts:时间戳
        utc_format:utc时间格式模板
'''
def local_to_utc(local_ts, utc_format='%Y-%m-%dT%H:%M:%SZ'):
    local_tz = pytz.timezone('Asia/Shanghai')
    local_format = "%Y-%m-%d %H:%M:%S"
    time.localtime(local_ts)
    time_str = time.strftime(local_format, time.localtime(local_ts))
    dt = datetime.datetime.strptime(time_str, local_format)
    local_dt = local_tz.localize(dt, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    return utc_dt.strftime(utc_format)


'''
    动态生成url
'''
def getCurrentUrl():
    now_ = local_to_utc(time.time()).replace(':','%3A')
    before_ = local_to_utc((datetime.datetime.now()-datetime.timedelta(hours=1)).timestamp()).replace(':','%3A')
    return 'http://host:7180/api/v7/clusters/cluster2/services/impala/impalaQueries?from=' + before_ +'&to='+now_+'&filter= '


'''
    添加请求header并通过url导出json数据
'''
def exporyData(url):
    base64string = base64.encodestring(('%s:%s' % (user, password)).encode()).decode().replace('\n', '')
    request = urllib.request.Request(url)
    request.add_header("Authorization","Basic %s" % base64string)
    result = urllib.request.urlopen(request)
    html = result.read()
    result.close()
    data=json.loads(html.decode('utf-8'))
    return data



'''
    获取时间戳的差值
'''
def getDurationTime(start, end):
    if(start == None or end == None):
        return None
    return end - start

'''
    连接mysql
'''
def openMysqlConnect():
    global db_connect
    db_connect = pymysql.connect(host=sql_host, port=sql_port, user=sql_user, passwd=sql_pass, db=sql_db, charset='utf8')


'''
    向MySQL中插入数据
'''
def insert(queryId,statement,queryType,queryState,startTime,endTime,database,session_id,session_type,memory_per_node_peak_node,coordinator_hostId,oom,client_fetch_wait_time,memory_aggregate_peak,planning_wait_time,hdfs_bytes_read,query_duration):
     global db_connect
     #判断queryid是否存在
     select = sql_select % queryId
     insert = sql_insert % (
     queryId, statement.replace("\"","\'"), queryType, queryState, startTime, endTime, database, session_id, session_type,
     memory_per_node_peak_node, coordinator_hostId, oom, client_fetch_wait_time, memory_aggregate_peak,
     planning_wait_time, hdfs_bytes_read,query_duration)
     if db_connect == None:
         openMysqlConnect()
     cursor = db_connect.cursor()
     print(insert)
     execute = cursor.execute(select)
     if execute == 0:
        cursor.execute(insert)
     db_connect.commit()

'''
    主函数
'''
if __name__ == '__main__':
    url = getCurrentUrl()
    data = exporyData(url)
    #查询正确信息
    queries = data['queries']
    #遍历信息，将数据格式化
    for list in  queries:
        startTime = list.get('startTime')
        endTime = list.get('endTime')
        #获取时间戳
        startTimestamp = utc2local(startTime)
        endTimestamp = utc2local(endTime)
        durationTime = getDurationTime(startTimestamp,endTimestamp)
        time.localtime(startTimestamp/1000)
        startTime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(startTimestamp/1000))
        endTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(startTimestamp/1000))
        attributes_ = list['attributes']
        #插入语句
        insert(list.get('queryId'), list.get('statement'), list.get('queryType'), list.get('queryState'),
                   startTime, endTime, list.get('database'), attributes_.get('session_id'),
                   attributes_.get('session_type'), attributes_.get('memory_per_node_peak_node'),
                   list.get('coordinator').get('hostId'), attributes_.get('oom'), attributes_.get('client_fetch_wait_time'),
                   attributes_.get('memory_aggregate_peak'), attributes_.get('planning_wait_time'), attributes_.get('hdfs_bytes_read')
               ,durationTime)

qq_34124060

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python 处理 Impala 日志

使用到的Url为： /clusters/{clusterName}/services/{serviceName}/impalaQueries需要导入的函数库：#encoding:utf-8import urllib.requestimport base64import jsonimport timeimport datetimeimport pytzimport py...
复制链接

扫一扫