import csv
import json
import os
import sys
from datetime import datetime, timedelta
from urllib import request as req

import requests
import boto3

import airflow
from airflow.models import DAG
from airflow.operators.dummy_operator import DummyOperator
from airflow.operators.python_operator import PythonOperator
from airflow.operators.hive_operator import HiveOperator
from airflow.utils import dates
# ------------------------ Script location -------------------
# /home/super/job/presto_query_data
# EMR clusters to poll: (emr_id, emr_name, presto_coordinator_host).
emrs = [
    ('c-f3e01a03022153cd', 'bigdata-adhoc-presto', '10.38.85.141'),
]

# Default task arguments shared by every task in the DAG.
args = {
    'owner': 'rui.zhao',
    'is_show': 0,
    'start_date': dates.days_ago(1),
}

# Hourly DAG: scrape the Presto /v1/query endpoint on each coordinator and
# ship the result to S3 as a Hive-partitioned CSV.
presto_query_data_dag = DAG(
    dag_id='presto_query_data',
    default_args=args,
    # NOTE(review): the original '0 /1 * * * * ' is not a valid 5-field cron
    # expression; '0 */1 * * *' (top of every hour) is the evident intent.
    schedule_interval='0 */1 * * *',
    dagrun_timeout=timedelta(minutes=60),
    catchup=False
)

dummy_start = DummyOperator(task_id='start', dag=presto_query_data_dag)
dummy_finish = DummyOperator(task_id='finish', dag=presto_query_data_dag)
def change(bytes):
    """Convert a Presto human-readable data-size string to bytes.

    Presto reports sizes like '1.5GB', '12kB', '300B'.  For kB/MB/GB/TB
    inputs this returns a float number of bytes; for a plain byte value
    ('300B') it returns a string formatted with three decimals, matching
    the CSV output format the rest of the job expects.

    NOTE(review): the parameter shadows the builtin ``bytes``; kept to
    preserve the public signature for keyword callers.
    """
    # Largest suffix first so 'TB'/'GB'/'MB' are never misread; the
    # multiplications below restore the '*' operators that the original
    # paste had lost (e.g. '102410241024' was 1024*1024*1024).
    if bytes.endswith('TB'):
        return float(bytes.split('TB')[0]) * 1024 * 1024 * 1024 * 1024
    elif bytes.endswith('GB'):
        return float(bytes.split('GB')[0]) * 1024 * 1024 * 1024
    elif bytes.endswith('MB'):
        return float(bytes.split('MB')[0]) * 1024 * 1024
    elif bytes.endswith('kB'):
        return float(bytes.split('kB')[0]) * 1024
    else:
        # Plain bytes, e.g. '300B' -> '300.000' (string, 3 decimals).
        return format(float(bytes.split('B')[0]), '.3f')
def change_time(ms):
    """Convert a Presto human-readable duration string to seconds.

    Presto emits durations like '1.5h', '3m', '400ms', '250us', '7ns',
    '2d', '5s'.  Hours/minutes/milliseconds/microseconds are converted
    to seconds and returned as a '.2f'-formatted string; seconds are
    returned as a float.

    NOTE(review): 'ns' and 'd' values are returned as the bare float
    (not converted to seconds) — preserved as-is from the original job;
    confirm downstream consumers before changing.
    """
    # Two-character suffixes are tested before their one-character
    # tails ('ms' before 'm', 'ns'/'us' before 's'), replacing the
    # original's fragile endswith(..., 1) start-index trick.
    if ms.endswith('ms'):
        return format(float(ms.split('ms')[0]) / 1000, '.2f')
    elif ms.endswith('us'):
        return format(float(ms.split('us')[0]) / 1000 / 1000, '.2f')
    elif ms.endswith('ns'):
        return float(ms.split('n')[0])
    elif ms.endswith('h'):
        return format(float(ms.split('h')[0]) * 3600, '.2f')
    elif ms.endswith('m'):
        return format(float(ms.split('m')[0]) * 60, '.2f')
    elif ms.endswith('d'):
        return float(ms.split('d')[0])
    else:
        return float(ms.split('s')[0])
def get_data(data):
    """Scrape /v1/query on every configured Presto coordinator and upload
    the rows as a \\u0001-delimited CSV to S3, then delete the local file.

    Parameters
    ----------
    data : str
        The Airflow ``execution_date`` rendered as an ISO-8601 string
        (templated via op_kwargs).  Parsed only as a validity check; the
        partition values below intentionally use wall-clock time, as in
        the original job.
    """
    execution_date = datetime.fromisoformat(data)
    now = datetime.now()
    hour = now.strftime('%Y%m%d%H')
    dt = now.strftime('%Y%m%d')
    path_file = '/tmp/presto-' + hour + '.csv'

    fieldnames = [
        'emr_id', 'emr_name', 'presto_host', 'queryId',
        'clientTransactionSupport', 'user', 'source', 'catalog', 'schema',
        'timeZoneKey', 'locale', 'remoteUserAddress', 'userAgent',
        'resourceGroupId', 'state', 'memoryPool', 'scheduled', 'self',
        'query', 'createTime', 'endTime', 'queuedTime', 'elapsedTime',
        'executionTime', 'totalDrivers', 'queuedDrivers', 'runningDrivers',
        'completedDrivers', 'rawInputDataSize', 'rawInputPositions',
        'cumulativeUserMemory', 'userMemoryReservation',
        'totalMemoryReservation', 'peakUserMemoryReservation',
        'totalCpuTime', 'totalScheduledTime', 'fullyBlocked',
        'progressPercentage',
    ]

    # 'with' guarantees the CSV handle is closed even if a coordinator
    # request fails mid-loop (the original leaked it on any exception).
    with open(path_file, mode='a', encoding='utf-8', newline='\n') as csvfile:
        writer = csv.DictWriter(csvfile, delimiter='\u0001',
                                fieldnames=fieldnames)
        for emr_id, emr_name, presto_host in emrs:
            url = f'http://{presto_host}:8889/v1/query'
            # Renamed from 'get_data' — the original shadowed this function.
            body = req.urlopen(url, timeout=600).read().decode('utf-8')
            resp = json.loads(body)
            for item in resp:
                session = item['session']
                stats = item['queryStats']
                # Optional fields: fall back to the same sentinels the
                # original used when a key is absent.
                schema = session.get('schema', 'NULL')
                progressPercentage = stats.get('progressPercentage', 0.0)
                if 'resourceGroupId' in item:
                    resourceGroupId = item['resourceGroupId'][0]
                else:
                    resourceGroupId = '无数据'
                endTime = stats.get('endTime', 'NULL')
                writer.writerow({
                    'emr_id': emr_id,
                    'emr_name': emr_name,
                    'presto_host': presto_host,
                    'queryId': session['queryId'],
                    'clientTransactionSupport': session['clientTransactionSupport'],
                    'user': session['user'],
                    'source': session['source'],
                    'catalog': session['catalog'],
                    'schema': schema,
                    'timeZoneKey': session['timeZoneKey'],
                    'locale': session['locale'],
                    'remoteUserAddress': session['remoteUserAddress'],
                    'userAgent': session['userAgent'],
                    'resourceGroupId': resourceGroupId,
                    'state': item['state'],
                    'memoryPool': item['memoryPool'],
                    'scheduled': item['scheduled'],
                    'self': item['self'],
                    # Newlines would break the one-row-per-record CSV.
                    'query': item['query'].replace('\n', ' '),
                    'createTime': stats['createTime'],
                    'endTime': endTime,
                    'queuedTime': stats['queuedTime'],
                    'elapsedTime': stats['elapsedTime'],
                    'executionTime': change_time(stats['executionTime']),
                    'totalDrivers': int(stats['totalDrivers']),
                    'queuedDrivers': int(stats['queuedDrivers']),
                    'runningDrivers': int(stats['runningDrivers']),
                    'completedDrivers': int(stats['completedDrivers']),
                    'rawInputDataSize': change(stats['rawInputDataSize']),
                    'rawInputPositions': int(stats['rawInputPositions']),
                    'cumulativeUserMemory': int(stats['cumulativeUserMemory']),
                    'userMemoryReservation': change(stats['userMemoryReservation']),
                    'totalMemoryReservation': change(stats['totalMemoryReservation']),
                    'peakUserMemoryReservation': change(stats['peakUserMemoryReservation']),
                    'totalCpuTime': stats['totalCpuTime'],
                    'totalScheduledTime': stats['totalScheduledTime'],
                    'fullyBlocked': stats['fullyBlocked'],
                    'progressPercentage': progressPercentage,
                })

    # Single chained expression (the original split the chain across bare
    # lines, which is a syntax error without parentheses).
    boto3.resource('s3').Object(
        'transsion-datacenter-ire',
        f'tranadm/presto_query_data/dt={dt}/{hour}.csv'
    ).upload_file(path_file)
    os.remove(path_file)
# Scrape task: pulls live query stats from every Presto coordinator and
# uploads them to S3 via get_data; receives the templated execution_date.
get_all_apps_task = PythonOperator(
    task_id='get_data_task',
    python_callable=get_data,
    dag=presto_query_data_dag,
    op_kwargs={'data': '{{ execution_date }}'}
)

# Register today's partition so the freshly uploaded file is queryable in Hive.
add_partition = HiveOperator(
    task_id="add_partition",
    hql="alter table presto_query_data add if not exists partition (dt='{{ds_nodash}}');",
    schema="tranadm",
    dag=presto_query_data_dag
)

dummy_start >> get_all_apps_task >> add_partition >> dummy_finish