Spark clusters are typically used to process big data, and the jobs usually have ordering dependencies between them, so an Airflow DAG sits in front to schedule them.
The DAG file
# DAG example
# -*- coding: utf-8 -*-
from airflow import utils
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from datetime import timedelta
from airflow.operators.sensors import ExternalTaskSensor
from airflow import configuration
import datetime, time
import os
import imp
DAG_NAME = 'scc_check_ads_realtime_media'  # DAG name; change it to start with your own initials
# BIZ_DATE = '{{ ds }}'  # Airflow macro for the scheduled execution date
default_args = {
    'owner': 'name',
    'depends_on_past': False,
    'start_date': datetime.datetime.strptime("2021-10-18 16:00:00", "%Y-%m-%d %H:%M:%S"),  # date the DAG starts running
    'retries': 3,
    'email': [],
    'email_on_failure': True,  # send an email notification when a run fails
    'email_on_retry': True,
    'retry_delay': timedelta(minutes=5),
}
# Run every day at 10:09 (cron format: minute hour day month weekday)
dag = DAG(DAG_NAME, catchup=False, schedule_interval='09 10 * * *', default_args=default_args)
# The bash script that performs the actual check; {{ ts }} is the Airflow execution timestamp
etlCommand = "bash ../qa_execute_realtime_media_check.sh {{ ts }}"
# Wait until the upstream ETL DAG has loaded the data into Doris
wait_load_doris = ExternalTaskSensor(
    task_id='wait_load_doris',
    external_dag_id='etl_metrics',
    external_task_id='T_app_i_m',
    execution_delta=timedelta(hours=-1),  # this DAG's execution time minus the upstream DAG's (confirm with the DE team)
    dag=dag)
check_ads_realtime_outer = BashOperator(
    task_id='check_ads_realtime_outer',
    execution_timeout=timedelta(minutes=20),
    dag=dag,
    bash_command=etlCommand)
wait_load_doris >> check_ads_realtime_outer
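Before handing the DAG over to the scheduler it can be exercised by hand. This is a minimal sketch assuming an Airflow 1.x installation (matching the bash_operator / sensors import paths above); the execution date below is only an example:

# Confirm Airflow parsed the DAG file and both tasks are registered
airflow list_tasks scc_check_ads_realtime_media
# Run a single task for one execution date, without the scheduler and without recording state
airflow test scc_check_ads_realtime_media check_ads_realtime_outer 2021-10-19T10:09:00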
The qa_execute_realtime_media_check.sh file
#!/usr/bin/env bash
source /etc/profile
export HADOOP_USER_NAME=xxx # group account; submitting under it gives the job the right permissions, and the logs land in that account's directory
set -o errexit
set -o nounset
# event_time is the execution timestamp ({{ ts }}) passed in by the DAG;
# fall back to yesterday's date when the script is run without an argument
if [ $# -eq 0 ]; then
    event_time=$(date +"%Y-%m-%d" -d "-1 days")
else
    event_time=$1
fi
# Split the ISO timestamp into date and hour
bizdate=$(echo "$event_time" | awk -F 'T|:' '{print $1}')
execute_hour=$(echo "$event_time" | awk -F 'T|:' '{print $2}')
echo "$execute_hour"
#export SPARK_HOME=/data/software/app/spark-2.2
spark-submit --master yarn \
--deploy-mode cluster \
--name qa_check_realtime_media_${execute_hour} --conf spark.driver.maxResultSize=4096m \
--conf spark.sql.adaptive.enabled=false \
--conf spark.executor.cores=4 \
--conf spark.dynamicAllocation.enabled=true \
--conf spark.dynamicAllocation.maxExecutors=200 \
--conf spark.dynamicAllocation.minExecutors=3 \
--queue datalake-de-priority \
--files /data/software/app-datalake/apache-hive-1.2.2-bin/conf/hive-site.xml \
qa_check_fs_media.py $execute_hour
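For reference, this is how the awk field separator 'T|:' carves up the timestamp that Airflow passes in; the value below is only an example:

event_time="2021-10-19T10:09:00+00:00"
echo "$event_time" | awk -F 'T|:' '{print $1}'   # 2021-10-19  -> bizdate
echo "$event_time" | awk -F 'T|:' '{print $2}'   # 10          -> execute_hour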
The qa_check_fs_media.py file
# -*- coding: utf-8 -*-
import sys

from pyspark.sql.session import SparkSession
def get_data_from_hive(spark, hour):
    # Pull per-channel, per-account hourly metrics for the current day, up to the given hour
    sql_str = """
        select ad_channel, ad_account_id, stat_hour,
               sum(cost)/10000 as costs,
               sum(show) as shows,
               sum(click) as clicks,
               (case cast(sum(download) as string) when 'None' then 0 else sum(download) end) as downloads
        from {table}
        where date(dt) = current_date and stat_hour < {hour}
        group by ad_channel, ad_account_id, stat_hour
        order by ad_channel, ad_account_id, cast(stat_hour as int)
    """.format(table='table_name', hour=hour)
    df = spark.sql(sql_str)
    # Key each row by "ad_channel_ad_account_id_stat_hour"; the value holds the remaining metric columns
    results = df.rdd.map(lambda x: [str(x[0]) + '_' + str(x[1]) + '_' + str(x[2]), [x[3:]]]).collect()
    rows_map = {}
    for i in results:
        if i[0] not in rows_map:
            rows_map[i[0]] = list(i[1][0])
        else:
            print('Bad data in the Hive table: the same key has more than one row')
    print(rows_map)
    return rows_map
if __name__ == "__main__":
    hour = sys.argv[1]  # the shell script's argument, i.e. execute_hour
    # enableHiveSupport lets spark.sql() read the Hive table configured via hive-site.xml
    spark = SparkSession.builder.appName("qa_check_realtime_media").enableHiveSupport().getOrCreate()
    get_data_from_hive(spark, hour)
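Assuming a machine where Spark can reach the same Hive metastore (for example, with hive-site.xml placed in $SPARK_CONF_DIR), the check can be smoke-tested for a single hour without going through the DAG or YARN; the hour value is just an example:

# Hypothetical local run of the check for hour 10, outside the Airflow schedule
spark-submit --master local[2] qa_check_fs_media.py 10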