Running a Python demo on a Spark cluster

A Spark cluster is usually used to process big data, and the jobs typically have to run in a specific order, so an Airflow DAG sits in front to schedule them.

DAG file

# -*- coding: utf-8 -*-
# DAG example
import datetime
from datetime import timedelta

from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.operators.sensors import ExternalTaskSensor


DAG_NAME = 'scc_check_ads_realtime_media'  # DAG name; change it so it starts with your own initials
# BIZ_DATE = '{{ ds }}'  # Airflow macro for the scheduled execution date

default_args = {
    'owner': 'name',
    'depends_on_past': False,
    'start_date': datetime.datetime.strptime("2021-10-18 16:00:00", "%Y-%m-%d %H:%M:%S"),  # date from which scheduling starts
    'retries': 3,
    'email': [],
    'email_on_failure': True,  # send an email notification when a run fails
    'email_on_retry': True,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(DAG_NAME, catchup=False, schedule_interval='09 10 * * *', default_args=default_args)  # cron: run daily at 10:09

# the bash script that this DAG runs
etlCommand = "bash ../qa_execute_realtime_media_check.sh {{ ts }}"

wait_load_doris = ExternalTaskSensor(
    task_id='wait_load_doris',
    external_dag_id='etl_metrics',
    external_task_id='T_app_i_m',
    execution_delta=timedelta(hours=-1),  # this DAG's execution time minus the upstream DAG's execution time (confirm with the DE team)
    dag=dag)

check_ads_realtime_outer = BashOperator(
    task_id='check_ads_realtime_outer',
    execution_timeout=timedelta(minutes=20),
    dag=dag,
    bash_command=etlCommand)


wait_load_doris >> check_ads_realtime_outer
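
Before handing the DAG over to the scheduler, it can be smoke-tested from the command line. A minimal sketch, assuming the Airflow 1.x CLI is available on the host where the DAG file is deployed:

# check that the DAG file parses and is registered
airflow list_dags | grep scc_check_ads_realtime_media

# run a single task for one execution date, without recording state in the metadata DB
airflow test scc_check_ads_realtime_media check_ads_realtime_outer 2021-10-19

# or trigger a full run of the whole DAG
airflow trigger_dag scc_check_ads_realtime_media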

qa_execute_realtime_media_check.sh file

#!/usr/bin/env bash

source /etc/profile
export HADOOP_USER_NAME=xxx   # group account; submit jobs as this account so they have the required access, and logs end up in its directory

set -o errexit
set -o nounset
# event_time is the first argument passed to this script (the {{ ts }} timestamp from the DAG);
# if no argument is given, fall back to yesterday's date
if [ $# -eq 0 ]; then
    event_time=$(date +"%Y-%m-%d" -d "-1 days")
else
    event_time=$1
fi

# split the ISO timestamp on 'T' and ':' -> date part and hour part
bizdate=$(echo "$event_time" | awk -F 'T|:' '{print $1}')
execute_hour=$(echo "$event_time" | awk -F 'T|:' '{print $2}')


echo $execute_hour
#export SPARK_HOME=/data/software/app/spark-2.2


spark-submit --master yarn \
    --deploy-mode cluster \
    --name qa_check_realtime_media_${execute_hour}   --conf spark.driver.maxResultSize=4096m \
    --conf spark.sql.adaptive.enabled=false \
    --conf spark.executor.cores=4 \
    --conf spark.dynamicAllocation.enabled=true \
    --conf spark.dynamicAllocation.maxExecutors=200 \
    --conf spark.dynamicAllocation.minExecutors=3  \
    --queue datalake-de-priority  \
    --files /data/software/app-datalake/apache-hive-1.2.2-bin/conf/hive-site.xml   \
    qa_check_fs_media.py  $execute_hour
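
The script can also be run by hand to verify the timestamp parsing before it is wired into the DAG. A quick sketch, assuming the same ISO-style {{ ts }} timestamp that Airflow passes in:

# bizdate becomes 2021-10-19 and execute_hour becomes 10,
# so the Spark job is named qa_check_realtime_media_10
bash qa_execute_realtime_media_check.sh 2021-10-19T10:09:00+00:00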

qa_check_fs_media.py file

# -*- coding: utf-8 -*-
import sys

from pyspark.sql.session import SparkSession


def get_data_from_hive(spark, hour):
    # aggregate cost / show / click / download per (ad_channel, ad_account_id, stat_hour)
    # for today's partition, up to (but not including) the given hour
    sql_str = """
    select ad_channel, ad_account_id, stat_hour,
           sum(cost) / 10000 as costs,
           sum(show) as shows,
           sum(click) as clicks,
           (case cast(sum(download) as string) when 'None' then 0 else sum(download) end) as downloads
    from {table}
    where date(dt) = current_date and stat_hour < {hour}
    group by ad_channel, ad_account_id, stat_hour
    order by ad_channel, ad_account_id, cast(stat_hour as int)
    """.format(table='table_name', hour=hour)
    df = spark.sql(sql_str)
    # key: "ad_channel_ad_account_id_stat_hour", value: [costs, shows, clicks, downloads]
    results = df.rdd.map(lambda x: [str(x[0]) + '_' + str(x[1]) + '_' + str(x[2]), [x[3:]]]).collect()
    rows_map = {}
    for i in results:
        if i[0] not in rows_map:
            rows_map[i[0]] = list(i[1][0])
        else:
            print('Bad data in the Hive table: the same key maps to more than one row')
    print(rows_map)
    return rows_map

if __name__ == "__main__":
    hour = sys.argv[1]  # the argument passed from the shell script, i.e. execute_hour
    # enable Hive support so that spark.sql can read the Hive table
    spark = SparkSession.builder.appName("qa_check_realtime_media").enableHiveSupport().getOrCreate()
    get_data_from_hive(spark, hour)
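
For a quick functional check without going through YARN, the same script can be submitted in local mode. A sketch, assuming a local Spark installation whose conf directory already contains a hive-site.xml pointing at the Hive metastore (otherwise spark.sql cannot resolve the table):

# run the check for stat_hour < 10 on local Spark, printing rows_map to stdout
spark-submit --master "local[2]" qa_check_fs_media.py 10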