# Use Airflow to pull data from DataHub into the DB, modify it, package it as a
# CSV and upload it to MinIO, and override the task-success email notification.
import smtplib
import pendulum
from email.mime.text import MIMEText
from email.header import Header
from airflow import DAG
from airflow.utils.dates import days_ago
from datahub.operators.datahub_to_rdb import DatahubToRDBOperator
from datetime import datetime, timedelta
from airflow.operators.python import PythonOperator
from airflow.operators.dummy import DummyOperator
from airflow.providers.mysql.operators.mysql import MySqlOperator
import pandas as pd
import boto3
from airflow.hooks.mysql_hook import MySqlHook
import tempfile
from airflow.exceptions import AirflowException
import math
def readdata():
    """Export the full ``employeeinfo`` table to CSV and upload it to MinIO.

    Reads the whole table through the configured MySQL connection, writes it
    to a temporary CSV file (``utf_8_sig`` = UTF-8 with BOM so Excel opens it
    correctly), then uploads it to the ``uploadfiles`` bucket as ``test1.csv``.

    Raises:
        AirflowException: if reading the table or uploading the file fails.
    """
    mysql_hook = MySqlHook(mysql_conn_id="数据库连接id")
    sqlalchemy_engine = mysql_hook.get_sqlalchemy_engine()
    try:
        df = pd.read_sql("SELECT * FROM employeeinfo ", sqlalchemy_engine)
        print(df.head())
        print('--------------')
        # df.info() prints its report itself and returns None; the original
        # print(df.info()) emitted a spurious "None" line.
        df.info()
        # Write to a temp file instead of the worker's CWD ("test.csv") so
        # concurrent runs don't clobber each other and no file is left behind.
        with tempfile.NamedTemporaryFile(suffix=".csv") as tmp:
            df.to_csv(tmp.name, encoding='utf_8_sig', index=False)
            s3_resource = boto3.resource(
                's3',
                endpoint_url='minio的网址',
                aws_access_key_id='minio账号',
                aws_secret_access_key='minio密码',
                region_name='us-east-1')
            my_object_key_01 = "test1.csv"
            s3_resource.Object('uploadfiles', my_object_key_01).upload_file(tmp.name)
    except Exception as e:
        raise AirflowException(f"read table to dataframe fail:{e}")
def send_email(mail_msg):
    """Send *mail_msg* as an HTML alert mail through the internal SMTP relay.

    Best-effort: SMTP protocol failures are logged and swallowed so that a
    broken mail server never fails the task itself.

    Args:
        mail_msg: HTML body of the notification mail.
    """
    mail_host = "邮件服务器"
    sender = 'airflow@mail.com'
    receivers = ['xxxx@mail.com']
    message = MIMEText(mail_msg, 'html', 'utf-8')
    message['From'] = Header("airflow")
    message['To'] = Header("xxxx")
    subject = 'Airflow 任务告警'
    message['Subject'] = Header(subject, 'utf-8')
    try:
        # The context manager guarantees QUIT/close even when sendmail raises;
        # the original never closed the connection and leaked the socket.
        with smtplib.SMTP(mail_host, 25) as smtp_obj:
            smtp_obj.sendmail(sender, receivers, message.as_string())
    except smtplib.SMTPException:
        print("Error: 无法发送邮件")
def send_email_fun(msg):
    """Thin indirection kept for callers; forwards *msg* to :func:`send_email`."""
    return send_email(msg)
def success_callback(context):
    """Airflow ``on_success_callback``: mail an HTML success notice.

    Args:
        context: Airflow task context dict supplied by the scheduler; only
            ``context['dag']`` and ``context['task_instance']`` are read.
    """
    dag_id = context['dag'].dag_id
    task_id = context['task_instance'].task_id
    # Use a timezone-aware clock instead of the old "+28800 seconds" epoch
    # hack, which only yielded Shanghai time when the host clock was UTC.
    current_time = pendulum.now("Asia/Shanghai").to_datetime_string()
    mail_msg = f"""
    <p>日期:{current_time}</p>
    <p>DAG ID:{dag_id}...</p>
    <p>Task id:{task_id}...</p>
    <P>SUCCESS</P>
    """
    send_email_fun(mail_msg)
# The DAG's clock is pinned to Asia/Shanghai; start_date is timezone-aware.
local_tz = pendulum.timezone("Asia/Shanghai")
start_date = datetime(2022, 7, 14, 15, 55, 19, tzinfo=local_tz)

# SQL executed by the sql_update task below.
sqls = """
update employeeinfo SET deptid='xxx'
WHERE emplid='xxxx';
"""

# Defaults inherited by every task in this DAG.
default_args = {
    'owner': 'xxxx',
    'email_on_failure': True,  # mail on task failure via Airflow's own SMTP
    'email': ['xxxx@mail.com'],
    'start_date': start_date,
    'on_success_callback': success_callback,  # custom success mail (above)
}
# DAG definition.  NOTE(review): the original also passed
# start_date=days_ago(2) here, which silently overrode the explicit
# timezone-aware start_date in default_args; days_ago() is deprecated in
# Airflow 2.  The default_args value is authoritative now.
dag = DAG(
    dag_id='dag的id,自定义',
    schedule_interval=None,    # manual / externally triggered only
    catchup=False,             # never backfill missed intervals
    default_args=default_args,
    access_control={
        # Grant the tenant role read + edit on this DAG in the UI.
        '租户id': {'can_read', 'can_edit'}
    },
    tags=['test'])
# Land the subscribed DataHub topic into the MySQL table `employeeinfo`.
con_whq_s3tohub_test_task = DatahubToRDBOperator(
    task_id='employee_test',
    dag=dag,
    rdb_type='mysql',
    rdb_conn_id='数据库连接id',
    table_name='employeeinfo',
    datahub_conn_id='datahub的id',
    datahub_topic='订阅的topic',
)
# Apply the deptid correction defined in `sqls` against the same MySQL DB.
sql_update = MySqlOperator(
    dag=dag,
    task_id='sql_update',
    mysql_conn_id='数据库连接id',
    sql=sqls,
    autocommit=True,  # commit the UPDATE immediately
)
def print_success(**context):
    """Log a marker line so the task log shows the notification step ran."""
    print("Mail is send success")


# Renamed from `success_callback`: the original operator variable shadowed the
# module-level success_callback() function referenced by default_args.
# provide_context=True was dropped -- it is removed in Airflow 2, where the
# context is passed to python_callable automatically.
send_email_task = PythonOperator(
    task_id='send_email',
    python_callable=print_success,
    dag=dag,
)

getdata = PythonOperator(
    task_id='getdata',
    python_callable=readdata,
    dag=dag,
)

# Pipeline: ingest from DataHub -> fix data -> export CSV to MinIO -> notify.
con_whq_s3tohub_test_task >> sql_update >> getdata >> send_email_task