linux环境下用python读取hive数据并写入到excel(定时调度、自定义列宽)

功能点:
1、调用自定义类,实现对excel格式的调整

  • 内嵌定时功能,实现定时执行
  • 可对标题等进行颜色标注
  • 可实现自定一列宽
  • 可实现数据居左、居右、据中对齐方式
  • 并将生成的excel通过邮件发送
# -*- coding:utf-8 -*-
"""
定时任务
多线程,每日执行
"""
from pyhive import hive
import pandas as pd
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.header import Header
from datetime import datetime
from dateutil.relativedelta import relativedelta
from functools import wraps
import time
import os
from multiprocessing import Process
import openpyxl
from openpyxl.styles import Alignment
from openpyxl.styles import PatternFill
from openpyxl.styles import Font
from openpyxl import Workbook, load_workbook
from os import remove, path

def get_delta_time(mode, times):
    """
    :param mode: 三种模式:daily, monthly, weekly
    :param times:
        当mode为daily时,times: [hour, minute, second];mode为monthly时,times: [monthday, hour, minute, second],mode为weekly时,times: [weekday, hour, minute, second];
        其中weekday为周一~周天: 1,2,3,4,5,6,0。
    :return:最近的执行时间,倒计时
    """
    current_time = datetime.now()
    if mode == "daily":
        # 如果目标时间晚于当前时间
        time_str = "{} {}:{}:{}".format(current_time.date(), *times)
        target_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        if target_time < current_time:
            # 如果目标时间早于当前时间,获取下一天的时间
            next_time = (current_time + relativedelta(days=1))
            time_str = "{} {}:{}:{}".format(next_time.date(), *times)
            target_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    elif mode == "monthly":
        time_str = '{}-{}-{} {}:{}:{}'.format(current_time.year, current_time.month, *times)
        target_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        if target_time < current_time:
            next_time = (current_time + relativedelta(months=1))
            time_str = '{}-{}-{} {}:{}:{}'.format(next_time.year, next_time.month, *times)
            target_time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
    elif mode == "weekly":
        time_str = '{}-{}-{} {}:{}:{}'.format(current_time.year, current_time.strftime('%W'), *times)
        target_time = datetime.strptime(time_str, '%Y-%W-%w %H:%M:%S')
        if target_time < current_time:
            next_time = (current_time + relativedelta(weeks=1))
            time_str = '{}-{}-{} {}:{}:{}'.format(next_time.year, next_time.strftime('%W'), *times)
            target_time = datetime.strptime(time_str, '%Y-%W-%w %H:%M:%S')
    time_delta = (target_time - current_time).total_seconds()  # 时间差
    return target_time, time_delta


def days_schedule(mode, times):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            while True:
                target_time, time_delta = get_delta_time(mode, times)
                print("{}下一次执行时间: {}".format(func.__name__, target_time))
                time.sleep(time_delta)
                func(*args, **kwargs)
            return 1
        return wrapper
    return decorator

class XlsxSaver:
    """
    一个将DataFrame转换成格式化excel的工具
    """
    def __init__(self, df_in, filename='a.xlsx', sheet_name='Sheet1'):
        """
        df_in : 从一个DataFrame对象获取表格内容
        filename : 文件名
        sheet_name : 表名
        """
        self.filename = filename  # 保存的xlsx文件的名字
        self.user_def = []  # 储存由用户自定义的列的列名,这些列不再参与自动计算列宽
        if path.exists(filename):
            # 如果文件存在,就直接打开,添加Sheet
            self.wb = load_workbook(filename)
            self.sheet = self.wb.create_sheet(sheet_name)
        else:
            # 如果文件不存在,就创建表格
            self.wb = Workbook()
            self.sheet = self.wb.active
            self.sheet.title = sheet_name
        # 将df的内容复制给sheet
        self.df = df_in.copy()
        self.sheet.append(list(self.df.columns))
        for row in range(0, len(list(self.df.index))):
            for col in range(0, len(list(self.df.columns))):
                self.sheet.cell(row+2, col+1).value = self.df.iloc[row, col]  # 注意:sheet行列从1开始计数

    def remove_file(self):
        remove(self.filename)

    def set_sheet_name(self, sheet_name):
        self.sheet.title = sheet_name

    def set_filename(self, filename):
        self.filename = filename

    def get_maxlength(self, series_in, col):
        """
        获取一个类型为object的Series中的最大占位长度,用于确定导出的xlsx文件的列宽
        col : 表头,也参与比较,解决有时候表头过长的问题
        """
        series = series_in.fillna('-')  # 填充空值,防止出现nan
        str_list = list(series)
        len_list = []
        for elem in str_list + [col]:
            elem_split = list(elem)
            length = 0
            for c in elem_split:
                if ord(c) <= 256:
                    length += 1
                else:
                    length += 2
            len_list.append(length)
        return max(len_list)

    def __auto_width(self):
        cols_list = list(self.df.columns)  # 获取列名
        for i in range(0, len(cols_list)):
            col = cols_list[i]
            if col in self.user_def:
                continue
            self.sheet.cell(1, i+1).font = Font(bold=True)  # 加粗表头
            letter = chr(i+65)  # 由ASCII值获得对应的列字母
            max_len = self.get_maxlength(self.df[col].astype(str), col)
            if max_len <= 12:
                self.sheet.column_dimensions[letter].width = 12
            elif max_len <= 50:
                self.sheet.column_dimensions[letter].width = max_len + 2
            else:
                self.sheet.column_dimensions[letter].width = 50
                for cell in self.sheet[letter]:
                    cell.alignment = Alignment(horizontal='left',wrap_text=True)

    def set_width(self, col_name, width):
        # 提供调整列宽的接口
        index = list(self.df.columns).index(col_name)
        letter = chr(index+65)
        self.sheet.column_dimensions[letter].width = width
        self.user_def.append(col_name)

    def set_color(self, col_name, color, rule):
        # 提供设置颜色的接口,rule:规则函数
        index = list(self.df.columns).index(col_name)
        letter = chr(index+65)
        for cell in self.sheet[letter]:
            if rule(cell.value):
                cell.fill = PatternFill(fill_type="solid", start_color=color, end_color=color)

    def set_center_alignment(self, col_name):
        index = list(self.df.columns).index(col_name)
        letter = chr(index+65)
        for cell in self.sheet[letter]:
            cell.alignment = Alignment(wrap_text=True, horizontal='left')
    def save(self):
        # 自动调整列宽,并保存
        self.__auto_width()
        self.wb.save(self.filename)

def get_data_from_hive_1(sql):
    # 连接hive获取数据
    conn = hive.Connection(host='oser406436.cn.wal-mart.com',
                           port=10000,
                           username='svccnahahs',
                           database='cn_ec_wmt_dl_secure',
                           auth='KERBEROS',
                           kerberos_service_name='hive'
                           )
    result = pd.read_sql(sql, conn)
    return result

def get_data_from_hive_2(sql):
    # 连接hive获取数据
    conn = hive.Connection(host='oser406436.cn.wal-mart.com',
                           port=10000,
                           username='svccnahahs',
                           database='cn_ods_aloha_order',
                           auth='KERBEROS',
                           kerberos_service_name='hive'
                           )
    result = pd.read_sql(sql, conn)
    return result

def get_sql(yesterday_str):
    with open('./sql_file/check_flash_data.sql','r') as f:
        sql01 = f.read()
    sql01 = sql01 % (yesterday_str)
    with open('./sql_file/check_ec_data.sql','r') as f:
        sql02 = f.read()
    with open('./sql_file/check_ods_data.sql','r') as f:
        sql03 = f.read()
    with open('./sql_file/check_divide_data.sql','r') as f:
        sql04 = f.read()
    return sql01,sql02,sql03,sql04


def send_mail(smtp, from_addr, to_addrs, title, content, attachments):
    message = MIMEMultipart()
    # 邮件内容
    message.attach(MIMEText(content, 'plain', 'utf-8'))
    # 邮件标题
    message['Subject'] = Header(title, 'utf-8')
    message['To'] = ';'.join(to_addrs)

    # 添加附件
    if len(attachments) > 0:
        for att_file in attachments:
            att = MIMEText(open(att_file, 'rb').read(), 'base64', 'utf-8')
            att["Content-Type"] = 'application/octet-stream'
            att["Content-Disposition"] = "attachment; filename="+att_file
            message.attach(att)

    # 发送邮件
    try:
        smtpObj = smtplib.SMTP(host=smtp["host"], port=smtp["port"])
        smtpObj.sendmail(
            from_addr=from_addr,
            to_addrs=to_addrs,
            msg=message.as_string()
        )
        print("邮件发送成功")
    except smtplib.SMTPException:
        print("Error: 无法发送邮件")

def get_data(nowTime,yesterday_str):
    file_name = './{}_check_data.xlsx'.format(nowTime)
    rename_dic = {'t_type': '数据源', 'ts': '分区时间ts', 'count': '数据量',
        'upd_ts': '数据加载时间', 'quy_time': '查询时间', 'tb_name':'数据来源',
        'ts_t_2':'ts_t_2', 'etl_load_time_t_2':'etl_load_time_t_2', 'cnt_t_2':'cnt_t_2',
        'ts_t_1':'ts_t_1', 'etl_t_1':'etl_t_1', 'cnt_t_1':'cnt_t_1','per':'per','load_ts':'数据更新时间','upd_ts':'数据更新时间','ts':'分区时间ts'
    }

    if os.path.exists(file_name):  # 如果文件存在
    # 删除文件,可使用以下两种方法。
       os.remove(file_name)

    sql01,sql02,sql03,sql04 = get_sql(yesterday_str)
    print("开始获取第一模块数据....")
    result01 = get_data_from_hive_1(sql01)
    result01.rename(rename_dic,axis='columns',inplace=True)
    xlsx1 = XlsxSaver(result01, file_name , 'flashreport汇总表数据')
    xlsx1.set_center_alignment('cnt_t_2')
    xlsx1.set_center_alignment('per')
    xlsx1.set_center_alignment('数据更新时间')
    xlsx1.set_center_alignment('query_time')
    xlsx1.save()
    print("第一模块完成")

    print("开始获取第二模块数据......")
    result02 = get_data_from_hive_1(sql02)
    result02.rename(rename_dic,axis='columns',inplace=True)
    xlsx2 = XlsxSaver(result02, file_name , 'ec模型数据')
    xlsx2.set_center_alignment('数据更新时间')
    xlsx2.set_center_alignment('数据量')
    xlsx2.set_center_alignment('查询时间')
    xlsx2.save()

    print("开始获取第三模块数据......")
    result03 = get_data_from_hive_1(sql03)
    result03.rename(rename_dic,axis='columns',inplace=True)
    xlsx3 = XlsxSaver(result03, file_name , 'ods层源表数据')
    xlsx3.set_center_alignment('数据量')
    xlsx3.set_center_alignment('数据更新时间')
    xlsx3.set_center_alignment('查询时间')
    xlsx3.save()

    print("开始获取第四模块数据......")
    result04 = get_data_from_hive_2(sql04)
    result04.rename(rename_dic,axis='columns',inplace=True)
    xlsx4 = XlsxSaver(result04, file_name , 'flash四张表数据')
    xlsx4.set_center_alignment('数据量')
    xlsx4.set_center_alignment('数据更新时间')
    xlsx4.set_center_alignment('查询时间')
    xlsx4.save()

    print("数据获取完成")

def send_main(nowTime,yesterday):

    yesterday_str = yesterday.strftime('%Y-%m-%d')
    yesterday_month = yesterday.month
    yesterday_day = yesterday.day
    file_name = './{}_check_data.xlsx'.format(nowTime)

    path ='./{}_check_data.xlsx'.format(yesterday_str)  # 文件路径
    if os.path.exists(path):  # 如果文件存在
    # 删除文件,可使用以下两种方法。
       os.remove(path)
    #os.unlink(path)
    else:
       print('no such file:%s_check_data.xlsx' %yesterday_str)  # 则返回文件不存在

    smtp = {'host': "xxxcom", 'port': 25}
    from_addr = 'xxx@wal-mart.com'
    to_addrs=['xxx@sskjdata.com']
    title = '%s月%s日flashreport数据监控邮件' % (yesterday_month,yesterday_day)
    content = 'Hi all,\n     附件是%s月%s日flashreport数据监控,请查收。' % (yesterday_month,yesterday_day)
    attachments = [file_name]

    send_mail(smtp, from_addr, to_addrs, title, content, attachments)

@days_schedule("daily", [17,21,20])
def main():
    nowTime=datetime.now().strftime('%Y-%m-%d')
    print('nowTime:%s' % nowTime)
    yesterday = datetime.now()+relativedelta(days=-1)
    print('yesterday:%s' % yesterday)
    yesterday_str = yesterday.strftime('%Y-%m-%d')
    # yesterday = datetime.now()+relativedelta(days=-1)
    # yesterday_str = yesterday.strftime('%Y-%m-%d')
    print('yesterday_str:',yesterday_str)
    print('日期:',nowTime)
    get_data(nowTime,yesterday_str)
    print("开始发送邮件")
    send_main(nowTime,yesterday)
    print("邮件发送完成")


if __name__ == '__main__':
    Process(target=main).start()
  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值