ODPS日常python运维脚本集合
1、获取我们odps项目中的表详细信息,用于数据编目等业务。
2、获取我们项目中的表数据量等基本信息,用于数据对账等等。
Python表字段信息获取
Sql建表语句
-- Result table for the table/column metadata harvesting script below.
-- FIX: table comment read 'odsp_...' — typo for 'odps_...'.
create table if not exists `stg_table_comment_all_df`
(
 `table_schema` string comment '表空间名称'
,`table_name` string comment '表名称'
,`table_comment` string comment '表注释'
,`col_name` string comment '列名称'
,`col_type` string comment '列类型'
,`col_comment` string comment '列注释'
)
comment 'odps_表字段信息获取'
partitioned by
(
 dt string comment '分区'
)
lifecycle 30
;
代码示例
# -*- coding: utf-8 -*-
"""Harvest table and column metadata from ODPS projects into
``stg_table_comment_all_df`` (one row per column), for data cataloguing."""
from odps import ODPS
import datetime
import sys

# Python 2 only: reset the default encoding so Chinese comments are not
# garbled. FIX: the original called importlib.reload(sys) +
# sys.setdefaultencoding, which fails on Py2 (no importlib.reload) AND
# on Py3 (no setdefaultencoding). Guarding and using the Py2 builtin
# reload works on 2 and is a no-op on 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin in Python 2 only
    sys.setdefaultencoding("utf8")

# ODPS connection: access id, access key, default project, endpoint.
# NOTE(review): credentials are hard-coded; prefer environment variables.
o = ODPS('MDCAy***yMiN', 'aV2*****CpJS',
         'MC_CY_YXGL_SOURCE_****',
         'http://service.cn-shanghai-shyp-d01.odps.ops.dataportal.shyp.gov.cn/api')

# Target partition: yesterday, formatted YYYYMMDD.
dt_str = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
# Optional table-name prefix filter (unused with '' — kept for configuration).
ODS_PREFIX = ''
# Accumulated result rows.
wd = []
# Project spaces to scan; add more entries as needed.
project = ['mc_cy_yxgl_source_01', 'mc_cy_yxgl_dwd_01']

# Walk every table of every configured project space.
for pro in project:
    for t in o.list_tables(pro):
        table_name = t.name
        table_comment = t.comment
        # One output row per column.
        for col in t.schema.columns:
            wd.append([pro, table_name, table_comment,
                       col.name, str(col.type), col.comment])

# Rewrite yesterday's partition of the result table atomically:
# drop it if present, then write the fresh snapshot.
sta_table = o.get_table("stg_table_comment_all_df")
sta_table.delete_partition('dt=%s' % dt_str, if_exists=True)
with sta_table.open_writer(partition=('dt=%s' % dt_str), create_partition=True) as writer:
    writer.write(wd)
Python数据量获取脚本
Sql建表语句
-- Result table for the per-table statistics script below (row counts,
-- partition counts, comment coverage) used for data reconciliation.
-- NOTE(review): `star_time` looks like a typo for `start_time`; kept as-is
-- because downstream consumers may already reference the column name.
CREATE TABLE IF NOT EXISTS stg_table_statistics_all_df
(
table_schema STRING COMMENT '表空间名称',
table_name STRING COMMENT '表名',
partition_name STRING COMMENT '最新分区',
chinese_name STRING COMMENT '中文表名',
column_count BIGINT COMMENT '字段数量',
column_comment_null_count BIGINT COMMENT '字段注释缺失数量',
pt_count BIGINT COMMENT '分区数量',
data_count BIGINT COMMENT '最新分区数据量',
star_time STRING COMMENT '开始时间',
end_time STRING COMMENT '结束时间'
)
COMMENT '获取数据表情况统计'
PARTITIONED BY
(
dt STRING
)
LIFECYCLE 180;
代码示例
# -*- coding: utf-8 -*-
"""Collect per-table statistics (column count, missing-comment count,
partition count, latest-partition row count) into
``stg_table_statistics_all_df`` for data reconciliation."""
from odps import ODPS
import datetime
import re
import sys

# Python 2 only: reset the default encoding so Chinese comments are not
# garbled. FIX: the original importlib.reload(sys) + setdefaultencoding
# combination fails on both Py2 and Py3; this guard works on 2 and is a
# no-op on 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin in Python 2 only
    sys.setdefaultencoding("utf8")

# ODPS connection: access id, access key, default project, endpoint.
# NOTE(review): credentials are hard-coded; prefer environment variables.
o = ODPS('MDCAy***PyMiN', 'aV2zYceNK8i****vzmRCpJS',
         'MC_CY_YXGL_SOURCE_****',
         'http://service.cn-shanghai-shyp-d01.odps.ops.dataportal.shyp.gov.cn/api')

# Target partition: yesterday, formatted YYYYMMDD.
dt_str = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
# Legacy tables to exclude entirely.
rid_list = []
# Regex whitelist for business tables ('' matches every name).
pattern = ''
# Accumulated result rows.
wd = []
# Project spaces to scan; add more entries as needed.
project = ['mc_cy_yxgl_source_01', 'mc_cy_yxgl_dwd_01', 'mc_cy_yxgl_dws_01', 'mc_cy_yxgl_dwb_01','mc_cy_yxgl_yqfk']

for pro in project:
    for t in o.list_tables(pro):
        table_name = t.name
        # Skip tables that are not business tables.
        if not re.match(pattern, table_name):
            continue
        # Skip legacy tables.
        if table_name in rid_list:
            continue
        chinese_name = t.comment
        cs = t.schema.columns
        column_count = len(cs)
        # Columns whose comment is empty or the literal string 'null'.
        column_comment_null_count = sum(
            1 for c in cs if c.comment == '' or c.comment == 'null')
        new_pt = ''
        star_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if len(t.schema.partitions) == 0:
            # Non-partitioned table: count every row.
            # FIX: pt_count was never assigned on this path, so the output
            # row either raised NameError (first table) or reused the stale
            # count of the previous table.
            pt_count = 0
            cnt_sql = "select count(1) from %s " % (pro + '.' + table_name)
        else:
            ps = list(t.iterate_partitions())
            pt_count = len(ps)
            if pt_count == 0:
                # Partition columns declared but no partition exists yet.
                cnt_sql = "select 0"
            else:
                # Count only the most recent partition.
                new_pt = str(ps[-1])
                if ',' in new_pt:
                    # Multi-level spec "a=1,b=2" -> where clause "a=1 and b=2".
                    new_pt = new_pt.replace(',', ' and ')
                cnt_sql = "select count(1) from %s where %s" % (pro + '.' + table_name, new_pt)
        with o.execute_sql(cnt_sql).open_reader() as reader:
            data_count = reader[0][0]
        end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        wd.append([pro, table_name, str(new_pt), chinese_name, column_count,
                   column_comment_null_count, pt_count, data_count, star_time, end_time])

# Rewrite yesterday's partition of the result table.
sta_table = o.get_table("stg_table_statistics_all_df")
sta_table.delete_partition('dt=%s' % dt_str, if_exists=True)
with sta_table.open_writer(partition=('dt=%s' % dt_str), create_partition=True) as writer:
    writer.write(wd)
Pyodps 获取项目上下游信息
Sql建表语句
-- Result table for the instance-lineage harvesting script below.
-- FIX: comment typos 'OPDS血源' -> 'ODPS血缘' (lineage).
-- FIX: partition column renamed dt -> pt to match the Python loader,
-- which creates the table with Partition(name='pt') and writes 'pt=...'.
create table if not exists stg_meta_instances_df
(
ins_name string comment '实例名'
,start_tm datetime comment '开始时间'
,end_tm datetime comment '结束时间'
,cost_tm bigint comment '总耗时(秒)'
,status string comment '实例状态'
,ins_owner string comment '作者'
,tsk_name string comment '子任务'
,tbl_in string comment '输入表(以,分割)'
,tbl_out string comment '输出表(以,分割)'
,etl_tm datetime comment 'ETL时间'
)
comment 'ODPS血缘信息表'
partitioned by
(
pt string comment '按日期分区'
)
lifecycle 30
;
#!/usr/local/anaconda3/bin/python3
# -*- coding: utf-8 -*-
# ###########################################################################################
# Runs on:  analytics server
# Script:   get_instance_meta.py
# Schedule: daily at 06:00
# Log:      get_instance_meta.log
# Purpose:  extract today's terminated ODPS instances and their input/output
#           tables (lineage) into stg_meta_instances_df.
# ###########################################################################################
import os
import re
from datetime import datetime, date, time, timedelta

from odps import ODPS
from odps.models import Schema, Column, Partition

start_tm = datetime.now()
today_min = datetime.combine(date.today(), time.min)
# cur_path = os.path.split(os.path.realpath(__file__))[0]
to_table = 'stg_meta_instances_df'

# NOTE(review): live credentials hard-coded in source — rotate these and
# load from environment/config instead.
odps = ODPS('MDCAy5vFzEDPyMiN', 'aV2zYceNK8iNrn31VXZPH9vzmRCpJS',
            'MC_CY_YXGL_SOURCE_01',
            'http://service.cn-shanghai-shyp-d01.odps.ops.dataportal.shyp.gov.cn/api')

columns = [Column(name='ins_name', type='string', comment='实例名'),
           Column(name='start_tm', type='datetime', comment='开始时间'),
           Column(name='end_tm', type='datetime', comment='结束时间'),
           Column(name='cost_tm', type='bigint', comment='总耗时(秒)'),
           Column(name='status', type='string', comment='实例状态'),
           Column(name='ins_owner', type='string', comment='作者'),
           Column(name='tsk_name', type='string', comment='子任务'),
           Column(name='tbl_in', type='string', comment='输入表(以,分割)'),
           Column(name='tbl_out', type='string', comment='输出表(以,分割)'),
           Column(name='etl_tm', type='datetime', comment='ETL时间')]
partitions = [Partition(name='pt', type='string', comment='按日期分区')]
schema = Schema(columns=columns, partitions=partitions)

# Extracts the table name out of a summary key such as "project.table/...".
_TBL_RE = re.compile(r'^[\w].*\.([\w]+).*$')

records = []
try:
    for ins in odps.list_instances(start_time=today_min,
                                   end_time=start_tm,
                                   only_owner=False,
                                   status='Terminated'):
        # Keep only instances that contain a console query task.
        tsk_name_filter = [re.match('console_query_task', tsk) for tsk in ins.get_task_names()]
        try:
            # Summary may be absent; map each task to its 'Outputs' (or the
            # falsy summary itself) so any() below can test for output tables.
            tsk_output_filter = [ins.get_task_summary(tsk) if not ins.get_task_summary(tsk)
                                 else ins.get_task_summary(tsk).get('Outputs')
                                 for tsk in ins.get_task_names()]
        except Exception:
            # FIX: was a bare except; the summary is unavailable — skip.
            continue
        # Only successful instances with both a console task and output tables
        # are interesting: the goal is table-to-table dependency extraction.
        if ins.is_successful() and any(tsk_name_filter) and any(tsk_output_filter):
            # NOTE(review): +8h shifts the API's times to UTC+8 (Shanghai);
            # assumes the service returns naive UTC timestamps — confirm.
            start_time = ins.start_time + timedelta(hours=8)
            end_time = ins.end_time + timedelta(hours=8)
            tbl_in = set()
            tbl_out = set()
            for tsk in ins.get_task_names():
                smy = ins.get_task_summary(tsk)
                # FIX: guard non-matching keys — .match() can return None and
                # the original .group(1) call aborted the whole run.
                for key in smy['Inputs'].keys():
                    m = _TBL_RE.match(key)
                    if m:
                        tbl_in.add(m.group(1))
                for key in smy['Outputs'].keys():
                    m = _TBL_RE.match(key)
                    if m:
                        tbl_out.add(m.group(1))
            records.append([ins.name,
                            start_time.strftime('%Y-%m-%d %H:%M:%S'),
                            end_time.strftime('%Y-%m-%d %H:%M:%S'),
                            (end_time - start_time).seconds,
                            ins.status.value.lower(),
                            ins.owner.split(':')[-1],
                            ','.join(ins.get_task_names()) if ins.get_task_names() else None,
                            ','.join(tbl_in) if tbl_in else None,
                            ','.join(tbl_out) if tbl_out else None,
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S')])
    # Rewrite today's partition with the fresh snapshot.
    partition = '%s=%s' % (partitions[0].name, start_tm.strftime('%Y%m%d'))
    to_tbl = odps.create_table(to_table, schema, if_not_exists=True)
    to_tbl.delete_partition(partition, if_exists=True)
    odps.write_table(to_table, records, partition=partition, create_partition=True)
except Exception:
    # FIX: was a bare except (also swallowed SystemExit/KeyboardInterrupt).
    status = 'failed'
    n = 0
else:
    status = 'succeed'
    n = len(records)
# end_tm = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# log = {'status': status, 'n': n, 'start': start_tm, 'end': end_tm}
# f = open(os.path.join(cur_path, 'get_field_meta.log'), 'a')
# f.write("Update {status} with {n} instances from {start} to {end}\n".format(**log))
# f.close()
Python定期备份Mysql
代码示例
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Back up every non-system MySQL database with mysqldump and prune
backups older than ``out_time`` seconds.

Requires PyMySQL (CPython >= 2.6 or >= 3.3): pip install PyMySQL
"""
# Silence warnings emitted by the MySQL driver.
import warnings
warnings.filterwarnings('ignore')
import pymysql
import os
import time

mysql_host = "127.0.0.1"
mysql_user = "root"
mysql_pwd = "root"
mysql_port = 3306
mysql_charset = "utf8"
# Timestamp (local time) used as the backup file name.
new_date = time.strftime("%Y%m%d%H%M%S")
back_path = "/data/mysql_back/"
out_time = 7 * 24 * 60 * 60  # delete backups older than this many seconds

conn = None
cur = None
try:
    if not os.path.exists(back_path):
        os.makedirs(back_path)
    # Connect to the server; the 'mysql' schema is only used as the
    # default database for the SHOW DATABASES query.
    conn = pymysql.connect(
        host=mysql_host,
        user=mysql_user,
        passwd=mysql_pwd,
        db='mysql',
        port=mysql_port,
        charset=mysql_charset
    )
    cur = conn.cursor()
    cur.execute('show databases')
    data = cur.fetchall()
    for db_names in data:
        for db_name in db_names:
            # Skip system schemas.
            if db_name in ('information_schema', 'performance_schema', 'mysql'):
                continue
            if not os.path.exists(back_path + db_name):
                os.makedirs(back_path + db_name)
            # Per-database backup file: /data/mysql_back/<db>/<timestamp>.sql
            path = back_path + db_name + "/" + new_date + ".sql"
            # NOTE(review): the password on the command line is visible in
            # `ps`; prefer --defaults-extra-file in production.
            os.system("mysqldump -h%s -u%s -p%s %s > %s" % (mysql_host, mysql_user, mysql_pwd, db_name, path))
            # FIX: use local time to match new_date above — the original used
            # gmtime here, so the cutoff was off by the server's UTC offset.
            old_time = time.strftime("%Y%m%d", time.localtime(time.time() - out_time))
            os.system("rm -f %s*.sql" % (back_path + db_name + "/" + old_time))
except Exception as e:
    # FIX: report the actual failure instead of swallowing it silently.
    print("查询失败: %s" % e)
finally:
    # FIX: release cursor/connection even when the backup loop fails.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
执行操作
# Environment prerequisites (FIX: removed the duplicated -y flag)
yum -y install gcc gcc-c++ make zlib* openssl openssl-devel openssl-static
yum install python-setuptools -y
# or: yum install python3-setuptools -y
easy_install pip
# or: easy_install3 pip
# Install PyMySQL
pip install PyMySQL
# FIX: cron executes the script directly, so it must be executable
chmod +x /data/shell/mysql_back.py
# Run manually to verify: /data/shell/mysql_back.py
# Add the cron job
crontab -e
# Append the line below (every 15 minutes; cron already runs jobs in the
# background, so the original trailing '&' was removed)
*/15 * * * * /data/shell/mysql_back.py > /data/shell/logs.log 2>&1
# Format: schedule  path/to/script
# Save and quit with :wq