ODPS日常python运维脚本集合
1、获取我们odps项目中的表详细信息,用于数据编目等业务。
2、获取我们项目中的表数据量等基本信息,用于数据对账等等。
Python表字段信息获取
Sql建表语句
-- Result table for the table/column metadata harvesting script below.
-- FIX: table comment read 'odsp_...' — typo for 'odps_...'.
create table if not exists `stg_table_comment_all_df`
(
 `table_schema` string comment '表空间名称'
,`table_name` string comment '表名称'
,`table_comment` string comment '表注释'
,`col_name` string comment '列名称'
,`col_type` string comment '列类型'
,`col_comment` string comment '列注释'
)
comment 'odps_表字段信息获取'
partitioned by
(
 dt string comment '分区'
)
lifecycle 30
;
代码示例
# -*- coding: utf-8 -*-
"""Harvest table and column metadata from ODPS projects into
``stg_table_comment_all_df`` (one row per column), for data cataloguing."""
from odps import ODPS
import datetime
import sys

# Python 2 only: reset the default encoding so Chinese comments are not
# garbled. FIX: the original called importlib.reload(sys) +
# sys.setdefaultencoding, which fails on Py2 (no importlib.reload) AND
# on Py3 (no setdefaultencoding). Guarding and using the Py2 builtin
# reload works on 2 and is a no-op on 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin in Python 2 only
    sys.setdefaultencoding("utf8")

# ODPS connection: access id, access key, default project, endpoint.
# NOTE(review): credentials are hard-coded; prefer environment variables.
o = ODPS('MDCAy***yMiN', 'aV2*****CpJS',
         'MC_CY_YXGL_SOURCE_****',
         'http://service.cn-shanghai-shyp-d01.odps.ops.dataportal.shyp.gov.cn/api')

# Target partition: yesterday, formatted YYYYMMDD.
dt_str = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
# Optional table-name prefix filter (unused with '' — kept for configuration).
ODS_PREFIX = ''
# Accumulated result rows.
wd = []
# Project spaces to scan; add more entries as needed.
project = ['mc_cy_yxgl_source_01', 'mc_cy_yxgl_dwd_01']

# Walk every table of every configured project space.
for pro in project:
    for t in o.list_tables(pro):
        table_name = t.name
        table_comment = t.comment
        # One output row per column.
        for col in t.schema.columns:
            wd.append([pro, table_name, table_comment,
                       col.name, str(col.type), col.comment])

# Rewrite yesterday's partition of the result table atomically:
# drop it if present, then write the fresh snapshot.
sta_table = o.get_table("stg_table_comment_all_df")
sta_table.delete_partition('dt=%s' % dt_str, if_exists=True)
with sta_table.open_writer(partition=('dt=%s' % dt_str), create_partition=True) as writer:
    writer.write(wd)
Python数据量获取脚本
Sql建表语句
-- Result table for the per-table statistics script below (row counts,
-- partition counts, comment coverage) used for data reconciliation.
-- NOTE(review): `star_time` looks like a typo for `start_time`; kept as-is
-- because downstream consumers may already reference the column name.
CREATE TABLE IF NOT EXISTS stg_table_statistics_all_df
(
table_schema STRING COMMENT '表空间名称',
table_name STRING COMMENT '表名',
partition_name STRING COMMENT '最新分区',
chinese_name STRING COMMENT '中文表名',
column_count BIGINT COMMENT '字段数量',
column_comment_null_count BIGINT COMMENT '字段注释缺失数量',
pt_count BIGINT COMMENT '分区数量',
data_count BIGINT COMMENT '最新分区数据量',
star_time STRING COMMENT '开始时间',
end_time STRING COMMENT '结束时间'
)
COMMENT '获取数据表情况统计'
PARTITIONED BY
(
dt STRING
)
LIFECYCLE 180;
代码示例
# -*- coding: utf-8 -*-
"""Collect per-table statistics (column count, missing-comment count,
partition count, latest-partition row count) into
``stg_table_statistics_all_df`` for data reconciliation."""
from odps import ODPS
import datetime
import re
import sys

# Python 2 only: reset the default encoding so Chinese comments are not
# garbled. FIX: the original importlib.reload(sys) + setdefaultencoding
# combination fails on both Py2 and Py3; this guard works on 2 and is a
# no-op on 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin in Python 2 only
    sys.setdefaultencoding("utf8")

# ODPS connection: access id, access key, default project, endpoint.
# NOTE(review): credentials are hard-coded; prefer environment variables.
o = ODPS('MDCAy***PyMiN', 'aV2zYceNK8i****vzmRCpJS',
         'MC_CY_YXGL_SOURCE_****',
         'http://service.cn-shanghai-shyp-d01.odps.ops.dataportal.shyp.gov.cn/api')

# Target partition: yesterday, formatted YYYYMMDD.
dt_str = (datetime.datetime.now() + datetime.timedelta(days=-1)).strftime('%Y%m%d')
# Legacy tables to exclude entirely.
rid_list = []
# Regex whitelist for business tables ('' matches every name).
pattern = ''
# Accumulated result rows.
wd = []
# Project spaces to scan; add more entries as needed.
project = ['mc_cy_yxgl_source_01', 'mc_cy_yxgl_dwd_01', 'mc_cy_yxgl_dws_01', 'mc_cy_yxgl_dwb_01','mc_cy_yxgl_yqfk']

for pro in project:
    for t in o.list_tables(pro):
        table_name = t.name
        # Skip tables that are not business tables.
        if not re.match(pattern, table_name):
            continue
        # Skip legacy tables.
        if table_name in rid_list:
            continue
        chinese_name = t.comment
        cs = t.schema.columns
        column_count = len(cs)
        # Columns whose comment is empty or the literal string 'null'.
        column_comment_null_count = sum(
            1 for c in cs if c.comment == '' or c.comment == 'null')
        new_pt = ''
        star_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if len(t.schema.partitions) == 0:
            # Non-partitioned table: count every row.
            # FIX: pt_count was never assigned on this path, so the output
            # row either raised NameError (first table) or reused the stale
            # count of the previous table.
            pt_count = 0
            cnt_sql = "select count(1) from %s " % (pro + '.' + table_name)
        else:
            ps = list(t.iterate_partitions())
            pt_count = len(ps)
            if pt_count == 0:
                # Partition columns declared but no partition exists yet.
                cnt_sql = "select 0"
            else:
                # Count only the most recent partition.
                new_pt = str(ps[-1])
                if ',' in new_pt:
                    # Multi-level spec "a=1,b=2" -> where clause "a=1 and b=2".
                    new_pt = new_pt.replace(',', ' and ')
                cnt_sql = "select count(1) from %s where %s" % (pro + '.' + table_name, new_pt)
        with o.execute_sql(cnt_sql).open_reader() as reader:
            data_count = reader[0][0]
        end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        wd.append([pro, table_name, str(new_pt), chinese_name, column_count,
                   column_comment_null_count, pt_count, data_count, star_time, end_time])

# Rewrite yesterday's partition of the result table.
sta_table = o.get_table("stg_table_statistics_all_df")
sta_table.delete_partition('dt=%s' % dt_str, if_exists=True)
with sta_table.open_writer(partition=('dt=%s' % dt_str), create_partition=True) as writer:
    writer.write(wd)
Pyodps 获取项目上下游信息
Sql建表语句
-- Result table for the instance-lineage harvesting script below.
-- FIX: comment typos 'OPDS血源' -> 'ODPS血缘' (lineage).
-- FIX: partition column renamed dt -> pt to match the Python loader,
-- which creates the table with Partition(name='pt') and writes 'pt=...'.
create table if not exists stg_meta_instances_df
(
ins_name string comment '实例名'
,start_tm datetime comment '开始时间'
,end_tm datetime comment '结束时间'
,cost_tm bigint comment '总耗时(秒)'
,status string comment '实例状态'
,ins_owner string comment '作者'
,tsk_name string comment '子任务'
,tbl_in string comment '输入表(以,分割)'
,tbl_out string comment '输出表(以,分割)'
,etl_tm datetime comment 'ETL时间'
)
comment 'ODPS血缘信息表'
partitioned by
(
pt string comment '按日期分区'
)
lifecycle 30
;
#!/usr/local/anaconda3/bin/python3
# -*- coding: utf-8 -*-
# ###########################################################################################
# Runs on:  analytics server
# Script:   get_instance_meta.py
# Schedule: daily at 06:00
# Log:      get_instance_meta.log
# Purpose:  extract today's terminated ODPS instances and their input/output
#           tables (lineage) into stg_meta_instances_df.
# ###########################################################################################
import os
import re
from datetime import datetime, date, time, timedelta

from odps import ODPS
from odps.models import Schema, Column, Partition

start_tm = datetime.now()
today_min = datetime.combine(date.today(), time.min)
# cur_path = os.path.split(os.path.realpath(__file__))[0]
to_table = 'stg_meta_instances_df'

# NOTE(review): live credentials hard-coded in source — rotate these and
# load from environment/config instead.
odps = ODPS('MDCAy5vFzEDPyMiN', 'aV2zYceNK8iNrn31VXZPH9vzmRCpJS',
            'MC_CY_YXGL_SOURCE_01',
            'http://service.cn-shanghai-shyp-d01.odps.ops.dataportal.shyp.gov.cn/api')

columns = [Column(name='ins_name', type='string', comment='实例名'),
           Column(name='start_tm', type='datetime', comment='开始时间'),
           Column(name='end_tm', type='datetime', comment='结束时间'),
           Column(name='cost_tm', type='bigint', comment='总耗时(秒)'),
           Column(name='status', type='string', comment='实例状态'),
           Column(name='ins_owner', type='string', comment='作者'),
           Column(name='tsk_name', type='string', comment='子任务'),
           Column(name='tbl_in', type='string', comment='输入表(以,分割)'),
           Column(name='tbl_out', type='string', comment='输出表(以,分割)'),
           Column(name='etl_tm', type='datetime', comment='ETL时间')]
partitions = [Partition(name='pt', type='string', comment='按日期分区')]
schema = Schema(columns=columns, partitions=partitions)

# Extracts the table name out of a summary key such as "project.table/...".
_TBL_RE = re.compile(r'^[\w].*\.([\w]+).*$')

records = []
try:
    for ins in odps.list_instances(start_time=today_min,
                                   end_time=start_tm,
                                   only_owner=False,
                                   status='Terminated'):
        # Keep only instances that contain a console query task.
        tsk_name_filter = [re.match('console_query_task', tsk) for tsk in ins.get_task_names()]
        try:
            # Summary may be absent; map each task to its 'Outputs' (or the
            # falsy summary itself) so any() below can test for output tables.
            tsk_output_filter = [ins.get_task_summary(tsk) if not ins.get_task_summary(tsk)
                                 else ins.get_task_summary(tsk).get('Outputs')
                                 for tsk in ins.get_task_names()]
        except Exception:
            # FIX: was a bare except; the summary is unavailable — skip.
            continue
        # Only successful instances with both a console task and output tables
        # are interesting: the goal is table-to-table dependency extraction.
        if ins.is_successful() and any(tsk_name_filter) and any(tsk_output_filter):
            # NOTE(review): +8h shifts the API's times to UTC+8 (Shanghai);
            # assumes the service returns naive UTC timestamps — confirm.
            start_time = ins.start_time + timedelta(hours=8)
            end_time = ins.end_time + timedelta(hours=8)
            tbl_in = set()
            tbl_out = set()
            for tsk in ins.get_task_names():
                smy = ins.get_task_summary(tsk)
                # FIX: guard non-matching keys — .match() can return None and
                # the original .group(1) call aborted the whole run.
                for key in smy['Inputs'].keys():
                    m = _TBL_RE.match(key)
                    if m:
                        tbl_in.add(m.group(1))
                for key in smy['Outputs'].keys():
                    m = _TBL_RE.match(key)
                    if m:
                        tbl_out.add(m.group(1))
            records.append([ins.name,
                            start_time.strftime('%Y-%m-%d %H:%M:%S'),
                            end_time.strftime('%Y-%m-%d %H:%M:%S'),
                            (end_time - start_time).seconds,
                            ins.status.value.lower(),
                            ins.owner.split(':')[-1],
                            ','.join(ins.get_task_names()) if ins.get_task_names() else None,
                            ','.join(tbl_in) if tbl_in else None,
                            ','.join(tbl_out) if tbl_out else None,
                            datetime.now().strftime('%Y-%m-%d %H:%M:%S')])
    # Rewrite today's partition with the fresh snapshot.
    partition = '%s=%s' % (partitions[0].name, start_tm.strftime('%Y%m%d'))
    to_tbl = odps.create_table(to_table, schema, if_not_exists=True)
    to_tbl.delete_partition(partition, if_exists=True)
    odps.write_table(to_table, records, partition=partition, create_partition=True)
except Exception:
    # FIX: was a bare except (also swallowed SystemExit/KeyboardInterrupt).
    status = 'failed'
    n = 0
else:
    status = 'succeed'
    n = len(records)
# end_tm = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# log = {'status': status, 'n': n, 'start': start_tm, 'end': end_tm}
# f = open(os.path.join(cur_path, 'get_field_meta.log'), 'a')
# f.write("Update {status} with {n} instances from {start} to {end}\n".format(**log))
# f.close()
Python定期备份Mysql
代码示例
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Back up every non-system MySQL database with mysqldump and prune
backups older than ``out_time`` seconds.

Requires PyMySQL (CPython >= 2.6 or >= 3.3): pip install PyMySQL
"""
# Silence warnings emitted by the MySQL driver.
import warnings
warnings.filterwarnings('ignore')
import pymysql
import os
import time

mysql_host = "127.0.0.1"
mysql_user = "root"
mysql_pwd = "root"
mysql_port = 3306
mysql_charset = "utf8"
# Timestamp (local time) used as the backup file name.
new_date = time.strftime("%Y%m%d%H%M%S")
back_path = "/data/mysql_back/"
out_time = 7 * 24 * 60 * 60  # delete backups older than this many seconds

conn = None
cur = None
try:
    if not os.path.exists(back_path):
        os.makedirs(back_path)
    # Connect to the server; the 'mysql' schema is only used as the
    # default database for the SHOW DATABASES query.
    conn = pymysql.connect(
        host=mysql_host,
        user=mysql_user,
        passwd=mysql_pwd,
        db='mysql',
        port=mysql_port,
        charset=mysql_charset
    )
    cur = conn.cursor()
    cur.execute('show databases')
    data = cur.fetchall()
    for db_names in data:
        for db_name in db_names:
            # Skip system schemas.
            if db_name in ('information_schema', 'performance_schema', 'mysql'):
                continue
            if not os.path.exists(back_path + db_name):
                os.makedirs(back_path + db_name)
            # Per-database backup file: /data/mysql_back/<db>/<timestamp>.sql
            path = back_path + db_name + "/" + new_date + ".sql"
            # NOTE(review): the password on the command line is visible in
            # `ps`; prefer --defaults-extra-file in production.
            os.system("mysqldump -h%s -u%s -p%s %s > %s" % (mysql_host, mysql_user, mysql_pwd, db_name, path))
            # FIX: use local time to match new_date above — the original used
            # gmtime here, so the cutoff was off by the server's UTC offset.
            old_time = time.strftime("%Y%m%d", time.localtime(time.time() - out_time))
            os.system("rm -f %s*.sql" % (back_path + db_name + "/" + old_time))
except Exception as e:
    # FIX: report the actual failure instead of swallowing it silently.
    print("查询失败: %s" % e)
finally:
    # FIX: release cursor/connection even when the backup loop fails.
    if cur is not None:
        cur.close()
    if conn is not None:
        conn.close()
执行操作
# Environment prerequisites (FIX: removed the duplicated -y flag)
yum -y install gcc gcc-c++ make zlib* openssl openssl-devel openssl-static
yum install python-setuptools -y
# or: yum install python3-setuptools -y
easy_install pip
# or: easy_install3 pip
# Install PyMySQL
pip install PyMySQL
# FIX: cron executes the script directly, so it must be executable
chmod +x /data/shell/mysql_back.py
# Run manually to verify: /data/shell/mysql_back.py
# Add the cron job
crontab -e
# Append the line below (every 15 minutes; cron already runs jobs in the
# background, so the original trailing '&' was removed)
*/15 * * * * /data/shell/mysql_back.py > /data/shell/logs.log 2>&1
# Format: schedule  path/to/script
# Save and quit with :wq