# Oracle database control
# -*- coding:utf-8 -*-
# 忽略警告信息
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)
warnings.filterwarnings("ignore")

import itertools
import os
import re
from io import StringIO

import cx_Oracle
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
class control_db():
    '''
    Oracle database controller.

    Wraps a cx_Oracle connection/cursor plus a SQLAlchemy engine (used for
    pandas read_sql) and holds an error-log file handle (./error_sql.txt).
    '''
def __init__(self,str_link):
self.erSql = open('./error_sql.txt','w',encoding='utf-8')
self.str_link = str_link
if self.str_link:
self.connect(str_link)
self.sql_cnt = itertools.count(1)
def __del__(self):
self.erSql.close()
if self.str_link:
self.curs.close()
self.conn.commit()
self.conn.close()
def connect(self,str_link):
# 链接数据库
str_ls = re.split('[/@:]',str_link)
uname = str_ls[0]
pwd = str_ls[1]
ip = str_ls[2]
port = str_ls[3]
tnsname = str_ls[4]
dsnStr = cx_Oracle.makedsn(ip, port, service_name=tnsname)
self.conn = cx_Oracle.connect(uname, pwd, dsnStr)
self.curs = self.conn.cursor()
# self.conn.commit()
self.engine = create_engine('oracle://%s:%s@%s' %(uname, pwd, dsnStr), encoding='utf-8')
def exeSql(self,sql):
try:
self.curs.execute(sql)
if mod(next(self.sql_cnt),5000) == 0:
self.conn.commit()
except Exception as e:
print('Error:',repr(e),sql)
def exeSqls(self,sqls,log=False):
# 执行sql语句列表,写错误文件
sqls = [sql.split() for sql in sql.split(';')]
for sql in sqls:
self.exeSql(sql,log)
def select(self,sql):
ret = pd.read_sql(sql,con=self.engine);
print (ret);
return ret;
def exeFile(self,file,encoding='utf-8'):
with open(file,'r',encoding=encoding) as file:
sql = file.read()
sqls = [sql.strip() for sql in re.split(';+',sql)[:-1]]
self.exeSqls(sqls)
def getTbls(self,user):
'''
取用户的所有表名
'''
sql = ''' SELECT TABLE_NAME FROM ALL_TABLES WHERE OWNER = '%s' '''%user
tbls = pd.read_sql(sql,con=self.engine)['table_name'].to_list()
return tbls
def getCols(self,tbname):
'''
获取表的列名
'''
sql = ''' SELECT COLUMN_NAME FROM USER_COL_COMMENTS WHERE TABLE_NAME = '%s' '''%tbname
cols = pd.read_sql(sql,con=self.engine)['column_name'].to_list()
return cols
def getcols2(self,sql,synonym):
# 获取列名,加上表别名显示
ls = re.split(r'[,,]',synonym)
left = ls[0]
right = ls[1]
tbl = pd.read_sql(sql,con=self.engine);
if left and right:
ret = '\n,'.join(['%s.%s = %s.%s'%(left,name,right,name) for name in tbl.columns])
elif left:
ret = '\n,'.join(['%s.%s '%(left,name) for name in tbl.columns])
else:
ret = '\n,'.join(tbl.columns)
## ret = ret+'\n\n'
return ret
def dropTbls(self,tbls,ktbls=[]):
'''
批量删除表
'''
sqls = ['DROP TABLE "%s"'%tbname for tbname in tbls if tbname not in ktbls]
self.exeSqls(sqls)
def addDatPartition(self,tbname,fmt='PT_%Y%m'):
'''
对按月分区的表增加月分区到当前日期
'''
# 读取最大分区名,按月份
sql = 'select HIGH_VALUE from user_tab_partitions where table_name = \'%s\''%tbname
maxval = pd.read_sql(sql,con=self.engine)['high_value'].sort_values(ascending=False).iloc[0]
maxdate = pd.Timestamp(re.search(r'(\d+-\d+-\d+)',maxval).group(1))+DateOffset(months=1)
# 最大时间到现在的序列
opt_df = pd.DataFrame(pd.period_range(start=maxdate, end=pd.Timestamp.now(),freq='M')[1:].rename('high_value'))
if opt_df.empty:
print('不需要增加分区')
return -1
opt_df['high_value'] = opt_df['high_value'].map(lambda x:x.start_time)
opt_df['pt_name'] = opt_df['high_value'].map(lambda x:x.strftime(fmt))
opt_df['high_value'] = (opt_df['high_value']+DateOffset(months=1)).astype(str)
# 生成sql语句
sql = ''' ALTER TABLE {tbname} ADD PARTITION {pt_name} VALUES LESS THAN (TO_DATE('{high_value}','yyyy-mm-dd')) '''
opt_df['sql'] = ''
for idx,row in opt_df.iterrows():
opt_df.loc[idx,'sql'] = sql.format(
tbname = tbname,
pt_name = opt_df.loc[idx,'pt_name'],
high_value = opt_df.loc[idx,'high_value']
)
# 执行sql
self.exeSqls(opt_df['sql'])
def initDB(self,user,create_path,datpar_tbls,ktbls=[]):
'''
初始化数据库
'''
print('=== 开始初始化数据库\n')
# 删除表
tbls = self.getTbls(user)
self.dropTbls(tbls,ktbls)
self.dropTbls(ktbls)
# 新建表
self.exeFile(create_path)
# 加时间分区
for datpar_tbl in datpar_tbls:
self.addDatPartition(datpar_tbl)
def obj2inSql(self,tbname,obj):
'''
行转换为插入语句
'''
base_sql = ''' INSERT INTO "{tbname}" ({cols}) VALUES ({vals}) '''
cols = []
vals = []
for k,v in obj.items():
# 字段
cols.append('"%s"'%k)
# 值
v = str(v)
if (v.upper() == 'NULL') or ('TO_DATE' in v.upper()):
vals.append(v)
else:
vals.append('\'%s\''%v)
# 组合生成
cols = ','.join(cols)
vals = ','.join(vals)
sql = base_sql.format(tbname=tbname,cols=cols,vals=vals)
return sql
def obj2upSql(self,tbname,obj,uids):
'''
把行转换为更新语句
'''
# base_sql = ''' UPDATE "{tbname}" SET ({cols}) VALUES ({vals}) WHERE {condi} '''
base_sql = ''' UPDATE "{tbname}" SET {kvals} WHERE {condi} '''
kvals = []
for k,v in obj.items():
if k in uids:
continue
if (v.upper() == 'NULL') or ('TO_DATE' in v.upper()):
kvals.append('"%s" = %s'%(k,v))
else:
kvals.append('"%s" = \'%s\''%(k,v))
# 条件
condi = []
for uid in uids:
c = '"%s" = \'%s\''%(uid,obj.get(uid))
condi.append(c)
# 组合生成
kvals = ','.join(kvals)
condi = ' and '.join(condi)
sql = base_sql.format(tbname=tbname,kvals=kvals,condi=condi)
return sql
def __fmtFrame(self,frame,fields=None,dat_fields=None,rename=None):
# 格式化
def toDate(x):
try:
x = x.strftime('%Y-%m-%d')
except Exception as e:
pass
return x
if dat_fields:
frame[dat_fields] = frame[dat_fields].astype(np.datetime64).apply(lambda se:se.map(lambda x:toDate(x)))
frame[dat_fields] = frame[dat_fields].apply(lambda se:se.map(lambda x:"TO_DATE('%s','yyyy-mm-dd')"%x))
if fields:
frame = frame[fields]
if rename:
frame = frame.rename(columns=rename)
frame = frame.replace(['',' ',None,np.nan],'NULL')
return frame.astype(np.str)
def frame2inSql(self,tbname,frame,fields=None,dat_fields=None,rename=None):
'''
插入数据,可指定字段对齐数据库,用rename更改为数据库的名字
'''
# 格式化
frame = self.__fmtFrame(frame,fields,dat_fields,rename)
frame['etl_date'] = "to_date('%s','yyyy-mm-dd hh24:mi:ss')"%pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
frame.columns = frame.columns.map(lambda x:x.upper())
# 迭代行写出sql
for idx,row in frame.iterrows():
sql = self.obj2inSql(tbname,row.to_dict())
yield sql+';\n'
def build_model(self,tbls):
tmps = []
for tbl in tbls:
# 读表
sql = f'select * from {tbl} where rownum < 5'
df = pd.read_sql(sql,con=self.engine)
# 判断维度
dim_list = []
for col in df.columns:
if isinstance(df[col][0],np.float) or isinstance(df[col][0],np.int) :
continue
dim_list.append(col)
# 重组
cols = df.columns
tmp = pd.DataFrame(data=1.0,index= cols[~cols.isin(dim_list)],columns = cols[cols.isin(dim_list)])
tmps.append(tmp)
# 合并过滤
result = pd.concat(tmps,axis=0)
result = result.groupby(result.index).min()
return result
def write_out(self,des):
'''
把库的表写到文件
'''
# 获取表名
tb_names = pd.read_sql('select table_name from user_tables',con=db.engine).ix[:,0]
# 迭代表名
for tb_name in tb_names:
# 判断行数
cnt = pd.read_sql(f'select count(1) from {tb_name}',con=db.engine).ix[0,0]
if cnt < 5:
continue
# 新建目录
if not os.path.exists(des):
os.mkdir(des)
# 分块读取
rts = pd.read_sql(f'select * from {tb_name}',con=db.engine,chunksize=10000)
it_nums = itertools.count(1)
file = open(os.path.join(des,f'{tb_name}.csv'),'w',encoding='utf-8-sig')
for rt in rts:
# 写入流,判断是否写字段
strio = StringIO()
num = next(it_nums)
if num == 1:
rt.to_csv(strio,index=False,header=True)
else:
rt.to_csv(strio,index=False,header=False)
# 写入文件
rt_str = strio.getvalue().replace('\r\n','\n')
file.write(rt_str)
# 完成一个文件
file.close()
print(tb_name,'is save')
if __name__=='__main__':
    # Manual smoke run: DSN format is 'user/password@host:port/service_name'.
    dcb = control_db('investment/z123@10.10.10.10:1521/orcl')