python脚本
用于Teradata 装换成bigquery的DDL
包含了varchar、char、bigint、date、decimal转换
例如:
TD:
CREATE MULTISET TABLE XXXX.AAA ,FALLBACK,
NO BEFORE JOURNAL,
NO AFTER JOURNAL,
CHECKSUM=DEFAULT,
DEFAULT MERGEBLOCKRATO,
MAP=TD_MAP1
(
ID CHAR(5) CHARACTER SET LATIN CASESPCIFIC NOT NULL,
AMOUNT DECIMAL(23,3) COMPRESS 0.000,
NUM BIGINT NOT NULL,
DAY DATE FORMAT ‘yyyy-mm-dd’ NOT NULL,
NAME VARCHAR(250) CHARACTER SET LATIN CASESPECIFIC NOT NULL,
FLAG CHAR(1) CHARACTER SET LATIN CASESPECIFIC NOT NULL DEFAULT ‘Y’
)
PRIMARY INDEX(‘ID’);
BQ:
CREATE OR REPLACE TABLE YYYY.AAA(
ID STRING NOT NULL,
AMOUNT NUMERIC,
NUM INT64 NOT NULL,
DAY DATE FORMAT 'yyyy-m
DAY DATE NOT NULL,
NAME NOT NULL,
FLAG NOT NULL
)
;
import paramiko
import re
import csv
import time
import numpy as np
import pandas as pd
import os
def sql_del(TD_DDL,data)
#schema_td为TD原数据库名,schema_bq为BQ数据库名
schema_td='xxxx.'
schema_bq='yyyy.'
BQ_DDL=re.sub(r'(?ms),.*?\($',TD_DDL).replace('MULTISET','OR REPLACE').replace(schema_td,schema_bq)
index=re.findall(r'(?ms)PRIMARY INDEX.*?;$'|UNIQUE PRIMARY INDEX.*?;$',BQ_DDL)
BQ_DDL=re.sub(r'(?ms)PRIMARY INDEX.*?;$'|UNIQUE PRIMARY INDEX.*?;$',';',BQ_DDL)
table=re.findall(r'(?ms)XXXX\..*?\($',BQ_DDL)
table_name=str(table).replace('(','').replace('[','').replace(']','').replace('\'','')
if len(index)!=0:
mark={}
mark['table_name']=table_name
mark['default']='PRI'
mark['index']=str(index).replace('[','').replace(']','').replace('\'','')
data.append(mark)
#处理VARCHAR和CHAR类型
com=re.findall(r'(?ms)VARCHAR.*?\,$|CHAR.*?\,$|VARCHAR.*?\)$|CHAR.*?\)$',BQ_DDL)
for in in com:
mark={}
if i.find('DEFAULT')!=-1:
mark['table_name']=table_name
mark['default']=i
mark['index']=str(index).replace('[','').replace(']','').replace('\'','')
data.append(mark)
if i[-1]==')':
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'STRING NOT NULL)')
else:
BQ_DDL=BQ_DDL.replace(i,'STRING)')
else:
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'STRING NOT NULL,')
else:
BQ_DDL=BQ_DDL.replace(i,'STRING,')
#处理BIGINT类型
com=re.findall(r'(?ms)BIGINT.*?\,$|BIGINT.*?\)$',BQ_DDL)
for in in com:
mark={}
if i.find('DEFAULT')!=-1:
mark['table_name']=table_name
mark['default']=i
mark['index']=str(index).replace('[','').replace(']','').replace('\'','')
data.append(mark)
if i[-1]==')':
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'INT64 NOT NULL)')
else:
BQ_DDL=BQ_DDL.replace(i,'INT64 )')
else:
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'INT64 NOT NULL,')
else:
BQ_DDL=BQ_DDL.replace(i,'INT64 ,')
#处理DECIMAL类型
com=re.findall(r'(?ms)DECIMAL.*?\,$|DECIMAL.*?\)$',BQ_DDL)
for in in com:
mark={}
if i.find('DEFAULT')!=-1:
mark['table_name']=table_name
mark['default']=i
mark['index']=str(index).replace('[','').replace(']','').replace('\'','')
data.append(mark)
if i[-1]==')':
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'NUMERIC NOT NULL)')
else:
BQ_DDL=BQ_DDL.replace(i,'NUMERIC )')
else:
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'NUMERIC NOT NULL,')
else:
BQ_DDL=BQ_DDL.replace(i,'NUMERIC ,')
#处理DATE类型
com=re.findall(r'(?ms)DATE.*?\,$|DATE.*?\)$',BQ_DDL)
for in in com:
mark={}
if i.find('DEFAULT')!=-1:
mark['table_name']=table_name
mark['default']=i
mark['index']=str(index).replace('[','').replace(']','').replace('\'','')
data.append(mark)
if i[-1]==')':
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'DATE NOT NULL)')
else:
BQ_DDL=BQ_DDL.replace(i,'DATE)')
else:
if i.find('NOT NULL')!=-1:
BQ_DDL=BQ_DDL.replace(i,'DATE NOT NULL,')
else:
BQ_DDL=BQ_DDL.replace(i,'DATE,')
return BQ_DDL,table_name,data
#############main################
o=os.walk(r'要更换的TD文件的路径')
data=[]
#循环读取Windows制定目录下文件,并根据sql_del处理写入到存放路径
for path,dir_list,file_name in o:
for i in file_name:
reader=open(path+i,'r')
TD_DDL=reader.read()
BQ_DDL,table_name,data=sql_del(TD_DDL,data)
reader.close()
fo=open(r'生成BQ的DDL文件的存放路径')
fo.write(BQ_DDL)
fo.close()
default_mark=pd.DataFrame(data,columns=['table_name','default','index'])
#记录TD中原有的table对应的default默认值和index`在这里插入代码片`
default_mark.to_csv(r'把没有转换的default和index记录在csv文件中,这里为csv文件保存路径',index=False)
以上代码类型转换部分还可以优化,暂时先记录这些,后期再做优化