一个路径可能会产出多个表,1->n
输入:带sql的文件绝对路径,一行一个路径
输出:文件名\t依赖的表*
# coding=utf-8
import re
def getTable(shName,linesql):
search_index = 0
while len(linesql) > 12 and '_' in linesql and ('from ' in linesql or 'join ' in linesql):
# 从第一个from或join截取
if linesql.find('from ') >= 0 and linesql.find('join ') >= 0:
from_index = linesql.index('from ')
join_index = linesql.index('join ')
search_index = join_index if join_index < from_index else from_index
elif linesql.find('from ') >= 0:
search_index = linesql.index('from ')
elif linesql.find('join ') >= 0:
search_index = linesql.index('join ')
else:
return
# print(search_index)
linesql = linesql[search_index:]
# 判断是否空格后是表
pattern = re.compile(r'(?:from|join)\s+(\S+\_*)\s*')
linesqlli = pattern.findall(linesql)
for table in linesqlli:
if table.find('(') == -1 and table.find('_') >=0:
if table.find('.') == -1:
table = 'portal.'+table
# print(table.lower())
with open(r'D:\MyDesk\已迁移的文件依赖.txt', encoding='utf-8', mode='a') as rf:
rf.write(shName+'\t'+table.lower()+'\n')
else:continue
linesql = linesql[linesql.find(table)+len(table)+1:]
rf.close()
def catFile(filePath):
linestr = ''
# 读取每一行 合为一行
break_pattern = r'(\s|\n|\r|\t)+'
sh_name = 'portal.'+re.sub(r'D:\\GitProjects\\offlineexperiment\\dev\\hive.*\\sh_','',filePath).replace('.sh','').lower()
with open(filePath, encoding='utf-8', mode='r') as sf:
for line in sf:
line = re.sub(break_pattern,' ',line.replace('\n', ' '))
linestr += line
getTable(sh_name,linestr)
if __name__ == '__main__':
# 准备文件路径列表
sh_list = []
with open(r'D:\MyDesk\已迁移的文件.txt', encoding='utf-8', mode='r') as path_file:
for path in path_file:
sh_list.append(path.replace('\n', ''))
path_file.close()
for sh in sh_list:
catFile(sh)