import os
import re
# Doris连接
import pymysql
import configparser
import datetime
# 引入血缘模块sqllineage
from sqllineage.runner import LineageRunner
# Doris 配置
config = configparser.ConfigParser()
config.read('config2.ini')
def get_py_files(dir_path):
py_files = []
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.py'):
py_files.append(os.path.join(root, file))
return py_files
def get_sql_from_file(file_path):
with open(file_path, 'r') as f:
content = f.read()
sql_list = re.findall(r'SQL\s*=\s*"""(.+?)"""', content, re.DOTALL | re.IGNORECASE )
return sql_list
def main():
dir_path = '/Users/songhuaitang/Downloads/temp'
py_files = get_py_files(dir_path)
for py_file in py_files:
sql_list = get_sql_from_file(py_file)
for sql in sql_list:
# sql = "sql = \"\"\" "+sql+" \"\"\" "
print(sql)
pattern = r"(?i)AS\s*\(([^)]*\*\s*[^)]*)\)"
matches = re.finditer(pattern, sql, re.DOTALL)
# 匹配后的结果
db_params = {
'host': config.get('Database', 'host'),
'user': config.get('Database', 'user'),
'password': config.get('Database', 'password'),
'port': config.getint('Database', 'port'),
'database': config.get('Database', 'database')
}
conn = pymysql.connect(**db_params)
cursor = conn.cursor()
for match in matches:
match_str = match.group(1)
# with 临时表内的语句
print(match_str)
pattern_table = r"(?i)from\s+(\w+[\.\w+]*)"
match_table = re.search(pattern_table, match_str)
if match:
print(match_table.group(1) + '输出匹配结果') # 输出匹配结果
table_name = match_table.group(1)
# Doris 执行代码
query = '''
select concat("select " , GROUP_CONCAT(COLUMN_NAME)," from " , '{}' )
FROM information_schema.`columns`
WHERE concat(TABLE_SCHEMA,'.',TABLE_NAME) = '{}';
'''.format(table_name, table_name)
cursor.execute(query)
tables_get = cursor.fetchone()[0]
print(tables_get)
else:
print("没有匹配到结果")
pattern = r"\b(?i)from\s+(\w+)\b"
replaced = re.sub(pattern, r"(?i)from \1", match_str + "")
sql = sql.replace(match.group(), "AS (" + tables_get + ")")
# 替换后的 SQL 查询语句
print(sql)
# 血缘接入!!!
# 解析 SQL 查询语句,获取字段血缘关系
result = LineageRunner(sql)
column_lineage = result.print_column_lineage()
print("_____")
result.print_column_lineage()
print("_____")
# 将字段血缘关系写入 Doris 表中
if column_lineage is not None:
column_lineage = [d.replace("<default>.", "") for d in column_lineage]
for d in column_lineage:
parts = d.split(" <- ")
if len(parts) > 1:
if "." not in parts[1]:
parts[1] = "常量来源." + parts[1]
# print(parts[0],parts[-1],"____++")
# 不包含库名
# child_table, child_column = parts[0].replace("dds.","").replace("dwd.","").split(".")
# parent_table, parent_column = parts[-1].replace("dds.","").replace("dwd.","").split(".")
source_db, source_table, source_column = parts[0].split(".")
print(source_db, source_table, source_column)
sink_db, sink_table, sink_column = parts[-1].split(".")
print("___", source_db, source_table, source_column, sink_db, sink_table, sink_column, "___")
# 中间数据落库
cursor.execute("insert into detail.detail_usult_mid"
" values ('" + source_db + "','" + source_table + "','" + source_column + "','" + sink_db + "','" + sink_table + "','" + sink_column + "',CURRENT_TIMESTAMP(),CURRENT_TIMESTAMP()) ")
conn.commit()
# 关闭连接
cursor.close()
conn.close()
if __name__ == '__main__':
main()
Doris血缘落表
最新推荐文章于 2024-05-10 16:38:29 发布