Parsing Spark Field-Level Lineage

I. Enough talk, straight to the code

package com.roundyuan.sparkagent

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation}
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.datasources.{InsertIntoHadoopFsRelationCommand, LogicalRelation}
import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveTable}
import org.apache.spark.sql.util.QueryExecutionListener

import scala.collection.mutable
import scala.collection.mutable.Map
import scala.util.control.NonFatal

/**
 * author: zhangmengyuan
 * The analyzed plan used here is the same plan printed by explain extended.
 * Field-level lineage is obtained by parsing that analyzed logical plan.
 *
 */

class FlQueryExecutionListener extends QueryExecutionListener with Logging {
  // there should be only one target table
  private val targetTable: Map[Long, String] = Map()
  // there may be multiple source tables
  private val sourceTables: Map[Long, String] = Map()
  // intermediate field-to-field relations collected while walking the plan
  private val fieldProcess: Map[Long, mutable.Set[Long]] = Map()
  // compressed lineage: only source-table fields mapped to target-table fields
  private val fieldLineage: Map[String, mutable.Set[String]] = mutable.Map()
  // SQL types considered: insert ... select and create table ... as select
  private var processType: String = ""

  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = withErrorHandling(qe) {
    // the query succeeded: parse lineage from its analyzed logical plan
    lineageParser(qe)
  }

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = withErrorHandling(qe) {

  }

  private def withErrorHandling(qe: QueryExecution)(body: => Unit): Unit = {
    try
      body
    catch {
      case NonFatal(e) =>
        val ctx = qe.sparkSession.sparkContext
        logError(s"Unexpected error occurred during lineage processing for application: ${ctx.appName} #${ctx.applicationId}", e)
    }
  }

  def lineageParser(qe: QueryExecution): Unit = {
    logInfo("----------- field lineage parse start --------")
    // parse the analyzed plan and record the source and target tables
    val analyzedLogicPlan = qe.analyzed
    resolveLogicV2(analyzedLogicPlan)
    // connect target fields to their source fields
    connectSourceFieldAndTargetField()
    println(fieldLineage)
  }

  /**
   * Collect source tables, the target table and field-level transformations from the analyzed plan.
   *
   * @param plan the analyzed logical plan
   */
  def resolveLogicV2(plan: LogicalPlan): Unit = {
    // source tables come from LogicalRelation or HiveTableRelation; the target table comes from
    // InsertIntoHiveTable, InsertIntoHadoopFsRelationCommand or CreateHiveTableAsSelectCommand;
    // field transformations come from Aggregate and Project
    plan.collect {
      case plan: LogicalRelation => {
        val catalogTable = plan.catalogTable.get
        val tableName = catalogTable.database + "." + catalogTable.identifier.table
        plan.output.foreach(columnAttribute => {
          val columnFullName = tableName + "." + columnAttribute.name
          sourceTables += (columnAttribute.exprId.id -> columnFullName)
        })
      }
      case plan: HiveTableRelation => {
        val tableName = plan.tableMeta.database + "." + plan.tableMeta.identifier.table
        plan.output.foreach(columnAttribute => {
          val columnFullName = tableName + "." + columnAttribute.name
          sourceTables += (columnAttribute.exprId.id -> columnFullName)
        })
      }
      case plan: InsertIntoHiveTable => {
        val tableName = plan.table.database + "." + plan.table.identifier.table
        extTargetTable(tableName, plan.query)
      }
      case plan: InsertIntoHadoopFsRelationCommand => {
        val catalogTable: CatalogTable = plan.catalogTable.get
        val tableName = catalogTable.database + "." + catalogTable.identifier.table
        extTargetTable(tableName, plan.query)
      }
      case plan: CreateHiveTableAsSelectCommand => {
        val tableName = plan.tableDesc.database + "." + plan.tableDesc.identifier.table
        extTargetTable(tableName, plan.query)
      }
      case plan: Aggregate => {
        plan.aggregateExpressions.foreach(aggItem => {
          extFieldProcess(aggItem)
        }
        )
      }
      case plan: Project => {
        plan.projectList.toList.foreach {
          pojoItem => {
            extFieldProcess(pojoItem)
          }
        }
      }
      //      case `plan` => logInfo("******child plan******:\n" + plan)
    }
  }

  def extFieldProcess(namedExpression: NamedExpression): Unit = {
    // an Alias marks a derived field: record which attributes its expression references;
    // non-alias attributes keep their original exprId and need no extra mapping
    if ("alias".equals(namedExpression.prettyName)) {
      val sourceFieldId = namedExpression.exprId.id
      val targetFieldIdSet: mutable.Set[Long] = fieldProcess.getOrElse(sourceFieldId, mutable.Set.empty)
      namedExpression.references.foreach(attribute => {
        targetFieldIdSet += attribute.exprId.id
      })
      fieldProcess += (sourceFieldId -> targetFieldIdSet)
    }
  }

  def extTargetTable(tableName: String, plan: LogicalPlan): Unit = {
    logInfo("start ext target table")
    plan.output.foreach(columnAttribute => {
      val columnFullName = tableName + "." + columnAttribute.name
      targetTable += (columnAttribute.exprId.id -> columnFullName)
    })
  }

  /**
   * Build the final lineage: for each target-table field, check whether its exprId maps directly to a
   * source-table field; if not, walk the recorded transformations until source fields are reached.
   */
  def connectSourceFieldAndTargetField(): Unit = {
    val fieldIds = targetTable.keySet
    fieldIds.foreach(fieldId => {
      val resTargetFieldName = targetTable(fieldId)
      val resSourceFieldSet: mutable.Set[String] = mutable.Set.empty[String]
      if (sourceTables.contains(fieldId)) {
        val sourceFieldName = sourceTables(fieldId)
        resSourceFieldSet += sourceFieldName
      } else {
        val targetIdsTmp = findSourceField(fieldId)
        resSourceFieldSet ++= targetIdsTmp
      }
      fieldLineage += (resTargetFieldName -> resSourceFieldSet)
    })
  }

  def findSourceField(fieldId: Long): mutable.Set[String] = {
    val resSourceFieldSet: mutable.Set[String] = mutable.Set.empty[String]
    if (fieldProcess.contains(fieldId)) {
      val fieldIds: mutable.Set[Long] = fieldProcess.getOrElse(fieldId, mutable.Set.empty)
      fieldIds.foreach(fieldId => {
        if (sourceTables.contains(fieldId)) {
          resSourceFieldSet += sourceTables(fieldId)
        } else {
          val sourceFieldSet = findSourceField(fieldId)
          resSourceFieldSet ++= sourceFieldSet
        }
      })
    }
    resSourceFieldSet
  }
}

Test code

package com.roundyuan.sparkagent

import org.apache.spark.sql.SparkSession

object TestHive {
  def main(args: Array[String]): Unit = {
    // connect to the Hive metastore
    val sparkSession = SparkSession.builder()
      .config("hive.metastore.uris", "thrift://localhost:9083")
      .appName("HiveCaseJob")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    val listenV3 = new FlQueryExecutionListener()
    sparkSession.listenerManager.register(listenV3)
    //sparkSession.sql("show databases").show()
    sparkSession.sql("insert into test.test_orc select id,count(name) as num from test.test02 group by id").show()
    //val user_log = sparkSession.sql("select * from dbtaobao.user_log").collect()
    //val test = user_log.map(row => "user_id"+row(0))
    //test.map(row => println(row))

  }
}
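For the insert ... select statement above, the listener's println(fieldLineage) prints a map from each target-table field to the source fields it derives from. For this query the output should look roughly like the following sketch (attribute names and ordering can vary slightly with Spark's analysis rules):

Map(test.test_orc.id -> Set(test.test02.id), test.test_orc.num -> Set(test.test02.name))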

The pom.xml file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.roundyuan</groupId>
    <artifactId>roundyuan-spark-lineage</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <scala.version>2.11.12</scala.version>
        <scala.binary.version>2.11</scala.binary.version>
        <spark.version>2.4.7</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>commons-compiler</artifactId>
            <version>3.0.16</version>
<!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
<!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

II. Package the project as a jar and integrate it with Spark

1. Build the jar with mvn clean install.
2. Copy the jar into Spark's jars directory.
3. In spark-defaults.conf, configure spark.sql.execution.arrow.enabled true and spark.sql.queryExecutionListeners com.roundyuan.sparkagent.FlQueryExecutionListener.
4. Executing SQL will then print the field-level dependencies; a minimal configuration sketch follows.
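For reference, here is a minimal spark-defaults.conf sketch containing only the settings listed above; everything else about your installation is assumed to stay as it is:

# spark-defaults.conf (sketch; adjust to your own installation)
# register the lineage listener from the jar copied into Spark's jars directory
spark.sql.queryExecutionListeners    com.roundyuan.sparkagent.FlQueryExecutionListener
# carried over from the original setup; not required for the lineage parsing itself
spark.sql.execution.arrow.enabled    true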

III. My modifications to the project

1. Project structure

(Screenshot of the project structure omitted.)
<1>. resources contains five files (core-site.xml, druid.properties, hdfs-site.xml, hive-site.xml, yarn-site.xml);
<2>. entity holds my entity class

package com.roundyuan.sparkagent.entity;

import lombok.Data;
import org.joda.time.DateTime;

@Data
public class TableLineageInfo{
    private String SourceTableNme;
    private String SourceDatabaseNme;
    private String SourceColNme;
    private String TargetTableNme;
    private String TargetDatabaseNme;
    private String TargetColNme;
    private DateTime updateDate;
}    

<3>. utils is the utility package (JDBC helpers that write the parsed lineage to PostgreSQL)
The code is as follows:

package com.roundyuan.sparkagent.utils;

import java.sql.*;

public class DruidInsert {
    public static void insert (String SourceTableNme,String SourceDatabaseNme,String SourceColNme,String TargetTableNme,String TargetDatabaseNme,String TargetColNme){
        Connection conn =null;
        PreparedStatement pst = null;
        if (!DruidSelect.select(TargetTableNme, TargetColNme)) {
            try {
                // insert one lineage record into TableLineageInfo
                // conn = JdbcUtil.getConnection();
                // replace the URL, user and password with those of your own PostgreSQL instance
                String url = "redacted for privacy";
                String user = "postgres";
                String password = "123456";
                try {
                    Class.forName("org.postgresql.Driver");
                } catch (ClassNotFoundException e) {
                    e.printStackTrace();
                }
                conn= DriverManager.getConnection(url, user, password);
                String sql = "insert into TableLineageInfo(SourceTableNme,SourceDatabaseNme,SourceColNme,TargetTableNme,TargetDatabaseNme,TargetColNme,updateDate) values(?,?,?,?,?,?,?)";
                // create the prepared statement
                pst = conn.prepareStatement(sql);
                // bind the parameters
                pst.setString(1, SourceTableNme);
                pst.setString(2, SourceDatabaseNme);
                pst.setString(3, SourceColNme);
                pst.setString(4, TargetTableNme);
                pst.setString(5, TargetDatabaseNme);
                pst.setString(6, TargetColNme);
                pst.setTimestamp(7, new Timestamp(new java.util.Date().getTime()));
                // execute the insert

                //pst.addBatch();
                int i = pst.executeUpdate();
               // System.out.println(i);
            } catch (SQLException e) {
                e.printStackTrace();
            } finally {
                // release resources
                // JdbcUtil.close(pst, conn);
                try {
                    if (pst != null) pst.close();
                    if (conn != null) conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }

        }else{
            DruidUpdate.update(SourceTableNme,SourceDatabaseNme,SourceColNme,TargetTableNme,TargetDatabaseNme,TargetColNme);
        }

    }
}    
package com.roundyuan.sparkagent.utils;

import java.sql.*;

public class DruidSelect {
    // returns true if a lineage record already exists for the given target table and column
    public static Boolean select(String TargetTableNme, String TargetColNme) {
        Connection conn = null;
        PreparedStatement pst = null;
        ResultSet rs = null;
        boolean exists = false;
        try {
            // replace the URL, user and password with those of your own PostgreSQL instance
            String url = "redacted for privacy";
            String user = "postgres";
            String password = "123456";
            try {
                Class.forName("org.postgresql.Driver");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            }
            conn = DriverManager.getConnection(url, user, password);
            String sql = "select * from TableLineageInfo where TargetTableNme=? and TargetColNme=?";
            // create the prepared statement and bind the parameters
            pst = conn.prepareStatement(sql);
            pst.setString(1, TargetTableNme);
            pst.setString(2, TargetColNme);
            // a record exists if the query returns at least one row
            rs = pst.executeQuery();
            exists = rs.next();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            // release resources
            try {
                if (rs != null) rs.close();
                if (pst != null) pst.close();
                if (conn != null) conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        return exists;
    }
}    
package com.roundyuan.sparkagent.utils;

import java.sql.*;
public class DruidUpdate {
        public static void update (String SourceTableNme,String SourceDatabaseNme,String SourceColNme,String TargetTableNme,String TargetDatabaseNme,String TargetColNme){
            Connection conn =null;
            PreparedStatement pst = null;
            try {
                // update an existing lineage record
              //  conn = JdbcUtil.getConnection();
                // replace the URL, user and password with those of your own PostgreSQL instance
                String url = "redacted for privacy";
                String user = "postgres";
                String password = "123456";
                try {
                    Class.forName("org.postgresql.Driver");
                } catch (ClassNotFoundException e) {
                    e.printStackTrace();
                }
                conn= DriverManager.getConnection(url, user, password);
                String sql ="UPDATE TableLineageInfo SET SourceTableNme = ?,SourceDatabaseNme=?,SourceColNme=?,TargetTableNme=?,TargetDatabaseNme=?,TargetColNme=?,updateDate=? WHERE TargetTableNme=?,TargetDatabaseNme=?";
                //获取操作sql对象
                pst = conn.prepareStatement(sql);
                //给?赋值
                pst.setString(1, SourceTableNme);
                pst.setString(2, SourceDatabaseNme);
                pst.setString(3, SourceColNme);
                pst.setString(4, TargetTableNme);
                pst.setString(5, TargetDatabaseNme);
                pst.setString(6, TargetColNme);
                pst.setTimestamp(7, new Timestamp(new java.util.Date().getTime()));
                pst.setString(8, TargetTableNme);
                pst.setString(9, TargetDatabaseNme);
                // execute the update
                System.out.println("*************** lineage record updated *****************");
                pst.execute();

            } catch (SQLException e) {
                e.printStackTrace();
            }finally {
                // release resources
                try {
                    if (pst != null) pst.close();
                    if (conn != null) conn.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
}

<4>. FlQueryExecutionListener is my core class. It works on the analyzed plan (the same plan printed by explain extended) and derives field-level lineage by parsing that logical execution plan; in this version the lineage is also written to PostgreSQL through DruidInsert.

package com.roundyuan.sparkagent

import com.roundyuan.sparkagent.utils.DruidInsert
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, HiveTableRelation}
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project}
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.datasources.{InsertIntoHadoopFsRelationCommand, LogicalRelation}
import org.apache.spark.sql.hive.execution.{CreateHiveTableAsSelectCommand, InsertIntoHiveTable}
import org.apache.spark.sql.util.QueryExecutionListener

import scala.collection.mutable
import scala.collection.mutable.Map
import scala.util.control.NonFatal

/**
 * author: zhangmengyuan
 * The analyzed plan used here is the same plan printed by explain extended.
 * Field-level lineage is obtained by parsing that analyzed logical plan.
 *
 */

class FlQueryExecutionListener extends QueryExecutionListener with Logging {
  private val  conn= null
  private val pst= null
  // there should be only one target table
  private val targetTable: Map[Long, String] = Map()
  // there may be multiple source tables
  private val sourceTables: Map[Long, String] = Map()
  // intermediate field-to-field relations collected while walking the plan
  private val fieldProcess: Map[Long, mutable.Set[Long]] = Map()
  // compressed lineage: only source-table fields mapped to target-table fields
  private val fieldLineage: Map[String, mutable.Set[String]] = mutable.Map()
  // SQL types considered: insert ... select and create table ... as select
  private var processType: String = ""

  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = withErrorHandling(qe) {
    // the query succeeded: parse lineage from its analyzed logical plan
    lineageParser(qe)
  }

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = withErrorHandling(qe) {

  }

  private def withErrorHandling(qe: QueryExecution)(body: => Unit): Unit = {
    try
      body
    catch {
      case NonFatal(e) =>
        val ctx = qe.sparkSession.sparkContext
        logError(s"Unexpected error occurred during lineage processing for application: ${ctx.appName} #${ctx.applicationId}", e)
    }
  }

  def lineageParser(qe: QueryExecution): Unit = {
    logInfo("----------- field lineage parse start --------")
    // parse the analyzed plan and record the source and target tables
    val analyzedLogicPlan = qe.analyzed
    resolveLogicV2(analyzedLogicPlan)
    // connect target fields to their source fields
    connectSourceFieldAndTargetField()
//    println(fieldLineage)
    // write each target-field -> source-field pair to PostgreSQL; field names are built as
    // database.table.column, so splitting on "." yields database, table and column
    fieldLineage.foreach { case (targetField, sourceFields) =>
      val Array(tdb, ttb, tfd) = targetField.toLowerCase.split("\\.")
      sourceFields.foreach { sourceField =>
        val Array(sdb, stb, sfd) = sourceField.toLowerCase.split("\\.")
        DruidInsert.insert(stb, sdb, sfd, ttb, tdb, tfd)
      }
    }
//    println(fieldLineage)
  }

  /**
   * Collect source tables, the target table and field-level transformations from the analyzed plan.
   *
   * @param plan the analyzed logical plan
   */
  def resolveLogicV2(plan: LogicalPlan): Unit = {
    // source tables come from LogicalRelation or HiveTableRelation; the target table comes from
    // InsertIntoHiveTable, InsertIntoHadoopFsRelationCommand or CreateHiveTableAsSelectCommand;
    // field transformations come from Aggregate and Project
    plan.collect {
      case plan: LogicalRelation => {
        val catalogTable = plan.catalogTable.get
        val tableName = catalogTable.database + "." + catalogTable.identifier.table
        plan.output.foreach(columnAttribute => {
          val columnFullName = tableName + "." + columnAttribute.name
          sourceTables += (columnAttribute.exprId.id -> columnFullName)
        })
      }
      case plan: HiveTableRelation => {
        val tableName = plan.tableMeta.database + "." + plan.tableMeta.identifier.table
        plan.output.foreach(columnAttribute => {
          val columnFullName = tableName + "." + columnAttribute.name
          sourceTables += (columnAttribute.exprId.id -> columnFullName)
        })
      }
      case plan: InsertIntoHiveTable => {
        val tableName = plan.table.database + "." + plan.table.identifier.table
        extTargetTable(tableName, plan.query)
      }
      case plan: InsertIntoHadoopFsRelationCommand => {
        val catalogTable: CatalogTable = plan.catalogTable.get
        val tableName = catalogTable.database + "." + catalogTable.identifier.table
        extTargetTable(tableName, plan.query)
      }
      case plan: CreateHiveTableAsSelectCommand => {
        val tableName = plan.tableDesc.database + "." + plan.tableDesc.identifier.table
        extTargetTable(tableName, plan.query)
      }
      case plan: Aggregate => {
        plan.aggregateExpressions.foreach(aggItem => {
          extFieldProcess(aggItem)
        }
        )
      }
      case plan: Project => {
        plan.projectList.toList.foreach {
          pojoItem => {
            extFieldProcess(pojoItem)
          }
        }
      }
      //      case `plan` => logInfo("******child plan******:\n" + plan)
    }
  }

  def extFieldProcess(namedExpression: NamedExpression): Unit = {
    // an Alias marks a derived field: record which attributes its expression references;
    // non-alias attributes keep their original exprId and need no extra mapping
    if ("alias".equals(namedExpression.prettyName)) {
      val sourceFieldId = namedExpression.exprId.id
      val targetFieldIdSet: mutable.Set[Long] = fieldProcess.getOrElse(sourceFieldId, mutable.Set.empty)
      namedExpression.references.foreach(attribute => {
        targetFieldIdSet += attribute.exprId.id
      })
      fieldProcess += (sourceFieldId -> targetFieldIdSet)
    }
  }

  def extTargetTable(tableName: String, plan: LogicalPlan): Unit = {
    logInfo("start ext target table")
    plan.output.foreach(columnAttribute => {
      val columnFullName = tableName + "." + columnAttribute.name
      targetTable += (columnAttribute.exprId.id -> columnFullName)
    })
  }

  /**
   * Build the final lineage: for each target-table field, check whether its exprId maps directly to a
   * source-table field; if not, walk the recorded transformations until source fields are reached.
   */
  def connectSourceFieldAndTargetField(): Unit = {
    val fieldIds = targetTable.keySet
    fieldIds.foreach(fieldId => {
      val resTargetFieldName = targetTable(fieldId)
      val resSourceFieldSet: mutable.Set[String] = mutable.Set.empty[String]
      if (sourceTables.contains(fieldId)) {
        val sourceFieldName = sourceTables(fieldId)
        resSourceFieldSet += sourceFieldName
      } else {
        val targetIdsTmp = findSourceField(fieldId)
        resSourceFieldSet ++= targetIdsTmp
      }
      fieldLineage += (resTargetFieldName -> resSourceFieldSet)
    })
  }

  def findSourceField(fieldId: Long): mutable.Set[String] = {
    val resSourceFieldSet: mutable.Set[String] = mutable.Set.empty[String]
    if (fieldProcess.contains(fieldId)) {
      val fieldIds: mutable.Set[Long] = fieldProcess.getOrElse(fieldId, mutable.Set.empty)
      fieldIds.foreach(fieldId => {
        if (sourceTables.contains(fieldId)) {
          resSourceFieldSet += sourceTables(fieldId)
        } else {
          val sourceFieldSet = findSourceField(fieldId)
          resSourceFieldSet ++= sourceFieldSet
        }
      })
    }
    resSourceFieldSet
  }
}

<5>. The TestHive test class

package com.roundyuan.sparkagent

import org.apache.spark.sql.SparkSession

object TestHive {
  def main(args: Array[String]): Unit = {
    // connect to the Hive metastore
    val sparkSession = SparkSession.builder()
      .config("hive.metastore.uris", "thrift://master1:9083,thrift://slave1:9083")
      .config("HiveDb","ods.db,dcl.db,dw.db,mdp.db,ads.db")
      .config("HdfsPath","hdfs://nameservice1:8020/user/hive/warehouse/")
      .config("Hdfs.dfs.nameservices","nameservice1")
      .config("Hdfs.dfs.ha.namenodes.nameservice1","namenode424,namenode442")
      .config("Hdfs.dfs.namenode.rpc-address.nameservice1.namenode424","master1:8020")
      .config("Hdfs.namenode.rpc-address.nameservice1.namenode442","master2:8020")
      .config("spark.sql.warehouse.dir","hdfs://nameservice1/user/hive/warehouse")
      .appName("HiveCaseJob")
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    val listenV3 = new FlQueryExecutionListener()
    sparkSession.listenerManager.register(listenV3)
    //sparkSession.sql("show databases").show()
    sparkSession.sql("drop table tmp.t7")
    sparkSession.sql("create table tmp.t8 As select * from mdp.mdp_cusdm_social limit 10").show()
//    val a_1 = sparkSession.sql("select * from tmp.t3 limit 10").show()
//    val a_2 = sparkSession.sql("select * from tmp.t4").toDF()
//    val a_3 = a_1.join(a_2,a_1("client_oneid")===a_2("client_oneid"),"left")
//    a_3.createOrReplaceTempView("tt")
//    sparkSession.sql(
//      """
//        |create table tmp.tt_1 as select * from tt
//      """.stripMargin).show()

//    val user_log = sparkSession.sql("select * from mdp.mdp_cusdm_nature limit 100").collect()
//    val test = user_log.map(row => "user_id"+row(0))
//    test.map(row => println(row))

  }
}

<6>. The pom.xml file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.roundyuan</groupId>
    <artifactId>roundyuan-spark-lineage</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <scala.version>2.11.12</scala.version>
        <scala.binary.version>2.11</scala.binary.version>
        <spark.version>2.4.0</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>hadoop-client</artifactId>
                    <groupId>org.apache.hadoop</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <artifactId>scala-reflect</artifactId>
                    <groupId>org.scala-lang</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.codehaus.janino</groupId>
            <artifactId>commons-compiler</artifactId>
            <version>3.0.16</version>
<!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_${scala.binary.version}</artifactId>
            <version>${spark.version}</version>
<!--            <scope>provided</scope>-->
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.10</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/com.alibaba/druid -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>druid</artifactId>
            <version>1.2.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.16</version>
        </dependency>
        <dependency>
            <groupId>org.postgresql</groupId>
            <artifactId>postgresql</artifactId>
            <version>42.1.1</version>
        </dependency>
        <!-- hadoop -->
        <!--<dependency>-->
            <!--<groupId>org.apache.hadoop</groupId>-->
            <!--<artifactId>hadoop-common</artifactId>-->
            <!--<version>3.0.0</version>-->
        <!--</dependency>-->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.12.4</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
            <!-- Java compiler plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
            <!-- Scala compiler plugin -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <id>eclipse-add-source</id>
                        <goals>
                            <goal>add-source</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-compile-first</id>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile-first</id>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <!-- Java packaging (shade) plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.2.1</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <!-- exclude classes that are never used from the shaded jar -->
                            <minimizeJar>true</minimizeJar>
                            <shadedArtifactAttached>true</shadedArtifactAttached>
                            <!-- classifier suffix for the jar that bundles dependencies; the default is shaded -->
                            <shadedClassifierName>dep</shadedClassifierName>
                            <artifactSet>
                                <includes>
                                    <!-- Include here the dependencies you
                                        want to be packed in your fat jar -->
                                    <include>*:*</include>
                                </includes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer
                                        implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
                                    <resource>reference.conf</resource>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

IV. The final cleaned results

(Screenshot of the cleaned result table omitted.)
My result table has seven columns: SourceTableNme (source table name), SourceDatabaseNme (source database name), SourceColNme (source column name), TargetTableNme (target table name), TargetDatabaseNme (target database name), TargetColNme (target column name) and updateDate (update time). A DDL sketch for it follows.
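Since the original table definition is not shown, here is a minimal PostgreSQL DDL sketch that matches the columns used by DruidInsert and DruidUpdate; the column types and lengths are assumptions:

-- sketch only; adjust types and lengths to your environment
CREATE TABLE TableLineageInfo (
    SourceTableNme     varchar(256),
    SourceDatabaseNme  varchar(256),
    SourceColNme       varchar(256),
    TargetTableNme     varchar(256),
    TargetDatabaseNme  varchar(256),
    TargetColNme       varchar(256),
    updateDate         timestamp
);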

Writing this was not easy. If it helped you, please like and follow; more quality content is on the way, and your likes and follows are the biggest motivation for my writing!
